From cf8e76d3a4024132b3d45bdd7c33daa440a677b1 Mon Sep 17 00:00:00 2001 From: Daniil Trotsenko Date: Fri, 10 Jan 2025 03:05:51 +0200 Subject: [PATCH 1/2] feat: add loaders for epub files --- rig-core/Cargo.toml | 2 + rig-core/src/loaders/epub.rs | 412 +++++++++++++++++++++++++++++++++ rig-core/src/loaders/mod.rs | 12 + rig-core/tests/data/dummy.epub | Bin 0 -> 3216 bytes 4 files changed, 426 insertions(+) create mode 100644 rig-core/src/loaders/epub.rs create mode 100644 rig-core/tests/data/dummy.epub diff --git a/rig-core/Cargo.toml b/rig-core/Cargo.toml index afb3c1d9..147502f8 100644 --- a/rig-core/Cargo.toml +++ b/rig-core/Cargo.toml @@ -27,6 +27,7 @@ rig-derive = { version = "0.1.0", path = "./rig-core-derive", optional = true } glob = "0.3.1" lopdf = { version = "0.34.0", optional = true } rayon = { version = "1.10.0", optional = true} +epub = { version = "2.1.2", optional = true } [dev-dependencies] anyhow = "1.0.75" @@ -39,6 +40,7 @@ tokio-test = "0.4.4" all = ["derive", "pdf", "rayon"] derive = ["dep:rig-derive"] pdf = ["dep:lopdf"] +epub = ["dep:epub"] rayon = ["dep:rayon"] [[test]] diff --git a/rig-core/src/loaders/epub.rs b/rig-core/src/loaders/epub.rs new file mode 100644 index 00000000..f2687f06 --- /dev/null +++ b/rig-core/src/loaders/epub.rs @@ -0,0 +1,412 @@ +use super::file::FileLoaderError; +use epub::doc::{DocError, EpubDoc}; +use thiserror::Error; + +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +#[derive(Error, Debug)] +pub enum EpubLoaderError { + #[error("IO error: {0}")] + EpubError(#[from] DocError), + + #[error("File loader error: {0}")] + FileLoaderError(#[from] FileLoaderError), +} + +// ================================================================ +// Implementing Loadable trait for loading epubs +// ================================================================ + +pub(crate) trait Loadable { + fn load(self) -> Result>, EpubLoaderError>; + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError>; +} + +impl Loadable for PathBuf { + fn load(self) -> Result>, EpubLoaderError> { + EpubDoc::new(self).map_err(EpubLoaderError::EpubError) + } + + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError> { + let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError); + Ok((self, contents?)) + } +} + +impl Loadable for Result { + fn load(self) -> Result>, EpubLoaderError> { + self.map(|t| t.load())? + } + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError> { + self.map(|t| t.load_with_path())? + } +} + +// ================================================================ +// EpubFileLoader definitions and implementations +// ================================================================ + +/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or +/// directory paths. It provides methods to read file contents and handle errors gracefully. +/// +/// # Errors +/// +/// This module defines a custom error type [EpubLoaderError] which can represent various errors +/// that might occur during file loading operations, such as any [FileLoaderError] alongside +/// specific EPUB-related errors. +/// +/// # Example Usage +/// +/// ```rust +/// use rig::loaders::EpubFileLoader; +/// +/// fn main() -> Result<(), Box> { +/// // Create a FileLoader using a glob pattern +/// let loader = EpubFileLoader::with_glob("tests/data/*.epub")?; +/// +/// // Load epub file contents by chapter, ignoring any errors +/// let contents = loader +/// .load_with_path() +/// .ignore_errors() +/// .by_chapter(); +/// +/// for (path, chapters) in contents { +/// println!("{}", path.display()); +/// for (idx, chapter) in chapters { +/// println!("Chapter {} begins", idx); +/// println!("{}", chapter); +/// println!("Chapter {} ends", idx); +/// } +/// } +/// +/// Ok(()) +/// } +/// ``` +/// +/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions +/// between different implementations of the loaders and it's methods are handled properly by +/// the compiler. +pub struct EpubFileLoader<'a, T> { + iterator: Box + 'a>, +} + +type EpubLoaded = Result<(PathBuf, EpubDoc>), EpubLoaderError>; + +impl<'a> EpubFileLoader<'a, Result> { + /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob] + /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances that can be + /// further processed (by chapter, etc). + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and return the loaded documents + /// + /// ```rust + /// use rig::loaders::EpubFileLoader; + /// + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.load().into_iter(); + /// for result in content { + /// match result { + /// Ok(doc) => println!("{:?}", doc), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn load(self) -> EpubFileLoader<'a, Result>, EpubLoaderError>> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| res.load())), + } + } + + /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob] + /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path + /// that can be further processed. + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and return the loaded documents + /// + /// ```rust + /// use rig::loaders::EpubFileLoader; + /// + /// let content = EpubFileLoader::with_glob("tests/data/*.epub").unwrap().load_with_path().into_iter(); + /// for result in content { + /// match result { + /// Ok((path, doc)) => println!("{:?} {:?}", path, doc), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| res.load_with_path())), + } + } +} + +impl<'a> EpubFileLoader<'a, Result> { + /// Directly reads the contents of the epub files within the iterator returned by + /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir]. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read().into_iter(); + /// for result in content { + /// match result { + /// Ok(content) => println!("{}", content), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn read(self) -> EpubFileLoader<'a, Result> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| { + let doc = res.load().map(EpubChapterIterator::from)?; + + Ok(doc.into_iter().collect::()) + })), + } + } + + /// Directly reads the contents of the epub files within the iterator returned by + /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with + /// the content. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read_with_path().into_iter(); + /// for result in content { + /// match result { + /// Ok((path, content)) => println!("{:?} {}", path, content), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn read_with_path(self) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| { + let (path, doc) = res.load_with_path()?; + Ok((path, EpubChapterIterator::from(doc).collect::())) + })), + } + } +} + +impl<'a> EpubFileLoader<'a, EpubDoc>> { + /// Chunks the chapters of a loaded document by chapter, flattened as a single vector. + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and chunk all document into it's chapters. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.load().by_chapter().into_iter(); + /// for result in content { + /// println!("{}", result); + /// } + /// ``` + pub fn by_chapter(self) -> EpubFileLoader<'a, String> { + EpubFileLoader { + iterator: Box::new(self.iterator.flat_map(|doc| EpubChapterIterator::from(doc))), + } + } +} + +type ByChapter = (PathBuf, Vec<(usize, String)>); +impl<'a> EpubFileLoader<'a, (PathBuf, EpubDoc>)> { + /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path + /// which each document container an inner vector of chapters by chapter number. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and chunk all documents by path by it's chapters. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")? + /// .load_with_path() + /// .ignore_errors() + /// .by_chapter() + /// .into_iter(); + /// + /// for result in content { + /// println!("{:?}", result); + /// } + /// ``` + pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|doc| { + let (path, doc) = doc; + + ( + path, + EpubChapterIterator::from(doc) + .enumerate() + .collect::>(), + ) + })), + } + } +} + +impl<'a, T: 'a> EpubFileLoader<'a, Result> { + /// Ignores errors in the iterator, returning only successful results. This can be used on any + /// [EpubFileLoader] state of iterator whose items are results. + /// + /// # Example + /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter(); + /// for result in content { + /// println!("{}", content) + /// } + /// ``` + pub fn ignore_errors(self) -> EpubFileLoader<'a, T> { + EpubFileLoader { + iterator: Box::new(self.iterator.filter_map(|res| res.ok())), + } + } +} + +impl EpubFileLoader<'_, Result> { + /// Creates a new [EpubFileLoader] using a glob pattern to match files. + /// + /// # Example + /// Create a [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub". + /// + /// ```rust + /// let loader = EpubFileLoader::with_glob("tests/data/*.epub")?; + /// ``` + pub fn with_glob( + pattern: &str, + ) -> Result>, EpubLoaderError> { + let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?; + + Ok(EpubFileLoader { + iterator: Box::new(paths.into_iter().map(|path| { + path.map_err(FileLoaderError::GlobError) + .map_err(EpubLoaderError::FileLoaderError) + })), + }) + } + + /// Creates a new [EpubFileLoader] on all files within a directory. + /// + /// # Example + /// Create a [EpubFileLoader] for all files that are in the directory "files". + /// + /// ```rust + /// let loader = EpubFileLoader::with_dir("files")?; + /// ``` + pub fn with_dir( + directory: &str, + ) -> Result>, EpubLoaderError> { + let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?; + + Ok(EpubFileLoader { + iterator: Box::new( + paths + .into_iter() + .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())), + ), + }) + } +} + +// ================================================================ +// EpubFileLoader iterator implementations +// ================================================================ +pub struct IntoIter<'a, T> { + iterator: Box + 'a>, +} + +impl<'a, T> IntoIterator for EpubFileLoader<'a, T> { + type Item = T; + type IntoIter = IntoIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + IntoIter { + iterator: self.iterator, + } + } +} + +impl Iterator for IntoIter<'_, T> { + type Item = T; + + fn next(&mut self) -> Option { + self.iterator.next() + } +} + +// ================================================================ +// EpubChapterIterator definitions and implementations +// ================================================================ + +struct EpubChapterIterator { + epub: EpubDoc>, + finished: bool, +} + +impl From>> for EpubChapterIterator { + fn from(epub: EpubDoc>) -> Self { + Self::new(epub) + } +} + +impl EpubChapterIterator { + fn new(epub: EpubDoc>) -> Self { + Self { + epub, + finished: false, + } + } +} + +impl Iterator for EpubChapterIterator { + type Item = String; + + fn next(&mut self) -> Option { + if self.finished { + return None; + } + + // ignore empty chapters if they exist + while !self.finished { + let chapter = self.epub.get_current_str(); + + if !self.epub.go_next() { + self.finished = true; + } + + if let Some((text, _)) = chapter { + return Some(text); + } + } + + None + } +} + +#[cfg(test)] +mod tests { + use super::EpubFileLoader; + + #[test] + fn test_epub_loader() { + let loader = EpubFileLoader::with_glob("tests/data/*.epub").unwrap(); + let actual = loader + .load_with_path() + .ignore_errors() + .by_chapter() + .into_iter() + .collect::>(); + + assert_eq!(actual.len(), 1); + + let (_, chapters) = &actual[0]; + assert_eq!(chapters.len(), 3); + } +} diff --git a/rig-core/src/loaders/mod.rs b/rig-core/src/loaders/mod.rs index 6611819e..6ae033f0 100644 --- a/rig-core/src/loaders/mod.rs +++ b/rig-core/src/loaders/mod.rs @@ -9,6 +9,12 @@ //! and keeping track of the page numbers along with their contents. //! //! Note: The [PdfFileLoader] requires the `pdf` feature to be enabled in the `Cargo.toml` file. +//! +//! The [EpubFileLoader] works similarly to the [FileLoader], but is specifically designed to load EPUB +//! files. This loader also provides EPUB-specific preprocessing methods for splitting the EPUB into chapters +//! and keeping track of the chapter numbers along with their contents. +//! +//! Note: The [EpubFileLoader] requires the `epub` feature to be enabled in the `Cargo.toml` file. pub mod file; @@ -19,3 +25,9 @@ pub mod pdf; #[cfg(feature = "pdf")] pub use pdf::PdfFileLoader; + +#[cfg(feature = "epub")] +pub mod epub; + +#[cfg(feature = "epub")] +pub use epub::EpubFileLoader; diff --git a/rig-core/tests/data/dummy.epub b/rig-core/tests/data/dummy.epub new file mode 100644 index 0000000000000000000000000000000000000000..4f2fa72e31187252d26e9b26395a173de62a5fc6 GIT binary patch literal 3216 zcmb7Hc|4SB8y=w;laVc@7(})id!;gDX&7rL31yhUWSKF>dZIW&n8{N1eMwR#X^2R6 zDtlQAaXM0s$vQbn-^{6#V@}`iJKyvE?sxg)x}N*KpXa`x>$NiFk~BEB$9lxgNh zw#T3*7ROawaXv&8)(1m?1$lcpksEM_tJpyjln;{Uc9*9b)P#FG*`_G$LJWWP*DycP+?(to zB?tq+5dIEz*gdO}-Zamo+P{607m`S)YYKX{XC|10F^#WjX1cq0R1@B;b z4rZDN(=3hjt!z|WJy3WehHy>|7UV(n_VVjCR+cw`S{rxwD4BFAar|E#>i>#^IW=Gl z>&6_co%YQSKVq;K#?J$TA;Mh!{5aX>WgxsL;!IEmHvqtj;k)n5&4uwH!f<%E;%nx~ zV-QdjeN_ZB#NW0eXa26DK014rno289<>$~wop+6$ms@3A_kC1}J^adyU#tkxA7{8; z=Y4)*Yx%$(1rNd2YnKx5^t+wxrBlw0M8H#%UjkQ6Z%?9EHBT2%W`VUU<8E2F1DVE3 z4n}tz<~7MPbyN}A`o4x=oX>>M*Y-~WfmP?GwwiiqOYOrw=98@x3rlWXO9$Pm1F zs#efJG1ESE(!j1bp_CSiZj<>FpXELCfbY7OyEr(*V@P zJMm5KoJhXk14V&~x9?DeC2-le^aa!VpCTh{!VCm)+XP{Sw9&#Bp@aB6dePEaM48_r z*1+oq_!?1B?myD^Op&iR;x2M!Ke}q6dr3mL;Zn|lxv8UGI;lW_dMZNxE$k@aK6f-l_=c`xKN8uLCbD!=*%LCMxX`~__vp@QXeS+IH7qlOd6u^C z>fzajq#6a{xwG92u($Rq=jJMSVi*=?&H(^SasmL`f9xb8&K2h48kC*f>GP|qKomU~ z__V%C%-EqgQ7YP4$G=lb$F41B>%O}?%mhbEC?gq~vT-4c>XM7|8XxVi7o$0i(#7^# z_4q1qKuO3Y#XQ{8_DtUF%QEE{YT1GlW3H{RrJxq%5uawcYs{ybENHSio95E`s12O_ z%PBkcr|P@OJg?PwPO5xR=M*-K18upbnxWXCcEfNtDX%ZqswqmTFVA?V3%6h8;DhJ~ zh(t46t&jJ*rRBHsbcd*w$2-cU=`$daz~W8~Ifhzh>^takp4L_W*RdZ9IRXT^h zPd>BWdS?CW{k4fR2%|P#X#>TJ=yDTYL7J^Zn-)5tjrL@$@<=@Go*1sDEyyJec14JE zbU8HBNh6|eoz~QMaC6=nr|x8>GV|Wf%sZxU7KV#=rJ=E5; z)Z1j-ZH5gozhS_y;~!b?Y>DoafpV;e%8ExAbQbH_OVaB@Ki^LY{5<%WZ?*SOnQD(5 z58BjDE1>Wrc6EA1XJ`SQADI%>O!X}vlZ*z+_f64HuKgxsj;ne3DYT~HN1 zFMWK{V5g@V&HkDcMQ_^oQ%8aI?`o^F|@X;$+f3x+#ob8P}<6Z*@3PdTR7P3b>=B498N$MJQDtCTn zfqv!?^1?HObS}iI)3;-NMd)P9%~HM+$~(23WUA79ysRa`y5$o^_t(Cv^VEy{pU&Q* z;4jZ=IdoNk*GleA+*%g;m7J3*#9JiN9U+ zWDTfypamF<)V2ienoJow#uGD8d}2H1fs}zog+I?_Ewd<%wTH`~Y6iGLZ%C=+CFDR4#be2o7_o4$cTgO*r{JSRKHY60k#2-QC-1bmg>TGzqkA}ut-N9vc6!B*f;UmrV-UK{*G7K1^v!! zmhYDs_k;kc9JM!k`;B7062$MTV+vmuN`^T1*o20=i#;5X=ZAl^K{dF&mz$hU`7oR$ z{3Y^o#oP6EcPmqlEilgiF7=od`R6DKXFdKBejBKt%e-$G0H7%RF!S~2qK}<}T}W+k z>|t)Lzj6F;S;bDxu5dP}$C=bW)jR*5LYgT*HdB0;JM0v!S^utYHoS2AL9towursi$ zoehSJ9}FyE{ayR8!`Y9x4ft Date: Fri, 10 Jan 2025 11:09:45 +0200 Subject: [PATCH 2/2] style: Remove redundant closure --- rig-core/src/loaders/epub.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rig-core/src/loaders/epub.rs b/rig-core/src/loaders/epub.rs index f2687f06..a8e8b5fd 100644 --- a/rig-core/src/loaders/epub.rs +++ b/rig-core/src/loaders/epub.rs @@ -210,7 +210,7 @@ impl<'a> EpubFileLoader<'a, EpubDoc>> { /// ``` pub fn by_chapter(self) -> EpubFileLoader<'a, String> { EpubFileLoader { - iterator: Box::new(self.iterator.flat_map(|doc| EpubChapterIterator::from(doc))), + iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::from)), } } }