Merge pull request #6 from oscar-corpus/dev-extract-text

Text extraction
oscar-project · Feb 10, 2022 · dcc3aad · dcc3aad
2 parents 4c51940 + 3f73ac2
commit dcc3aad
Show file tree

Hide file tree

Showing 9 changed files with 324 additions and 5 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,7 +10,7 @@ edition = "2018"
 structopt = "0.3.21"
 log = "0.4.14"
 env_logger = "0.9.0"
-serde_json = "1.0.75"
+serde_json = "1.0.78"
 rayon = "1.5.1"
 flate2 = "1.0.22"
 

diff --git a/src/cli.rs b/src/cli.rs
@@ -2,6 +2,7 @@
 use crate::compress::CompressCorpus;
 use crate::error::Error;
 use crate::extract_clean::ExtractCleanCorpus;
+use crate::extract_text::ExtractText;
 use crate::lang_codes::UpdateLangCodes;
 use crate::split_latest::SplitLatest;
 use structopt::StructOpt;
@@ -26,8 +27,12 @@ pub enum OscarTools {
     ExtractCleanCorpus(ExtractCleanCorpus),
     #[structopt(about = "Split a corpus into a set of smaller files")]
     SplitLatest(SplitLatest),
-    #[structopt(about = "compress")]
+    #[structopt(about = "Compress corpus. Useable on files and folders (compresses on a depth=1)")]
     Compress(CompressCorpus),
+    #[structopt(
+        about = "Extracts textual information, discarding metadata. Produces a corpus following OSCAR Scheme v1"
+    )]
+    Extract(ExtractText),
 }
 
 impl Runnable for OscarTools {
@@ -37,6 +42,7 @@ impl Runnable for OscarTools {
             OscarTools::ExtractCleanCorpus(u) => u.run(),
             OscarTools::SplitLatest(u) => u.run(),
             OscarTools::Compress(u) => u.run(),
+            OscarTools::Extract(u) => u.run(),
         }
     }
 }
diff --git a/src/error.rs b/src/error.rs
@@ -4,6 +4,8 @@ pub enum Error {
     Io(std::io::Error),
     Json(serde_json::Error),
     ThreadPoolBuild(rayon::ThreadPoolBuildError),
+    MissingContent(serde_json::Value),
+    MalformedContent(serde_json::Value),
     Custom(String),
 }
 

diff --git a/src/extract_text.rs b/src/extract_text.rs
@@ -0,0 +1,107 @@
+//! Text extraction from OSCAR Schema v2 corpora
+//!
+//! Untested but should work on OSCAR Schema v1 corpora
+use std::path::PathBuf;
+
+use crate::impls::OscarDoc;
+use crate::ops::ExtractText as ET;
+use crate::{cli::Runnable, error::Error};
+use log::error;
+use structopt::StructOpt;
+
+// Command-line arguments of the text-extraction subcommand.
+//
+// NOTE(review): plain `//` comments are used here on purpose. structopt can
+// turn `///` doc comments into help/about text, so doc comments could change
+// the CLI output — confirm before converting these.
+#[derive(StructOpt, Debug)]
+pub struct ExtractText {
+    // Path of the corpus to read. `run` only proceeds when this is a regular file.
+    #[structopt(help = "source corpus file. Does not work with folders")]
+    src: PathBuf,
+    // Destination path, forwarded unchanged to `OscarDoc::extract_text`.
+    // NOTE(review): the help text calls this a folder, but the value is handed
+    // over as a single path — verify which one is intended.
+    #[structopt(help = "dest corpus folder.")]
+    dst: PathBuf,
+
+    // Forwarded as the `del_src` argument of `OscarDoc::extract_text`.
+    // Exposed on the command line through the short flag `-m`.
+    #[structopt(help = "delete source files", short = "m")]
+    del_src: bool,
+}
+
+impl Runnable for ExtractText {
+    /// Runs text extraction on the configured source file.
+    ///
+    /// Anything that is not a regular file is rejected: the problem is logged
+    /// and reported back as an [`Error::Custom`]. Otherwise the result of
+    /// `OscarDoc::extract_text` is returned as-is.
+    fn run(&self) -> Result<(), Error> {
+        // Guard clause: bail out early when the source is not a regular file.
+        if !self.src.is_file() {
+            let message = "Extraction is not supported on folders. Call on each file.";
+            error!("{}", message);
+            return Err(Error::Custom(message.to_string()));
+        }
+
+        OscarDoc::extract_text(&self.src, &self.dst, self.del_src)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::Write;
+
+    use crate::{impls::OscarDoc, ops::ExtractText};
+
+    /// Builds a fake corpus of 100 single-line JSON documents.
+    ///
+    /// Returns `(corpus, expected_text)`: the JSON lines to feed to the
+    /// extractor, and the text the tests expect back (each document's content
+    /// with its escaped newlines expanded, followed by a blank line).
+    pub fn setup_oscardoc() -> (String, String) {
+        let mut corpus = String::new();
+        let mut content_only = String::new();
+        for i in 0..100 {
+            let content = format!(r#"document n{0}\nthis is document n{0}"#, i);
+            corpus.push_str(&format!(
+                r#"{{"content":"{content}", "metadata": ["foo"]}}"#,
+            ));
+            corpus.push('\n');
+
+            content_only.push_str(&content.replace(r#"\n"#, "\n"));
+            content_only.push_str("\n\n");
+        }
+
+        (corpus, content_only)
+    }
+
+    /// Writes the fake corpus to a temporary file, extracts it into a fresh
+    /// temporary directory and reports what happened.
+    ///
+    /// Returns `(source_still_exists, extracted_text, expected_text)`.
+    fn run_extraction(del_src: bool) -> (bool, String, String) {
+        // get both documents and expected output
+        let (docs, content_only) = setup_oscardoc();
+
+        // write fake corpus
+        let mut src = tempfile::NamedTempFile::new().unwrap();
+        src.write_all(docs.as_bytes()).unwrap();
+
+        // Keep the `TempDir` guard alive (instead of `into_path()`, which
+        // disables cleanup) so the directory is removed when the helper returns.
+        let dst = tempfile::tempdir().unwrap();
+        let dst_path = dst.path().join("text_only.txt");
+
+        let src_path = src.into_temp_path();
+        OscarDoc::extract_text(&src_path, &dst_path, del_src).unwrap();
+
+        // read extracted text while both temporary guards are still alive
+        let text = std::fs::read_to_string(&dst_path).unwrap();
+
+        (src_path.exists(), text, content_only)
+    }
+
+    #[test]
+    fn test_extract() {
+        let (src_exists, text, expected) = run_extraction(false);
+
+        assert!(src_exists);
+        assert_eq!(text, expected);
+    }
+
+    #[test]
+    fn test_extract_rm_src() {
+        let (src_exists, text, expected) = run_extraction(true);
+
+        assert!(!src_exists);
+        assert_eq!(text, expected);
+    }
+}
diff --git a/src/impls/oscar_doc.rs b/src/impls/oscar_doc.rs
@@ -1,8 +1,16 @@
 //! OSCAR Schema v2 (See [oscar-corpus.com](https://oscar-corpus.com)) operation implementations.
 //!
 //! Implementations mostly use default trait implementations, as the format is simple.
+use std::{
+    fs::File,
+    io::{BufRead, BufReader, Read, Write},
+};
+
+use serde_json::Value;
+
 use crate::{
-    ops::{Compress, Split},
+    error::Error,
+    ops::{Compress, ExtractText, Split},
     versions::{Schema, Version},
 };
 
@@ -20,3 +28,187 @@ impl Schema for OscarDoc {
 /// Use default implementation of splitting (see [crate::ops::Split])
 impl Split for OscarDoc {}
 impl Compress for OscarDoc {}
+
+/// impl block for helper functions related to [ExtractText].
+impl OscarDoc {
+    /// Extracts the textual content of a single JSON document.
+    ///
+    /// `doc` must be one JSON value. The returned string is the value of its
+    /// `content` field with a trailing newline appended.
+    ///
+    /// # Errors
+    ///
+    /// - a JSON error if `doc` cannot be parsed,
+    /// - [`Error::MissingContent`] if there is no `content` field,
+    /// - [`Error::MalformedContent`] if `content` is not a string.
+    fn extract_from_doc(doc: &str) -> Result<String, Error> {
+        let v: Value = serde_json::from_str(doc)?;
+
+        match v.get("content") {
+            Some(Value::String(c)) => {
+                // NOTE(review): the JSON parser has already decoded escaped
+                // newlines, so this only rewrites a literal backslash followed
+                // by `n` left in the text. Kept for compatibility — confirm
+                // whether it is still wanted.
+                let mut content_str = c.replace(r#"\n"#, "\n");
+                content_str.push('\n');
+                Ok(content_str)
+            }
+            Some(_) => Err(Error::MalformedContent(v)),
+            None => Err(Error::MissingContent(v)),
+        }
+    }
+
+    /// Reads one JSON document per line from `src` and writes each document's
+    /// content to `dst`, followed by an empty line.
+    ///
+    /// `dst` is flushed once every line has been processed.
+    ///
+    /// # Errors
+    ///
+    /// Stops at the first failure: an I/O error while reading or writing, or
+    /// any error returned by `extract_from_doc`.
+    fn extract<T: Read, U: Write>(src: T, dst: &mut U) -> Result<(), Error> {
+        for line in BufReader::new(src).lines() {
+            // extract and add the blank separator line
+            let mut content = Self::extract_from_doc(&line?)?;
+            content.push('\n');
+
+            // `write` may legally perform a short write; `write_all` loops
+            // until the whole buffer is written or reports the failure.
+            dst.write_all(content.as_bytes())?;
+        }
+
+        // flush output
+        dst.flush()?;
+
+        Ok(())
+    }
+}
+
+impl ExtractText for OscarDoc {
+    /// Extracts the text of the corpus file `src` into `dst`.
+    ///
+    /// The output path is `dst` with its extension set to `txt`. When
+    /// `del_src` is true, `src` is removed once extraction has succeeded.
+    /// If `src` is not a regular file, a warning is logged and `Ok(())` is
+    /// returned without doing anything.
+    ///
+    /// # Errors
+    ///
+    /// - an `AlreadyExists` I/O error if either `dst` as given or the final
+    ///   `.txt` path already exists (nothing is overwritten),
+    /// - any I/O error raised while opening, creating or removing files,
+    /// - any error raised while extracting the documents.
+    fn extract_text(
+        src: &std::path::Path,
+        dst: &std::path::Path,
+        del_src: bool,
+    ) -> Result<(), Error> {
+        if !src.is_file() {
+            warn!("{:?} is not a file: ignoring", src);
+            return Ok(());
+        }
+        let src_file = File::open(src)?;
+
+        // Compute the real output path first, so that the existence check
+        // covers the file that is actually going to be created.
+        let mut dst_txt = dst.to_path_buf();
+        dst_txt.set_extension("txt");
+
+        // Refuse to proceed when the path as given exists (as before), and
+        // also when the final `.txt` path exists, which `File::create` would
+        // otherwise silently truncate.
+        let existing = if dst.exists() {
+            Some(dst)
+        } else if dst_txt.exists() {
+            Some(dst_txt.as_path())
+        } else {
+            None
+        };
+        if let Some(path) = existing {
+            error!("File {:?} already exists!", path);
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::AlreadyExists,
+                format!("{:?}", path),
+            )
+            .into());
+        }
+
+        // Buffer the output: one small write is issued per document.
+        let mut dest_file = std::io::BufWriter::new(File::create(&dst_txt)?);
+
+        info!("extracting text from {:?} to {:?}", src, dst_txt);
+
+        OscarDoc::extract(src_file, &mut dest_file)?;
+
+        if del_src {
+            std::fs::remove_file(src)?;
+        }
+
+        Ok(())
+    }
+}
+
+// Unit tests for the text-extraction helpers of `OscarDoc`.
+#[cfg(test)]
+mod tests {
+
+    use crate::impls::OscarDoc;
+
+    // Fixture: four JSON documents, one per line. Only the first one contains
+    // escaped newlines inside its `content` field.
+    fn get_doc() -> &'static str {
+        r#"{"content":"foo\nbar\nbaz\nquux"}
+{"content":"123456789"}
+{"content":"246810"}
+{"content":"test"}"#
+    }
+
+    // Feeding only the first fixture line must yield its content followed by
+    // an empty line.
+    #[test]
+    fn test_extract_single() {
+        let docs = get_doc();
+        let doc = docs.lines().next().unwrap().as_bytes();
+
+        let mut buf = Vec::new();
+        OscarDoc::extract(doc, &mut buf).unwrap();
+
+        assert_eq!(String::from_utf8(buf).unwrap(), "foo\nbar\nbaz\nquux\n\n");
+    }
+    // Feeding the whole fixture must yield every content in order, each one
+    // followed by an empty line.
+    #[test]
+    fn test_extract_multiple() {
+        let doc = get_doc().as_bytes();
+        let mut buf = Vec::new();
+        OscarDoc::extract(doc, &mut buf).unwrap();
+
+        assert_eq!(
+            String::from_utf8(buf).unwrap(),
+            "foo\nbar\nbaz\nquux\n\n123456789\n\n246810\n\ntest\n\n"
+        );
+    }
+    // A document without a `content` field must be rejected.
+    #[test]
+    fn extract_no_content() {
+        let document = r#"{"no_content": "hehe"}"#;
+        let extracted = OscarDoc::extract_from_doc(document);
+
+        assert!(extracted.is_err())
+    }
+
+    // A document whose `content` field is not a string must be rejected.
+    #[test]
+    fn extract_bad_content() {
+        let document = r#"{"content": ["hehe"]}"#;
+        let extracted = OscarDoc::extract_from_doc(document);
+
+        assert!(extracted.is_err())
+    }
+
+    // A pretty-printed document carrying headers and metadata must yield only
+    // its content, terminated by a single newline.
+    #[test]
+    fn text_extract_from_doc() {
+        let content = "foo
+bar
+baz
+quux
+";
+
+        let document = r#"
+        {
+            "content":"foo\nbar\nbaz\nquux",
+            "warc_headers":{
+              "warc-block-digest":"sha1:X3OWP47FG2O5LBNMFSNB44FJF2SSRC26",
+              "content-type":"text/plain",
+              "warc-refers-to":"<urn:uuid:83f2e1d4-5ed3-41db-86ff-f7826c4c20f9>",
+              "content-length":"16",
+              "warc-identified-content-language":"eng",
+              "warc-target-uri":"http://3dv2015.inria.fr/registration-2/index.html",
+              "warc-date":"2021-09-16T11:07:14Z",
+              "warc-record-id":"<urn:uuid:3304bc27-17d0-4ffd-a692-340381478a5f>",
+              "warc-type":"conversion"
+            },
+            "metadata":{
+              "identification":{
+                "label":"en",
+                "prob":0.6268374
+              },
+              "annotation":[
+                "short_sentences",
+                "footer"
+              ],
+              "sentence_identifications":[
+                {
+                  "label":"en",
+                  "prob":0.93925816
+                },
+                null,
+                {
+                  "label":"en",
+                  "prob":0.9937219
+                },
+                {
+                  "label":"en",
+                  "prob":0.9996538
+                }
+              ]
+            }
+          }
+        "#;
+
+        let extracted = OscarDoc::extract_from_doc(document).unwrap();
+        assert_eq!(extracted, content);
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -6,6 +6,7 @@ mod cli;
 mod compress;
 mod error;
 mod extract_clean;
+mod extract_text;
 mod impls;
 mod lang_codes;
 mod ops;

diff --git a/src/ops/extract_text.rs b/src/ops/extract_text.rs
@@ -0,0 +1,9 @@
+/*! Extracts textual content into new files, discarding metadata.
+!*/
+
+use std::path::Path;
+
+use crate::error::Error;
+/// Extraction of textual content, discarding metadata.
+pub trait ExtractText {
+    /// Extracts the text found at `src` into `dst`.
+    ///
+    /// `del_src` is a flag passed through to the implementor.
+    /// NOTE(review): presumably it asks for `src` to be deleted once
+    /// extraction is done — confirm against the implementors.
+    ///
+    /// Failures are reported through the crate's [`Error`] type.
+    fn extract_text(src: &Path, dst: &Path, del_src: bool) -> Result<(), Error>;
+}