diff --git a/Cargo.lock b/Cargo.lock index b30a5fd..f1fdf8d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -393,9 +393,9 @@ checksum = "96b3c34c1690edf8174f5b289a336ab03f568a4460d8c6df75f2f3a692b3bc6a" [[package]] name = "serde_json" -version = "1.0.75" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c059c05b48c5c0067d4b4b2b4f0732dd65feb52daf7e0ea09cd87e7dadc1af79" +checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" dependencies = [ "itoa", "ryu", diff --git a/Cargo.toml b/Cargo.toml index d895791..87683b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ edition = "2018" structopt = "0.3.21" log = "0.4.14" env_logger = "0.9.0" -serde_json = "1.0.75" +serde_json = "1.0.78" rayon = "1.5.1" flate2 = "1.0.22" diff --git a/src/cli.rs b/src/cli.rs index 9e43492..6b9f3be 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -2,6 +2,7 @@ use crate::compress::CompressCorpus; use crate::error::Error; use crate::extract_clean::ExtractCleanCorpus; +use crate::extract_text::ExtractText; use crate::lang_codes::UpdateLangCodes; use crate::split_latest::SplitLatest; use structopt::StructOpt; @@ -26,8 +27,12 @@ pub enum OscarTools { ExtractCleanCorpus(ExtractCleanCorpus), #[structopt(about = "Split a corpus into a set of smaller files")] SplitLatest(SplitLatest), - #[structopt(about = "compress")] + #[structopt(about = "Compress corpus. Useable on files and folders (compresses on a depth=1)")] Compress(CompressCorpus), + #[structopt( + about = "Extracts textual information, discarding metadata. Produces a corpus following OSCAR Scheme v1" + )] + Extract(ExtractText), } impl Runnable for OscarTools { @@ -37,6 +42,7 @@ impl Runnable for OscarTools { OscarTools::ExtractCleanCorpus(u) => u.run(), OscarTools::SplitLatest(u) => u.run(), OscarTools::Compress(u) => u.run(), + OscarTools::Extract(u) => u.run(), } } } diff --git a/src/error.rs b/src/error.rs index 2cb8e62..18fba9e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,6 +4,8 @@ pub enum Error { Io(std::io::Error), Json(serde_json::Error), ThreadPoolBuild(rayon::ThreadPoolBuildError), + MissingContent(serde_json::Value), + MalformedContent(serde_json::Value), Custom(String), } diff --git a/src/extract_text.rs b/src/extract_text.rs new file mode 100644 index 0000000..59ec87d --- /dev/null +++ b/src/extract_text.rs @@ -0,0 +1,107 @@ +//! Splitting of OSCAR Schema v2 corpora +//! +//! Untested but should work on OSCAR Schema v1 corpora +use std::path::PathBuf; + +use crate::impls::OscarDoc; +use crate::ops::ExtractText as ET; +use crate::{cli::Runnable, error::Error}; +use log::error; +use structopt::StructOpt; + +#[derive(StructOpt, Debug)] +pub struct ExtractText { + #[structopt(help = "source corpus file. Does not work with folders")] + src: PathBuf, + #[structopt(help = "dest corpus folder.")] + dst: PathBuf, + + #[structopt(help = "delete source files", short = "m")] + del_src: bool, +} + +impl Runnable for ExtractText { + fn run(&self) -> Result<(), Error> { + if self.src.is_file() { + OscarDoc::extract_text(&self.src, &self.dst, self.del_src)?; + Ok(()) + } else { + error!("Extraction is not supported on folders. Call on each file."); + Err(Error::Custom( + "Extraction is not supported on folders. Call on each file.".to_string(), + )) + } + } +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use tempfile; + + use crate::{impls::OscarDoc, ops::ExtractText}; + + pub fn setup_oscardoc() -> (String, String) { + let mut corpus = String::new(); + let mut content_only = String::new(); + for i in 0..100 { + let content = format!(r#"document n{0}\nthis is document n{0}"#, i); + corpus.push_str(&format!( + r#"{{"content":"{content}", "metadata": ["foo"]}}"#, + )); + corpus.push('\n'); + + content_only.push_str(&content.replace(r#"\n"#, "\n")); + content_only.push_str("\n\n"); + } + + (corpus, content_only) + } + + #[test] + fn test_extract() { + //get both documents and expected output + let (docs, content_only) = setup_oscardoc(); + let mut src = tempfile::NamedTempFile::new().unwrap(); + + //write fake corpus + src.write_all(docs.as_bytes()).unwrap(); + + // create destination path and file path + let dst = tempfile::tempdir().unwrap(); + let dst_path = dst.into_path().join("text_only.txt"); + + let src_path = src.into_temp_path(); + OscarDoc::extract_text(&src_path, &dst_path, false).unwrap(); + + // read extracted + let text = std::fs::read_to_string(dst_path).unwrap(); + + assert!(src_path.exists()); + assert_eq!(text, content_only); + } + + #[test] + fn test_extract_rm_src() { + //get both documents and expected output + let (docs, content_only) = setup_oscardoc(); + let mut src = tempfile::NamedTempFile::new().unwrap(); + + //write fake corpus + src.write_all(docs.as_bytes()).unwrap(); + + // create destination path and file path + let dst = tempfile::tempdir().unwrap(); + let dst_path = dst.into_path().join("text_only.txt"); + + let src_path = src.into_temp_path(); + OscarDoc::extract_text(&src_path, &dst_path, true).unwrap(); + + // read extracted + let text = std::fs::read_to_string(dst_path).unwrap(); + + assert!(!src_path.exists()); + assert_eq!(text, content_only); + } +} diff --git a/src/impls/oscar_doc.rs b/src/impls/oscar_doc.rs index 18afbcb..916aef7 100644 --- a/src/impls/oscar_doc.rs +++ b/src/impls/oscar_doc.rs @@ -1,8 +1,16 @@ //! OSCAR Schema v2 (See [oscar-corpus.com](https://oscar-corpus.com)) operation implementations. //! //! Implementations mostly use default trait implementations, as the format is simple. +use std::{ + fs::File, + io::{BufRead, BufReader, Read, Write}, +}; + +use serde_json::Value; + use crate::{ - ops::{Compress, Split}, + error::Error, + ops::{Compress, ExtractText, Split}, versions::{Schema, Version}, }; @@ -20,3 +28,187 @@ impl Schema for OscarDoc { /// Use default implementation of splitting (see [crate::ops::Split]) impl Split for OscarDoc {} impl Compress for OscarDoc {} + +/// impl block for helper functions related to [ExtractText]. +impl OscarDoc { + /// Extracts content from a Document. + /// + /// Fails if the `content` field is missing or is not a string. + fn extract_from_doc(doc: &str) -> Result { + let v: Value = serde_json::from_str(doc)?; + + if let Some(content) = v.get("content") { + if let Value::String(c) = content { + let mut content_str = c.to_string().replace(r#"\n"#, "\n"); + content_str.push('\n'); + Ok(content_str) + } else { + Err(Error::MalformedContent(v)) + } + } else { + Err(Error::MissingContent(v)) + } + } + + fn extract(src: T, dst: &mut U) -> Result<(), Error> { + let b = BufReader::new(src); + let docs = b.lines(); + for doc in docs { + //extract and add newline + let doc = doc?; + let content = Self::extract_from_doc(&doc)? + "\n"; + let content_length = content.len(); + + // check written bytes + if dst.write(content.as_bytes())? > content_length { + error!("IO Error: Could not write into destination writer."); + } + } + + // flush output + dst.flush()?; + + Ok(()) + } +} + +impl ExtractText for OscarDoc { + fn extract_text( + src: &std::path::Path, + dst: &std::path::Path, + del_src: bool, + ) -> Result<(), Error> { + if !src.is_file() { + warn!("{:?} is not a file: ignoring", src); + return Ok(()); + } + let src_file = File::open(src)?; + + if dst.exists() { + error!("File {:?} already exists!", dst); + return Err(std::io::Error::new( + std::io::ErrorKind::AlreadyExists, + format!("{:?}", dst), + ) + .into()); + } + + let mut dst = dst.to_path_buf(); + dst.set_extension("txt"); + let mut dest_file = File::create(&dst)?; + + info!("extracting text from {:?} to {:?}", src, dst); + + OscarDoc::extract(src_file, &mut dest_file)?; + + if del_src { + std::fs::remove_file(src)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use crate::impls::OscarDoc; + + fn get_doc() -> &'static str { + r#"{"content":"foo\nbar\nbaz\nquux"} +{"content":"123456789"} +{"content":"246810"} +{"content":"test"}"# + } + + #[test] + fn test_extract_single() { + let docs = get_doc(); + let doc = docs.lines().next().unwrap().as_bytes(); + + let mut buf = Vec::new(); + OscarDoc::extract(doc, &mut buf).unwrap(); + + assert_eq!(String::from_utf8(buf).unwrap(), "foo\nbar\nbaz\nquux\n\n"); + } + #[test] + fn test_extract_multiple() { + let doc = get_doc().as_bytes(); + let mut buf = Vec::new(); + OscarDoc::extract(doc, &mut buf).unwrap(); + + assert_eq!( + String::from_utf8(buf).unwrap(), + "foo\nbar\nbaz\nquux\n\n123456789\n\n246810\n\ntest\n\n" + ); + } + #[test] + fn extract_no_content() { + let document = r#"{"no_content": "hehe"}"#; + let extracted = OscarDoc::extract_from_doc(document); + + assert!(extracted.is_err()) + } + + #[test] + fn extract_bad_content() { + let document = r#"{"content": ["hehe"]}"#; + let extracted = OscarDoc::extract_from_doc(document); + + assert!(extracted.is_err()) + } + + #[test] + fn text_extract_from_doc() { + let content = "foo +bar +baz +quux +"; + + let document = r#" + { + "content":"foo\nbar\nbaz\nquux", + "warc_headers":{ + "warc-block-digest":"sha1:X3OWP47FG2O5LBNMFSNB44FJF2SSRC26", + "content-type":"text/plain", + "warc-refers-to":"", + "content-length":"16", + "warc-identified-content-language":"eng", + "warc-target-uri":"http://3dv2015.inria.fr/registration-2/index.html", + "warc-date":"2021-09-16T11:07:14Z", + "warc-record-id":"", + "warc-type":"conversion" + }, + "metadata":{ + "identification":{ + "label":"en", + "prob":0.6268374 + }, + "annotation":[ + "short_sentences", + "footer" + ], + "sentence_identifications":[ + { + "label":"en", + "prob":0.93925816 + }, + null, + { + "label":"en", + "prob":0.9937219 + }, + { + "label":"en", + "prob":0.9996538 + } + ] + } + } + "#; + + let extracted = OscarDoc::extract_from_doc(document).unwrap(); + assert_eq!(extracted, content); + } +} diff --git a/src/main.rs b/src/main.rs index 56dec29..52169af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,7 @@ mod cli; mod compress; mod error; mod extract_clean; +mod extract_text; mod impls; mod lang_codes; mod ops; diff --git a/src/ops/extract_text.rs b/src/ops/extract_text.rs new file mode 100644 index 0000000..c314abf --- /dev/null +++ b/src/ops/extract_text.rs @@ -0,0 +1,9 @@ +/*! Extracts textual content into new files, discarding metadata. +!*/ + +use std::path::Path; + +use crate::error::Error; +pub trait ExtractText { + fn extract_text(src: &Path, dst: &Path, del_src: bool) -> Result<(), Error>; +} diff --git a/src/ops/mod.rs b/src/ops/mod.rs index 077d71a..5164d34 100644 --- a/src/ops/mod.rs +++ b/src/ops/mod.rs @@ -2,6 +2,8 @@ //! //! A subset of these should be implemented for different corpus versions. mod compress; +mod extract_text; mod split; pub(crate) use compress::Compress; +pub(crate) use extract_text::ExtractText; pub(crate) use split::Split;