Skip to content

Commit

Permalink
Merge pull request #6 from oscar-corpus/dev-extract-text
Browse files Browse the repository at this point in the history
Text extraction
  • Loading branch information
Uinelj authored Feb 10, 2022
2 parents 4c51940 + 3f73ac2 commit dcc3aad
Show file tree
Hide file tree
Showing 9 changed files with 324 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ edition = "2018"
structopt = "0.3.21"
log = "0.4.14"
env_logger = "0.9.0"
serde_json = "1.0.75"
serde_json = "1.0.78"
rayon = "1.5.1"
flate2 = "1.0.22"

Expand Down
8 changes: 7 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
use crate::compress::CompressCorpus;
use crate::error::Error;
use crate::extract_clean::ExtractCleanCorpus;
use crate::extract_text::ExtractText;
use crate::lang_codes::UpdateLangCodes;
use crate::split_latest::SplitLatest;
use structopt::StructOpt;
Expand All @@ -26,8 +27,12 @@ pub enum OscarTools {
ExtractCleanCorpus(ExtractCleanCorpus),
#[structopt(about = "Split a corpus into a set of smaller files")]
SplitLatest(SplitLatest),
#[structopt(about = "compress")]
#[structopt(about = "Compress corpus. Useable on files and folders (compresses on a depth=1)")]
Compress(CompressCorpus),
#[structopt(
about = "Extracts textual information, discarding metadata. Produces a corpus following OSCAR Scheme v1"
)]
Extract(ExtractText),
}

impl Runnable for OscarTools {
Expand All @@ -37,6 +42,7 @@ impl Runnable for OscarTools {
OscarTools::ExtractCleanCorpus(u) => u.run(),
OscarTools::SplitLatest(u) => u.run(),
OscarTools::Compress(u) => u.run(),
OscarTools::Extract(u) => u.run(),
}
}
}
2 changes: 2 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ pub enum Error {
Io(std::io::Error),
Json(serde_json::Error),
ThreadPoolBuild(rayon::ThreadPoolBuildError),
MissingContent(serde_json::Value),
MalformedContent(serde_json::Value),
Custom(String),
}

Expand Down
107 changes: 107 additions & 0 deletions src/extract_text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
//! Text extraction from OSCAR Schema v2 corpora
//!
//! Untested but should work on OSCAR Schema v1 corpora
use std::path::PathBuf;

use crate::impls::OscarDoc;
use crate::ops::ExtractText as ET;
use crate::{cli::Runnable, error::Error};
use log::error;
use structopt::StructOpt;

// CLI arguments of the text extraction subcommand.
//
// NOTE: plain `//` comments are used on purpose here: structopt can turn `///`
// doc comments into help/about text, which could alter the CLI output.
#[derive(StructOpt, Debug)]
pub struct ExtractText {
    // Corpus file to read documents from (one JSON document per line).
    #[structopt(help = "source corpus file. Does not work with folders")]
    src: PathBuf,
    // Destination *file*: the extraction code forces its extension to `txt`
    // before creating it, so this is not a folder.
    #[structopt(help = "dest corpus file. Its extension is replaced by .txt")]
    dst: PathBuf,

    // When set, the source file is removed once extraction has succeeded.
    #[structopt(help = "delete source files", short = "m")]
    del_src: bool,
}

impl Runnable for ExtractText {
fn run(&self) -> Result<(), Error> {
if self.src.is_file() {
OscarDoc::extract_text(&self.src, &self.dst, self.del_src)?;
Ok(())
} else {
error!("Extraction is not supported on folders. Call on each file.");
Err(Error::Custom(
"Extraction is not supported on folders. Call on each file.".to_string(),
))
}
}
}

#[cfg(test)]
mod tests {
    use std::io::Write;

    use tempfile;

    use crate::{impls::OscarDoc, ops::ExtractText};

    /// Builds a fake corpus of 100 JSON documents (one per line) along with
    /// the text that extraction is expected to produce from it.
    ///
    /// Returns `(corpus, expected_text)`.
    pub fn setup_oscardoc() -> (String, String) {
        let mut corpus = String::new();
        let mut content_only = String::new();
        for i in 0..100 {
            // Raw string: `\n` here is the two-character JSON escape, not a newline.
            let content = format!(r#"document n{0}\nthis is document n{0}"#, i);
            corpus.push_str(&format!(
                r#"{{"content":"{content}", "metadata": ["foo"]}}"#,
            ));
            corpus.push('\n');

            // Expected output has real newlines, and documents are separated by a blank line.
            content_only.push_str(&content.replace(r#"\n"#, "\n"));
            content_only.push_str("\n\n");
        }

        (corpus, content_only)
    }

    /// Writes the fake corpus to a temporary file, runs the extraction and
    /// returns `(source still exists, extracted text, expected text)`.
    ///
    /// The destination `TempDir` is kept alive (instead of being persisted
    /// with `into_path`) so that it is cleaned up when this function returns.
    fn run_extraction(del_src: bool) -> (bool, String, String) {
        // get both documents and expected output
        let (docs, content_only) = setup_oscardoc();

        // write fake corpus
        let mut src = tempfile::NamedTempFile::new().unwrap();
        src.write_all(docs.as_bytes()).unwrap();

        // create destination directory and file path
        let dst = tempfile::tempdir().unwrap();
        let dst_path = dst.path().join("text_only.txt");

        let src_path = src.into_temp_path();
        OscarDoc::extract_text(&src_path, &dst_path, del_src).unwrap();

        // read extracted text before the temporary directory is dropped
        let text = std::fs::read_to_string(&dst_path).unwrap();

        (src_path.exists(), text, content_only)
    }

    #[test]
    fn test_extract() {
        let (src_exists, text, content_only) = run_extraction(false);

        assert!(src_exists);
        assert_eq!(text, content_only);
    }

    #[test]
    fn test_extract_rm_src() {
        let (src_exists, text, content_only) = run_extraction(true);

        assert!(!src_exists);
        assert_eq!(text, content_only);
    }
}
194 changes: 193 additions & 1 deletion src/impls/oscar_doc.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
//! OSCAR Schema v2 (See [oscar-corpus.com](https://oscar-corpus.com)) operation implementations.
//!
//! Implementations mostly use default trait implementations, as the format is simple.
use std::{
fs::File,
io::{BufRead, BufReader, Read, Write},
};

use serde_json::Value;

use crate::{
ops::{Compress, Split},
error::Error,
ops::{Compress, ExtractText, Split},
versions::{Schema, Version},
};

Expand All @@ -20,3 +28,187 @@ impl Schema for OscarDoc {
/// Use default implementation of splitting (see [crate::ops::Split])
impl Split for OscarDoc {}
/// Use default implementation of compression (see [crate::ops::Compress])
impl Compress for OscarDoc {}

/// impl block for helper functions related to [ExtractText].
impl OscarDoc {
/// Extracts content from a Document.
///
/// Fails if the `content` field is missing or is not a string.
fn extract_from_doc(doc: &str) -> Result<String, Error> {
let v: Value = serde_json::from_str(doc)?;

if let Some(content) = v.get("content") {
if let Value::String(c) = content {
let mut content_str = c.to_string().replace(r#"\n"#, "\n");
content_str.push('\n');
Ok(content_str)
} else {
Err(Error::MalformedContent(v))
}
} else {
Err(Error::MissingContent(v))
}
}

fn extract<T: Read, U: Write>(src: T, dst: &mut U) -> Result<(), Error> {
let b = BufReader::new(src);
let docs = b.lines();
for doc in docs {
//extract and add newline
let doc = doc?;
let content = Self::extract_from_doc(&doc)? + "\n";
let content_length = content.len();

// check written bytes
if dst.write(content.as_bytes())? > content_length {
error!("IO Error: Could not write into destination writer.");
}
}

// flush output
dst.flush()?;

Ok(())
}
}

impl ExtractText for OscarDoc {
fn extract_text(
src: &std::path::Path,
dst: &std::path::Path,
del_src: bool,
) -> Result<(), Error> {
if !src.is_file() {
warn!("{:?} is not a file: ignoring", src);
return Ok(());
}
let src_file = File::open(src)?;

if dst.exists() {
error!("File {:?} already exists!", dst);
return Err(std::io::Error::new(
std::io::ErrorKind::AlreadyExists,
format!("{:?}", dst),
)
.into());
}

let mut dst = dst.to_path_buf();
dst.set_extension("txt");
let mut dest_file = File::create(&dst)?;

info!("extracting text from {:?} to {:?}", src, dst);

OscarDoc::extract(src_file, &mut dest_file)?;

if del_src {
std::fs::remove_file(src)?;
}

Ok(())
}
}

#[cfg(test)]
mod tests {

    use crate::impls::OscarDoc;

    /// Four newline-delimited JSON documents used as a miniature corpus.
    fn get_doc() -> &'static str {
        r#"{"content":"foo\nbar\nbaz\nquux"}
{"content":"123456789"}
{"content":"246810"}
{"content":"test"}"#
    }

    /// A single document yields its content followed by a blank line.
    #[test]
    fn test_extract_single() {
        let first_line = get_doc().lines().next().unwrap();

        let mut out = Vec::new();
        OscarDoc::extract(first_line.as_bytes(), &mut out).unwrap();

        let text = String::from_utf8(out).unwrap();
        assert_eq!(text, "foo\nbar\nbaz\nquux\n\n");
    }

    /// Several documents are extracted in order, each followed by a blank line.
    #[test]
    fn test_extract_multiple() {
        let mut out = Vec::new();
        OscarDoc::extract(get_doc().as_bytes(), &mut out).unwrap();

        let text = String::from_utf8(out).unwrap();
        assert_eq!(
            text,
            "foo\nbar\nbaz\nquux\n\n123456789\n\n246810\n\ntest\n\n"
        );
    }

    /// A document without a `content` field is rejected.
    #[test]
    fn extract_no_content() {
        let result = OscarDoc::extract_from_doc(r#"{"no_content": "hehe"}"#);

        assert!(result.is_err())
    }

    /// A document whose `content` is not a string is rejected.
    #[test]
    fn extract_bad_content() {
        let result = OscarDoc::extract_from_doc(r#"{"content": ["hehe"]}"#);

        assert!(result.is_err())
    }

    /// A full document (headers and metadata included) only yields its content.
    #[test]
    fn text_extract_from_doc() {
        let expected = "foo\nbar\nbaz\nquux\n";

        let document = r#"
{
"content":"foo\nbar\nbaz\nquux",
"warc_headers":{
"warc-block-digest":"sha1:X3OWP47FG2O5LBNMFSNB44FJF2SSRC26",
"content-type":"text/plain",
"warc-refers-to":"<urn:uuid:83f2e1d4-5ed3-41db-86ff-f7826c4c20f9>",
"content-length":"16",
"warc-identified-content-language":"eng",
"warc-target-uri":"http://3dv2015.inria.fr/registration-2/index.html",
"warc-date":"2021-09-16T11:07:14Z",
"warc-record-id":"<urn:uuid:3304bc27-17d0-4ffd-a692-340381478a5f>",
"warc-type":"conversion"
},
"metadata":{
"identification":{
"label":"en",
"prob":0.6268374
},
"annotation":[
"short_sentences",
"footer"
],
"sentence_identifications":[
{
"label":"en",
"prob":0.93925816
},
null,
{
"label":"en",
"prob":0.9937219
},
{
"label":"en",
"prob":0.9996538
}
]
}
}
"#;

        let actual = OscarDoc::extract_from_doc(document).unwrap();
        assert_eq!(actual, expected);
    }
}
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod cli;
mod compress;
mod error;
mod extract_clean;
mod extract_text;
mod impls;
mod lang_codes;
mod ops;
Expand Down
9 changes: 9 additions & 0 deletions src/ops/extract_text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/*! Extracts textual content into new files, discarding metadata.
!*/

use std::path::Path;

use crate::error::Error;
/// Operation extracting the textual content of a corpus into a new file,
/// discarding metadata.
pub trait ExtractText {
    /// Extracts text from the corpus at `src` and writes it to `dst`.
    ///
    /// `del_src` requests deletion of `src` once extraction is done.
    /// NOTE(review): exact handling of `dst` (extension, pre-existing files)
    /// is left to implementors — check the implementing type.
    ///
    /// # Errors
    /// Returns an [`Error`] when the extraction fails.
    fn extract_text(src: &Path, dst: &Path, del_src: bool) -> Result<(), Error>;
}
Loading

0 comments on commit dcc3aad

Please sign in to comment.