Skip to content

Commit

Permalink
Make extra processors more flexible.
Browse files Browse the repository at this point in the history
  • Loading branch information
luleyleo committed Oct 7, 2024
1 parent 27183be commit 7616dd6
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 21 deletions.
5 changes: 4 additions & 1 deletion core/src/extra/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use crate::search::SearchSink;
use grep::{regex::RegexMatcher, searcher::Searcher};
use std::{error::Error, path::Path};

pub mod office;
pub mod pdf;

pub type ExtractFn = fn(path: &Path) -> Result<String, Box<dyn Error>>;
/// Function-pointer signature for the extra file processors.
///
/// A processor receives the searcher, the regex matcher, the path of the file
/// and the sink, and returns `Ok(())` or a boxed error.
pub type ExtraFn =
fn(&mut Searcher, &RegexMatcher, &Path, &mut SearchSink) -> Result<(), Box<dyn Error>>;
15 changes: 14 additions & 1 deletion core/src/extra/office.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
use crate::search::SearchSink;
use dotext::{doc::OpenOfficeDoc, *};
use grep::{regex::RegexMatcher, searcher::Searcher};
use std::{error::Error, io::Read, path::Path};

pub static EXTENSIONS: &[&str] = &["docx", "pptx", "xlsx", "odt", "odp", "ods"];

pub fn extract(path: &Path) -> Result<String, Box<dyn Error>> {
/// Searches the text extracted from the office document at `path`.
///
/// The text comes from `extract(path)`; its bytes are passed to
/// `searcher.search_slice` together with `matcher` and `sink`.
///
/// # Errors
///
/// Returns the error from `extract` or from `search_slice`, whichever
/// fails first.
pub fn process(
    searcher: &mut Searcher,
    matcher: &RegexMatcher,
    path: &Path,
    sink: &mut SearchSink,
) -> Result<(), Box<dyn Error>> {
    let contents = extract(path)?;
    let haystack = contents.as_bytes();
    searcher.search_slice(matcher, haystack, sink)?;
    Ok(())
}

fn extract(path: &Path) -> Result<String, Box<dyn Error>> {
let ext = path
.extension()
.unwrap_or_default()
Expand Down
15 changes: 14 additions & 1 deletion core/src/extra/pdf.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::search::SearchSink;
use euclid::vec2;
use grep::{regex::RegexMatcher, searcher::Searcher};
use pdf_extract::{
encryption::DecryptionError, ConvertToFmt, Document, MediaBox, OutputDev, OutputError,
Transform,
Expand All @@ -7,7 +9,18 @@ use std::{error::Error, fmt::Write, panic::catch_unwind, path::Path};

pub static EXTENSIONS: &[&str] = &["pdf"];

pub fn extract(path: &Path) -> Result<String, Box<dyn Error>> {
/// Searches the text extracted from the PDF file at `path`.
///
/// The text comes from `extract(path)`; its bytes are passed to
/// `searcher.search_slice` together with `matcher` and `sink`.
///
/// # Errors
///
/// Returns the error from `extract` or from `search_slice`, whichever
/// fails first.
pub fn process(
    searcher: &mut Searcher,
    matcher: &RegexMatcher,
    path: &Path,
    sink: &mut SearchSink,
) -> Result<(), Box<dyn Error>> {
    let extracted = extract(path)?;
    let bytes = extracted.as_bytes();
    searcher.search_slice(matcher, bytes, sink)?;
    Ok(())
}

fn extract(path: &Path) -> Result<String, Box<dyn Error>> {
let path = path.to_owned();
//because the library panics, we need to catch panics
let res = catch_unwind(|| extract_text(&path));
Expand Down
26 changes: 8 additions & 18 deletions core/src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use grep::{
};
use ignore::{WalkBuilder, WalkState};
use std::{
error::Error,
io,
path::PathBuf,
sync::{
Expand Down Expand Up @@ -71,12 +72,12 @@ pub fn run(engine: SearchEngine, params: SearchParameters) {
.same_file_system(params.flags.same_filesystem)
.build_parallel();

let mut preprocessors: Vec<(_, extra::ExtractFn)> = Vec::new();
let mut preprocessors: Vec<(_, extra::ExtraFn)> = Vec::new();
if params.flags.search_pdf {
preprocessors.push((extra::pdf::EXTENSIONS, extra::pdf::extract));
preprocessors.push((extra::pdf::EXTENSIONS, extra::pdf::process));
}
if params.flags.search_office {
preprocessors.push((extra::office::EXTENSIONS, extra::office::extract));
preprocessors.push((extra::office::EXTENSIONS, extra::office::process));
}

walker.run(|| {
Expand Down Expand Up @@ -115,18 +116,7 @@ pub fn run(engine: SearchEngine, params: SearchParameters) {
.map(|(_, extract_fn)| extract_fn);

let search_result = match pre_processor {
Some(extract_fn) => {
let slice = extract_fn(entry.path());
if let Err(err) = slice {
_ = engine.send_error(
search,
entry.path().to_path_buf(),
format!("failed to extract text from file: {}", err),
);
return WalkState::Continue;
}
searcher.search_slice(&matcher, slice.unwrap().as_bytes(), &mut sink)
}
Some(process) => process(&mut searcher, &matcher, entry.path(), &mut sink),
None => searcher.search_path(&matcher, entry.path(), &mut sink),
};

Expand Down Expand Up @@ -156,7 +146,7 @@ pub fn run(engine: SearchEngine, params: SearchParameters) {
_ = engine.sender.send(SearchMessage::Completed { search });
}

struct SearchSink {
/// Sink passed to the searcher; it implements `grep::searcher::Sink`.
pub struct SearchSink {
// Regex matcher kept on the sink.
// NOTE(review): presumably used by `extract_matches` — confirm.
matcher: RegexMatcher,
// Result entries held by the sink; they are handed out with `std::mem::take`.
entries: Vec<ResultEntry>,
}
Expand All @@ -173,7 +163,7 @@ impl SearchSink {
std::mem::take(&mut self.entries)
}

fn extract_matches(
pub fn extract_matches(
&self,
searcher: &grep::searcher::Searcher,
bytes: &[u8],
Expand All @@ -192,7 +182,7 @@ impl SearchSink {
}

impl grep::searcher::Sink for SearchSink {
type Error = io::Error;
type Error = Box<dyn Error>;

fn matched(
&mut self,
Expand Down

0 comments on commit 7616dd6

Please sign in to comment.