Skip to content

Commit

Permalink
feat: handle whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
u8slvn committed Feb 4, 2024
1 parent d66e1cf commit a53aea6
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ crate-type = ["cdylib"]
pyo3 = "0.19.0"
rayon = "1.8.1"
regex = "1.10.3"
lazy_static = "1.4.0"
23 changes: 16 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,31 @@
use pyo3::prelude::*;
use rayon::prelude::*;
use regex::Regex;
use std::collections::HashMap;

/// Pattern used to split text into tokens: a run of word characters (`\w+`) or
/// any single non-word character (`[^\w]`).
///
/// NOTE(review): the trailing `[\s]` alternative looks redundant, since a
/// whitespace character is already matched by `[^\w]` — confirm before removing.
const RE_WORDS: &str = r"\w+|[^\w]|[\s]";
#[macro_use]
extern crate lazy_static;

// fn load_regexen() -> HashMap<&str, Regex> {
// let mut regexen = HashMap::new();
// regexen.insert("words", Regex::new(RE_WORDS).unwrap());
// Ok(regexen)
// }
lazy_static! {
    /// Table of named regular expressions, compiled once on first access.
    ///
    /// * `"WORDS"` — a run of word characters, or any single non-word character.
    /// * `"WHITESPACE"` — a single whitespace character.
    static ref REGEXEN: HashMap<&'static str, Regex> = HashMap::from([
        ("WORDS", Regex::new(r"\w+|[^\w]|[\s]").unwrap()),
        ("WHITESPACE", Regex::new(r"\s").unwrap()),
    ]);
}

// Extract all words from the given text
fn split_words(text: &str) -> Vec<&str> {
let re_words = Regex::new(RE_WORDS).unwrap();
let re_words = REGEXEN.get("WORDS").unwrap();
re_words.find_iter(text).map(|m| m.as_str()).collect()
}

fn process_word(word: &str, affix: &str, postfix: &str) -> String {
if REGEXEN.get("WHITESPACE").unwrap().is_match(word) {
return word.to_string();
}

let mid_word = word.len() / 2;
let start = &word[..mid_word];
let end = &word[mid_word..];
Expand Down

0 comments on commit a53aea6

Please sign in to comment.