From a59a97bbc8edacd2a30cc6e84bc06ad5bfe4533f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 15:38:02 +0200 Subject: [PATCH] Introduce a new default normalizer that removes zeroes from tokens --- src/analyzer.rs | 8 ++++++-- src/normalizer/mod.rs | 2 ++ src/normalizer/zeroes_remover.rs | 31 +++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 src/normalizer/zeroes_remover.rs diff --git a/src/analyzer.rs b/src/analyzer.rs index 4e69251..d4ec33e 100644 --- a/src/analyzer.rs +++ b/src/analyzer.rs @@ -4,7 +4,7 @@ use fst::Set; use once_cell::sync::Lazy; use crate::detection::is_latin; -use crate::normalizer::{Normalizer, DeunicodeNormalizer, LowercaseNormalizer}; +use crate::normalizer::{DeunicodeNormalizer, LowercaseNormalizer, Normalizer, ZeroesRemover}; use crate::processors::{PreProcessor, IdentityPreProcessor, ProcessedText, ChineseTranslationPreProcessor}; use crate::token_classifier::TokenClassifier; use crate::Token; @@ -22,7 +22,11 @@ impl Default for Pipeline { fn default() -> Self { // Hotfix: make a common default normalizer for every pipeline let deunicoder = DeunicodeNormalizer::new(&|text: &str| !text.chars().next().map_or(true, is_latin)); - let normalizer: Vec<Box<dyn Normalizer>> = vec![Box::new(deunicoder), Box::new(LowercaseNormalizer)]; + let normalizer: Vec<Box<dyn Normalizer>> = vec![ + Box::new(deunicoder), + Box::new(LowercaseNormalizer), + Box::new(ZeroesRemover), + ]; Self { pre_processor: Box::new(IdentityPreProcessor), diff --git a/src/normalizer/mod.rs b/src/normalizer/mod.rs index e2d2b3d..5a18a47 100644 --- a/src/normalizer/mod.rs +++ b/src/normalizer/mod.rs @@ -1,12 +1,14 @@ mod identity; mod lowercase; mod deunicoder; +mod zeroes_remover; use crate::Token; pub use identity::IdentityNormalizer; pub use lowercase::LowercaseNormalizer; pub use deunicoder::DeunicodeNormalizer; +pub use zeroes_remover::ZeroesRemover; pub trait Normalizer: Sync + Send { fn normalize<'a>(&self, token: Token<'a>) -> Token<'a>; diff 
--git a/src/normalizer/zeroes_remover.rs b/src/normalizer/zeroes_remover.rs
new file mode 100644
index 0000000..1203f3e
--- /dev/null
+++ b/src/normalizer/zeroes_remover.rs
@@ -0,0 +1,45 @@
+use super::Normalizer;
+use crate::Token;
+
+/// A [`Normalizer`] that strips every NUL character (`'\0'`) from a token's word.
+pub struct ZeroesRemover;
+
+impl Normalizer for ZeroesRemover {
+    /// Returns `token` with all `'\0'` characters removed from `token.word`.
+    ///
+    /// A word that contains no NUL is returned untouched, so a borrowed word stays borrowed.
+    fn normalize<'a>(&self, mut token: Token<'a>) -> Token<'a> {
+        if token.word.contains('\0') {
+            // `to_mut` reuses the buffer of an already-owned word and only copies a
+            // borrowed one; `retain` then drops the NUL characters in place.
+            token.word.to_mut().retain(|c| c != '\0');
+        }
+        token
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::borrow::Cow;
+
+    use super::*;
+
+    fn token_of(word: &str) -> Token<'_> {
+        Token {
+            word: Cow::Borrowed(word),
+            ..Token::default()
+        }
+    }
+
+    #[test]
+    fn remove_zeroes() {
+        let token = ZeroesRemover.normalize(token_of("\0lol\0oo\0"));
+        assert_eq!(token.word, "loloo");
+    }
+
+    #[test]
+    fn keep_words_without_zeroes_borrowed() {
+        let token = ZeroesRemover.normalize(token_of("loloo"));
+        assert!(matches!(token.word, Cow::Borrowed("loloo")));
+    }
+}