Skip to content

Commit

Permalink
Merge #52
Browse files Browse the repository at this point in the history
52: Introduce a new default normalizer that removes zeroes from tokens r=Kerollmops a=Kerollmops



Co-authored-by: Kerollmops <clement@meilisearch.com>
  • Loading branch information
bors[bot] and Kerollmops authored Jul 22, 2021
2 parents c2399c3 + a59a97b commit 271dc70
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/analyzer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use fst::Set;
use once_cell::sync::Lazy;

use crate::detection::is_latin;
use crate::normalizer::{Normalizer, DeunicodeNormalizer, LowercaseNormalizer};
use crate::normalizer::{DeunicodeNormalizer, LowercaseNormalizer, Normalizer, ZeroesRemover};
use crate::processors::{PreProcessor, IdentityPreProcessor, ProcessedText, ChineseTranslationPreProcessor};
use crate::token_classifier::TokenClassifier;
use crate::Token;
Expand All @@ -22,7 +22,11 @@ impl Default for Pipeline {
fn default() -> Self {
// Hotfix: make a common default normalizer for every pipeline
let deunicoder = DeunicodeNormalizer::new(&|text: &str| !text.chars().next().map_or(true, is_latin));
let normalizer: Vec<Box<dyn Normalizer>> = vec![Box::new(deunicoder), Box::new(LowercaseNormalizer)];
let normalizer: Vec<Box<dyn Normalizer>> = vec![
Box::new(deunicoder),
Box::new(LowercaseNormalizer),
Box::new(ZeroesRemover),
];

Self {
pre_processor: Box::new(IdentityPreProcessor),
Expand Down
2 changes: 2 additions & 0 deletions src/normalizer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
mod identity;
mod lowercase;
mod deunicoder;
mod zeroes_remover;

use crate::Token;

pub use identity::IdentityNormalizer;
pub use lowercase::LowercaseNormalizer;
pub use deunicoder::DeunicodeNormalizer;
pub use zeroes_remover::ZeroesRemover;

pub trait Normalizer: Sync + Send {
fn normalize<'a>(&self, token: Token<'a>) -> Token<'a>;
Expand Down
31 changes: 31 additions & 0 deletions src/normalizer/zeroes_remover.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use std::borrow::Cow;

use super::Normalizer;
use crate::Token;

/// A [`Normalizer`] that strips NUL (`'\0'`) characters from a token's word.
pub struct ZeroesRemover;

impl Normalizer for ZeroesRemover {
    /// Returns the token with every NUL (`'\0'`) character removed from its word.
    ///
    /// A word that contains no NUL is handed back untouched; otherwise the
    /// word is replaced by a freshly built owned string without the NULs.
    fn normalize<'a>(&self, mut token: Token<'a>) -> Token<'a> {
        // Guard clause: nothing to strip, so keep the word exactly as it came in.
        if !token.word.contains('\0') {
            return token;
        }

        let cleaned: String = token.word.replace('\0', "");
        token.word = Cow::Owned(cleaned);
        token
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// NUL characters are stripped and every other character is kept in order.
    #[test]
    fn remove_zeroes() {
        let s = "\0loloo\0";
        let token = Token {
            word: Cow::Borrowed(s),
            ..Token::default()
        };
        let token = ZeroesRemover.normalize(token);
        assert!(!token.word.chars().any(|c| c == '\0'));
        // Pin the exact result: only checking for the absence of NULs would
        // also pass if the normalizer dropped or mangled the other characters.
        assert_eq!(token.word, "loloo");
    }

    /// A word without any NUL is passed through unchanged and stays borrowed.
    #[test]
    fn keep_words_without_zeroes() {
        let s = "loloo";
        let token = Token {
            word: Cow::Borrowed(s),
            ..Token::default()
        };
        let token = ZeroesRemover.normalize(token);
        assert_eq!(token.word, "loloo");
        assert!(matches!(token.word, Cow::Borrowed(_)));
    }
}

0 comments on commit 271dc70

Please sign in to comment.