Normalizer for Russian #296

Open · wants to merge 3 commits into main
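This PR adds a Russian-specific normalizer to charabia. It folds the decomposed form of Ё/ё (Е/е followed by the combining diaeresis U+0308) into plain Е/е, the common ё → е folding used in written Russian. A minimal end-to-end sketch of the intended behavior, assuming charabia's `Tokenize` trait and the new `russian` feature enabled; the input word and expected lemma are illustrative, extrapolated from the tests below:

```rust
use charabia::Tokenize;

fn main() {
    // Compatibility decomposition first turns "Ёлка" into "Е\u{308}лка",
    // then the pipeline lowercases it and this normalizer drops the
    // combining mark.
    let mut tokens = "Ёлка".tokenize();
    let token = tokens.next().unwrap();
    assert_eq!(token.lemma(), "елка");
}
```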
5 changes: 4 additions & 1 deletion charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -57,6 +57,9 @@ thai = []
# allow greek specialized tokenization
greek = []

# allow russian specialized tokenization
russian = []

# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

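Note that `russian` is added to the default feature set, so the new normalizer is on by default; a downstream crate built with `default-features = false` would need to opt in explicitly. A hypothetical consumer-side Cargo.toml (version number illustrative):

```toml
[dependencies]
charabia = { version = "0.8", default-features = false, features = ["russian"] }
```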
24 changes: 24 additions & 0 deletions charabia/src/normalizer/compatibility_decomposition.rs
@@ -52,6 +52,13 @@ mod test {
// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
..Default::default()
},
Token {
// Decompose 1E69 to 0073 0323 0307
lemma: Owned("ṩ ṩ".to_string()),
@@ -74,6 +81,14 @@
// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("Е\u{308}е\u{308}".to_string()),
char_end: 2,
byte_end: 2,
char_map: Some(vec![(2, 4), (2, 4)]),
script: Script::Cyrillic,
..Default::default()
},
Token {
lemma: Owned("s\u{0323}\u{0307} s\u{0323}\u{0307}".to_string()),
char_end: 2,
@@ -108,6 +123,15 @@
// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![
Token {
lemma: Owned("ее".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: Some(vec![(2, 2), (2, 2)]),
kind: TokenKind::Word,
..Default::default()
},
Token {
lemma: Owned("s s".to_string()),
char_end: 2,
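For context on the char_map entries above: each pair records the byte length of a source character and of its normalized output, so "Ёё" maps as (2, 4), (2, 4) after decomposition and (2, 2), (2, 2) after the full pipeline. A standalone sketch with the `unicode-normalization` crate this project already depends on, assumed to mirror what the compatibility-decomposition step does:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'Ё' (U+0401, 2 bytes in UTF-8) decomposes under NFKD into
    // 'Е' (U+0415) plus U+0308 (combining diaeresis), 4 bytes total.
    let decomposed: String = "Ёё".nfkd().collect();
    assert_eq!(decomposed, "Е\u{308}е\u{308}");
    assert_eq!("Ёё".len(), 4);       // two (2, _) source entries
    assert_eq!(decomposed.len(), 8); // two (_, 4) normalized entries
}
```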
6 changes: 6 additions & 0 deletions charabia/src/normalizer/mod.rs
@@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
#[cfg(feature = "russian")]
pub use self::russian::RussianNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "turkish")]
@@ -39,6 +41,8 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "russian")]
mod russian;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "turkish")]
@@ -75,6 +79,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(NonspacingMarkNormalizer),
#[cfg(feature = "vietnamese")]
Box::new(VietnameseNormalizer),
#[cfg(feature = "russian")]
Box::new(RussianNormalizer),
#[cfg(feature = "turkish")]
Box::new(TurkishNormalizer),
]
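Ordering note: judging from the tests in this diff, `RussianNormalizer` sits in `LOSSY_NORMALIZERS`, which run after the lossless normalizers (compatibility decomposition and lowercasing among them). By the time it sees a token, a precomposed Ё/ё has already been split into Е/е plus U+0308, which is why the matcher in the new file only looks for the decomposed pair.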
133 changes: 133 additions & 0 deletions charabia/src/normalizer/russian.rs
@@ -0,0 +1,133 @@
use std::borrow::Cow;

use super::{Normalizer, NormalizerOption};
use crate::{Script, Token};
use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;

pub struct RussianNormalizer;

static MATCHING_STR: Lazy<AhoCorasick> =
Lazy::new(|| AhoCorasick::new(["Е\u{308}", "е\u{308}"]).unwrap());

impl Normalizer for RussianNormalizer {
fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
match token.char_map.take() {
Some(mut char_map) => {
// if a char_map already exists, iterate over it to reconstruct sub-strings.
let mut lemma = String::new();
let mut tail = token.lemma.as_ref();
let mut normalized = String::new();
for (_, normalized_len) in char_map.iter_mut() {
let (head, t) = tail.split_at(*normalized_len as usize);
tail = t;
normalized.clear();
// then normalize each sub-string, recomputing its size in the char_map.
let mut peekable = head.chars().peekable();
while let Some(c) = peekable.next() {
let (c, peek_consumed) = normalize_russian(c, peekable.peek());

if peek_consumed {
peekable.next();
}

normalized.push(c);
}

*normalized_len = normalized.len() as u8;
lemma.push_str(normalized.as_ref());
}

token.lemma = Cow::Owned(lemma);
token.char_map = Some(char_map);
}
None => {
// if no char_map exists, iterate over the lemma, folding decomposed pairs into single characters.
let mut char_map = Vec::new();
let mut lemma = String::new();
let mut peekable = token.lemma.chars().peekable();
while let Some(c) = peekable.next() {
let (normalized, peek_consumed) = normalize_russian(c, peekable.peek());

if peek_consumed {
peekable.next();
}

if options.create_char_map {
char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
}
lemma.push(normalized);
}
token.lemma = Cow::Owned(lemma);
if options.create_char_map {
token.char_map = Some(char_map);
}
}
}

token
}

fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Cyrillic && MATCHING_STR.is_match(token.lemma())
}
}

// https://en.wikipedia.org/wiki/Russian_alphabet
// Only decomposed forms are considered, as compatibility decomposition already takes care of 1-codepoint forms.
fn normalize_russian(current: char, next: Option<&char>) -> (char, bool) {
match (current, next) {
// ё -> е, grammatically permissible, common in writing
('Е', Some('\u{308}')) => ('Е', true),
('е', Some('\u{308}')) => ('е', true),

(c, _) => (c, false),
}
}

#[cfg(test)]
mod test {
use std::borrow::Cow::Owned;

use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
..Default::default()
}]
}

// expected result of the current Normalizer.
fn normalizer_result() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("Ёё".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: None,
..Default::default()
}]
}

// expected result of the complete Normalizer pipeline.
fn normalized_tokens() -> Vec<Token<'static>> {
vec![Token {
lemma: Owned("ее".to_string()),
char_end: 2,
byte_end: 2,
script: Script::Cyrillic,
char_map: Some(vec![(2, 2), (2, 2)]),
kind: TokenKind::Word,
..Default::default()
}]
}

test_normalizer!(RussianNormalizer, tokens(), normalizer_result(), normalized_tokens());
}
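The core of the new file is the one-character lookahead in `normalize_russian`: when the current character is Е/е and the next is U+0308, the combining mark is consumed and dropped. The `should_normalize` prescan uses an `AhoCorasick` automaton over the two decomposed patterns, so the rewrite only runs on tokens that actually contain one. A standalone restatement of the lookahead loop (hypothetical helper name, not part of the PR):

```rust
/// Fold a decomposed Ё/ё (Е/е + U+0308) into plain Е/е.
fn strip_diaeresis(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if matches!(c, 'Е' | 'е') && chars.peek() == Some(&'\u{308}') {
            chars.next(); // consume and drop the combining diaeresis
        }
        out.push(c);
    }
    out
}
```

Called on the decomposed "Е\u{308}е\u{308}", this returns "Ее", matching the char_map shrink from (2, 4) to (2, 2) between the pipeline tests above.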