Skip to content

Commit

Permalink
Merge branch 'main' into basic-cyrillic-normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
ManyTheFish committed Aug 28, 2024
2 parents 1872257 + dd260b9 commit 2edcf4a
Show file tree
Hide file tree
Showing 14 changed files with 602 additions and 126 deletions.
25 changes: 12 additions & 13 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "charabia"
version = "0.8.11"
version = "0.9.0"
license = "MIT"
authors = ["Many <many@meilisearch.com>"]
edition = "2021"
Expand All @@ -12,30 +12,26 @@ categories = ["text-processing"]
exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.2"
cow-utils = "0.1"
aho-corasick = "1.1.3"
csv = "1.3.0"
deunicode = "1.4.2"
either = "1.9.0"
either = "1.13.0"
finl_unicode = { version= "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.6", optional = true }
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.31.0", default-features = false, optional = true }
lindera = { version = "=0.32.2", default-features = false, optional = true }
pinyin = { version = "0.10", default-features = false, features = [
"with_tone",
], optional = true }
wana_kana = { version = "3.0.0", optional = true }
unicode-normalization = "0.1.22"
irg-kvariants = "0.1.0"
litemap = "0.7.2"
zerovec = "0.10.1"
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "russian"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
Expand Down Expand Up @@ -78,6 +74,9 @@ latin-snakecase = ["dep:finl_unicode"]
# force Charabia to recompose Swedish characters
swedish-recomposition = []

# allow turkish specialized tokenization
turkish = []

[dev-dependencies]
criterion = "0.5"
jemallocator = "0.5.4"
Expand Down
2 changes: 1 addition & 1 deletion charabia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor

| Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
|---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
Expand Down
35 changes: 13 additions & 22 deletions charabia/src/detection/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::HashMap;

pub use script_language::{Language, Script};
use whatlang::Detector;

Expand All @@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
inner: &'o str,
pub script: Option<Script>,
pub language: Option<Language>,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
allow_list: Option<&'al [Language]>,
}

impl<'o, 'al> StrDetection<'o, 'al> {
pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
Self { inner, script: None, language: None, allow_list }
}

Expand All @@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
*self.script.get_or_insert_with(|| Self::detect_script(inner))
}

pub fn language(&mut self) -> Language {
pub fn language(&mut self) -> Option<Language> {
let inner = self.inner;
let script = self.script();
*self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
self.language = match self.language.take() {
Some(lang) => Some(lang),
None => Self::detect_lang(inner, self.allow_list),
};

self.language
}

/// detect script with whatlang,
Expand All @@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {

/// detect lang with whatlang, restricted to the allow list when one is provided
/// if no language is detected, return None
fn detect_lang(
text: &str,
script: Script,
allow_list: Option<&HashMap<Script, Vec<Language>>>,
) -> Language {
fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
let detector = allow_list
.and_then(|allow_list| allow_list.get(&script))
.map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
.map(Detector::with_allowlist)
.unwrap_or_default();

detector.detect_lang(text).map(Language::from).unwrap_or_default()
detector.detect_lang(text).map(Language::from)
}
}

pub trait Detect<'o, 'al> {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al>;
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
}

impl<'o, 'al> Detect<'o, 'al> for &str {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al> {
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
StrDetection::new(self, allow_list)
}
}
32 changes: 12 additions & 20 deletions charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ use core::str::FromStr;

#[cfg(test)]
use quickcheck::{Arbitrary, Gen};
use serde::{Deserialize, Serialize};

use super::chars;

macro_rules! make_language {
($($language:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Language {
$($language),+,
Other,
}
impl From<whatlang::Lang> for Language {
fn from(other: whatlang::Lang) -> Language {
Expand All @@ -24,27 +24,19 @@ macro_rules! make_language {
fn from(other: Language) -> whatlang::Lang {
match other {
$(Language::$language => whatlang::Lang::$language), +,
_other => whatlang::Lang::Eng,
}
}
}

impl Default for Language {
fn default() -> Self {
Self::Other
}
}

impl Language {
pub fn name(&self) -> &'static str {
pub fn code(&self) -> &'static str {
match self {
$(Language::$language => whatlang::Lang::$language.code()), +,
_other => "other",
}
}

pub fn from_name<S: AsRef<str>>(code: S) -> Language {
whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
}
}
};
Expand Down Expand Up @@ -124,7 +116,7 @@ make_language! {

macro_rules! make_script {
($($script:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Script {
$($script),+,
Cj,
Expand Down Expand Up @@ -361,12 +353,12 @@ mod test {

#[test]
fn from_into_language() {
assert_eq!(Language::Eng.name(), "eng");
assert_eq!(Language::from_name("eng"), Language::Eng);
assert_eq!(Language::Jpn.name(), "jpn");
assert_eq!(Language::from_name("jpn"), Language::Jpn);
assert_eq!(Language::Cmn.name(), "cmn");
assert_eq!(Language::from_name("cmn"), Language::Cmn);
assert_eq!(Language::Eng.code(), "eng");
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
assert_eq!(Language::Jpn.code(), "jpn");
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
assert_eq!(Language::Cmn.code(), "cmn");
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ mod detection;
mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use detection::{Language, Script, StrDetection};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};
Expand Down
4 changes: 2 additions & 2 deletions charabia/src/normalizer/arabic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use crate::{Script, Token};
/// - normalizing the arabic Alef 'أ','إ','آ','ٱ' to 'ا'
/// - normalizing the arabic Yeh 'ى' to 'ي'
/// - Normalizing the arabic Taa Marbuta 'ة' to 'ه'
/// https://en.wikipedia.org/wiki/Arabic_alphabet
/// https://en.wikipedia.org/wiki/Kashida
/// https://en.wikipedia.org/wiki/Arabic_alphabet
/// https://en.wikipedia.org/wiki/Kashida

pub struct ArabicNormalizer;

Expand Down
19 changes: 14 additions & 5 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use self::quote::QuoteNormalizer;
pub use self::russian::RussianNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "turkish")]
pub use self::turkish::TurkishNormalizer;
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
Expand All @@ -43,6 +45,8 @@ mod quote;
mod russian;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "turkish")]
mod turkish;
#[cfg(feature = "vietnamese")]
mod vietnamese;

Expand Down Expand Up @@ -77,6 +81,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
Box::new(VietnameseNormalizer),
#[cfg(feature = "russian")]
Box::new(RussianNormalizer),
#[cfg(feature = "turkish")]
Box::new(TurkishNormalizer),
]
});

Expand All @@ -87,12 +93,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
};

/// Iterator over Normalized [`Token`]s.
pub struct NormalizedTokenIter<'o, 'tb> {
token_iter: SegmentedTokenIter<'o, 'tb>,
pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
options: &'tb NormalizerOption<'tb>,
}

impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
type Item = Token<'o>;

fn next(&mut self) -> Option<Self::Item> {
Expand Down Expand Up @@ -238,11 +244,14 @@ impl From<String> for CharOrStr {
}
}

impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
/// Normalize [`Token`]s using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
pub fn normalize<'tb>(
self,
options: &'tb NormalizerOption<'tb>,
) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
NormalizedTokenIter { token_iter: self, options }
}
}
Expand Down
10 changes: 7 additions & 3 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};
use crate::{Language, Token};

static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
AhoCorasick::new(["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
.unwrap()
});

Expand Down Expand Up @@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {

// Returns `true` if the Normalizer should be used.
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
}
}

Expand All @@ -101,6 +101,7 @@ mod test {
use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;
use crate::Script;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
Expand All @@ -109,6 +110,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand All @@ -121,6 +123,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down Expand Up @@ -148,6 +151,7 @@ mod test {
]),
script: Script::Latin,
kind: TokenKind::Word,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down
Loading

0 comments on commit 2edcf4a

Please sign in to comment.