Merge branch 'main' into basic-cyrillic-normalization

meilisearch · Aug 28, 2024 · 2edcf4a · 2edcf4a
2 parents 1872257 + dd260b9
commit 2edcf4a
Show file tree

Hide file tree

Showing 14 changed files with 602 additions and 126 deletions.
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "charabia"
-version = "0.8.11"
+version = "0.9.0"
 license = "MIT"
 authors = ["Many <many@meilisearch.com>"]
 edition = "2021"
@@ -12,30 +12,26 @@ categories = ["text-processing"]
 exclude = ["dictionaries/txt/thai/words.txt"]
 
 [dependencies]
-aho-corasick = "1.1.2"
-cow-utils = "0.1"
+aho-corasick = "1.1.3"
 csv = "1.3.0"
-deunicode = "1.4.2"
-either = "1.9.0"
+either = "1.13.0"
 finl_unicode = { version= "1.2.0", optional = true }
 fst = "0.4"
-jieba-rs = { version = "0.6", optional = true }
+jieba-rs = { version = "0.7", optional = true }
 once_cell = "1.19.0"
-serde = "1.0"
+serde = "1.0.192"
 slice-group-by = "0.3.1"
 whatlang = "0.16.4"
-lindera = { version = "=0.31.0", default-features = false, optional = true }
+lindera = { version = "=0.32.2", default-features = false, optional = true }
 pinyin = { version = "0.10", default-features = false, features = [
   "with_tone",
 ], optional = true }
 wana_kana = { version = "3.0.0", optional = true }
-unicode-normalization = "0.1.22"
-irg-kvariants = "0.1.0"
-litemap = "0.7.2"
-zerovec = "0.10.1"
+unicode-normalization = "0.1.23"
+irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "russian"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "russian"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]
@@ -78,6 +74,9 @@ latin-snakecase = ["dep:finl_unicode"]
 # force Charabia to recompose Swedish characters
 swedish-recomposition = []
 
+# allow turkish specialized tokenization
+turkish = []
+
 [dev-dependencies]
 criterion = "0.5"
 jemallocator = "0.5.4"

diff --git a/charabia/README.md b/charabia/README.md
@@ -16,7 +16,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 
 |  Script / Language  |                           specialized segmentation                            | specialized normalization | Segmentation Performance level | Tokenization Performance level |
 |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
-| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization         | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
+| **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization         | 🟩 ~27MiB/sec    | 🟨 ~8MiB/sec    |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase          | 🟩 ~27MiB/sec    | 🟨 ~9MiB/sec    |
 | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec    | 🟧 ~5MiB/sec    |

diff --git a/charabia/src/detection/mod.rs b/charabia/src/detection/mod.rs
@@ -1,5 +1,3 @@
-use std::collections::HashMap;
-
 pub use script_language::{Language, Script};
 use whatlang::Detector;
 
@@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
     inner: &'o str,
     pub script: Option<Script>,
     pub language: Option<Language>,
-    allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
+    allow_list: Option<&'al [Language]>,
 }
 
 impl<'o, 'al> StrDetection<'o, 'al> {
-    pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
+    pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
         Self { inner, script: None, language: None, allow_list }
     }
 
@@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
         *self.script.get_or_insert_with(|| Self::detect_script(inner))
     }
 
-    pub fn language(&mut self) -> Language {
+    pub fn language(&mut self) -> Option<Language> {
         let inner = self.inner;
-        let script = self.script();
-        *self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
+        self.language = match self.language.take() {
+            Some(lang) => Some(lang),
+            None => Self::detect_lang(inner, self.allow_list),
+        };
+
+        self.language
     }
 
     /// detect script with whatlang,
@@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {
 
     /// detect lang with whatlang
     /// if no language is detected, return `None`
-    fn detect_lang(
-        text: &str,
-        script: Script,
-        allow_list: Option<&HashMap<Script, Vec<Language>>>,
-    ) -> Language {
+    fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
         let detector = allow_list
-            .and_then(|allow_list| allow_list.get(&script))
             .map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
             .map(Detector::with_allowlist)
             .unwrap_or_default();
 
-        detector.detect_lang(text).map(Language::from).unwrap_or_default()
+        detector.detect_lang(text).map(Language::from)
     }
 }
 
 pub trait Detect<'o, 'al> {
-    fn detect(
-        &'o self,
-        allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
-    ) -> StrDetection<'o, 'al>;
+    fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
 }
 
 impl<'o, 'al> Detect<'o, 'al> for &str {
-    fn detect(
-        &'o self,
-        allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
-    ) -> StrDetection<'o, 'al> {
+    fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
         StrDetection::new(self, allow_list)
     }
 }
diff --git a/charabia/src/detection/script_language.rs b/charabia/src/detection/script_language.rs
@@ -2,15 +2,15 @@ use core::str::FromStr;
 
 #[cfg(test)]
 use quickcheck::{Arbitrary, Gen};
+use serde::{Deserialize, Serialize};
 
 use super::chars;
 
 macro_rules! make_language {
     ($($language:tt), +) => {
-        #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
+        #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
         pub enum Language {
             $($language),+,
-            Other,
         }
         impl From<whatlang::Lang> for Language {
             fn from(other: whatlang::Lang) -> Language {
@@ -24,27 +24,19 @@ macro_rules! make_language {
             fn from(other: Language) -> whatlang::Lang {
                 match other {
                     $(Language::$language => whatlang::Lang::$language), +,
-                    _other => whatlang::Lang::Eng,
                 }
             }
         }
 
-        impl Default for Language {
-            fn default() -> Self {
-                Self::Other
-            }
-        }
-
         impl Language {
-            pub fn name(&self) -> &'static str {
+            pub fn code(&self) -> &'static str {
                 match self {
                     $(Language::$language => whatlang::Lang::$language.code()), +,
-                    _other => "other",
                 }
             }
 
-            pub fn from_name<S: AsRef<str>>(code: S) -> Language {
-                whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
+            pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
+                whatlang::Lang::from_code(code.as_ref()).map(Language::from)
             }
         }
     };
@@ -124,7 +116,7 @@ make_language! {
 
 macro_rules! make_script {
     ($($script:tt), +) => {
-        #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
+        #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
         pub enum Script {
             $($script),+,
             Cj,
@@ -361,12 +353,12 @@ mod test {
 
     #[test]
     fn from_into_language() {
-        assert_eq!(Language::Eng.name(), "eng");
-        assert_eq!(Language::from_name("eng"), Language::Eng);
-        assert_eq!(Language::Jpn.name(), "jpn");
-        assert_eq!(Language::from_name("jpn"), Language::Jpn);
-        assert_eq!(Language::Cmn.name(), "cmn");
-        assert_eq!(Language::from_name("cmn"), Language::Cmn);
+        assert_eq!(Language::Eng.code(), "eng");
+        assert_eq!(Language::from_code("eng"), Some(Language::Eng));
+        assert_eq!(Language::Jpn.code(), "jpn");
+        assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
+        assert_eq!(Language::Cmn.code(), "cmn");
+        assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
     }
 
     #[test]

diff --git a/charabia/src/lib.rs b/charabia/src/lib.rs
@@ -56,7 +56,7 @@ mod detection;
 mod token;
 mod tokenizer;
 
-pub use detection::{Language, Script};
+pub use detection::{Language, Script, StrDetection};
 pub use normalizer::Normalize;
 pub use segmenter::Segment;
 pub use token::{SeparatorKind, Token, TokenKind};

diff --git a/charabia/src/normalizer/arabic.rs b/charabia/src/normalizer/arabic.rs
@@ -8,8 +8,8 @@ use crate::{Script, Token};
 /// - normalizing the arabic Alef 'أ','إ','آ','ٱ' to 'ا'
 /// - normalizing the arabic Yeh 'ى' to 'ي'
 /// - Normalizing the arabic Taa Marbuta 'ة' to 'ه'
-/// https://en.wikipedia.org/wiki/Arabic_alphabet
-/// https://en.wikipedia.org/wiki/Kashida
+///   https://en.wikipedia.org/wiki/Arabic_alphabet
+///   https://en.wikipedia.org/wiki/Kashida
 
 pub struct ArabicNormalizer;
 

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
@@ -19,6 +19,8 @@ use self::quote::QuoteNormalizer;
 pub use self::russian::RussianNormalizer;
 #[cfg(feature = "swedish-recomposition")]
 use self::swedish_recomposition::SwedishRecompositionNormalizer;
+#[cfg(feature = "turkish")]
+pub use self::turkish::TurkishNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -43,6 +45,8 @@ mod quote;
 mod russian;
 #[cfg(feature = "swedish-recomposition")]
 mod swedish_recomposition;
+#[cfg(feature = "turkish")]
+mod turkish;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
 
@@ -77,6 +81,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(VietnameseNormalizer),
         #[cfg(feature = "russian")]
         Box::new(RussianNormalizer),
+        #[cfg(feature = "turkish")]
+        Box::new(TurkishNormalizer),
     ]
 });
 
@@ -87,12 +93,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
 };
 
 /// Iterator over Normalized [`Token`]s.
-pub struct NormalizedTokenIter<'o, 'tb> {
-    token_iter: SegmentedTokenIter<'o, 'tb>,
+pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
+    token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
     options: &'tb NormalizerOption<'tb>,
 }
 
-impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
+impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
     type Item = Token<'o>;
 
     fn next(&mut self) -> Option<Self::Item> {
@@ -238,11 +244,14 @@ impl From<String> for CharOrStr {
     }
 }
 
-impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
+impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
     /// Normalize [`Token`]s using all the compatible Normalizers.
     ///
     /// A Latin `Token` would not be normalized the same as a Chinese `Token`.
-    pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
+    pub fn normalize<'tb>(
+        self,
+        options: &'tb NormalizerOption<'tb>,
+    ) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
         NormalizedTokenIter { token_iter: self, options }
     }
 }

diff --git a/charabia/src/normalizer/swedish_recomposition.rs b/charabia/src/normalizer/swedish_recomposition.rs
@@ -5,10 +5,10 @@ use once_cell::sync::Lazy;
 
 use super::Normalizer;
 use crate::normalizer::NormalizerOption;
-use crate::{Script, Token};
+use crate::{Language, Token};
 
 static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
-    AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
+    AhoCorasick::new(["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
         .unwrap()
 });
 
@@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {
 
     // Returns `true` if the Normalizer should be used.
     fn should_normalize(&self, token: &Token) -> bool {
-        token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
+        token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
     }
 }
 
@@ -101,6 +101,7 @@ mod test {
     use crate::normalizer::test::test_normalizer;
     use crate::normalizer::Normalizer;
     use crate::token::TokenKind;
+    use crate::Script;
 
     // base tokens to normalize.
     fn tokens() -> Vec<Token<'static>> {
@@ -109,6 +110,7 @@ mod test {
             char_end: 13,
             byte_end: 19,
             script: Script::Latin,
+            language: Some(Language::Swe),
             ..Default::default()
         }]
     }
@@ -121,6 +123,7 @@ mod test {
             char_end: 13,
             byte_end: 19,
             script: Script::Latin,
+            language: Some(Language::Swe),
             ..Default::default()
         }]
     }
@@ -148,6 +151,7 @@ mod test {
             ]),
             script: Script::Latin,
             kind: TokenKind::Word,
+            language: Some(Language::Swe),
             ..Default::default()
         }]
     }