Skip to content

Commit

Permalink
Merge #308
Browse files Browse the repository at this point in the history
308: Prepare v0.9.1 r=Kerollmops a=ManyTheFish

# Pull Request
Make some modifications to prepare v0.9.1.


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish authored Sep 19, 2024
2 parents 2dc8ac8 + 30692ec commit 2d90e4c
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 12 deletions.
2 changes: 1 addition & 1 deletion charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
Expand Down
1 change: 1 addition & 0 deletions charabia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
| Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level |
|---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
| **Latin** | ✅ CamelCase segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Latin** - **German** | ✅ CamelCase segmentation + German word segmentation |[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec |
| **Greek** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 ~8MiB/sec |
| **Cyrillic** - **Georgian** ||[compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec |
| **Chinese** **CMN** 🇨🇳 |[jieba](https://github.com/messense/jieba-rs) |[compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec |
Expand Down
6 changes: 5 additions & 1 deletion charabia/src/detection/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ impl<'o, 'al> StrDetection<'o, 'al> {
let inner = self.inner;
self.language = match self.language.take() {
Some(lang) => Some(lang),
None => Self::detect_lang(inner, self.allow_list),
None => match self.allow_list {
Some([unique_language]) => Some(*unique_language),
None if Self::detect_script(inner) == Script::Latin => None,
_otherwise => Self::detect_lang(inner, self.allow_list),
},
};

self.language
Expand Down
8 changes: 7 additions & 1 deletion charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ macro_rules! make_language {
($($language:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Language {
Zho,
$($language),+,
}
impl From<whatlang::Lang> for Language {
Expand All @@ -23,6 +24,7 @@ macro_rules! make_language {
impl From<Language> for whatlang::Lang {
fn from(other: Language) -> whatlang::Lang {
match other {
Language::Zho => whatlang::Lang::Cmn,
$(Language::$language => whatlang::Lang::$language), +,
}
}
Expand All @@ -31,12 +33,16 @@ macro_rules! make_language {
impl Language {
pub fn code(&self) -> &'static str {
match self {
Language::Zho => "zho",
$(Language::$language => whatlang::Lang::$language.code()), +,
}
}

pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
match code.as_ref() {
"zho" => Some(Language::Zho),
_ => whatlang::Lang::from_code(code.as_ref()).map(Language::from),
}
}
}
};
Expand Down
13 changes: 7 additions & 6 deletions charabia/src/normalizer/chinese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ impl CharNormalizer for ChineseNormalizer {
}

fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Cj && matches!(token.language, None | Some(Language::Cmn))
token.script == Script::Cj
&& matches!(token.language, None | Some(Language::Cmn) | Some(Language::Zho))
}
}

Expand Down Expand Up @@ -74,7 +75,7 @@ mod test {
char_end: 5,
byte_end: 15,
script: Script::Cj,
language: Some(Language::Cmn),
language: Some(Language::Zho),
..Default::default()
},
]
Expand Down Expand Up @@ -111,7 +112,7 @@ mod test {
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
language: Some(Language::Zho),
..Default::default()
},
]
Expand Down Expand Up @@ -147,7 +148,7 @@ mod test {
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
language: Some(Language::Zho),
kind: TokenKind::Word,
..Default::default()
},
Expand Down Expand Up @@ -182,7 +183,7 @@ mod test {
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
language: Some(Language::Zho),
..Default::default()
},
]
Expand Down Expand Up @@ -223,7 +224,7 @@ mod test {
byte_end: 15,
char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
script: Script::Cj,
language: Some(Language::Cmn),
language: Some(Language::Zho),
},
]
}
Expand Down
24 changes: 24 additions & 0 deletions charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,42 @@ mod test {

// Latin sample sentence fed to the segmenter tests generated at the end of
// this module. It mixes quotes, apostrophes, decimal numbers, a unit symbol,
// and camelCase / kebab-case / snake_case words.
const TEXT: &str =
"The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";

// Expected segmentation of TEXT when the `latin-camelcase` feature is enabled:
// "camelCase" is split into the two segments "camel" and "Case".
// Casing and the typographic apostrophe are kept as-is at this stage.
#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

// Expected token lemmas for TEXT when the `latin-camelcase` feature is enabled.
// Compared with SEGMENTED, the entries are lowercased and the typographic
// apostrophe "’" appears as "'"; "camelCase" still yields two tokens.
#[rustfmt::skip]
#[cfg(feature = "latin-camelcase")]
const TOKENIZED: &[&str] = &[
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

// Expected segmentation of TEXT when the `latin-camelcase` feature is disabled:
// "camelCase" stays a single segment; every other entry matches the
// feature-enabled expectation.
#[rustfmt::skip]
#[cfg(not(feature = "latin-camelcase"))]
const SEGMENTED: &[&str] = &[
"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
"'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

// Expected token lemmas for TEXT when the `latin-camelcase` feature is disabled:
// lowercased entries with "’" appearing as "'", and "camelcase" kept as a
// single token.
#[rustfmt::skip]
#[cfg(not(feature = "latin-camelcase"))]
const TOKENIZED: &[&str] = &[
"the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
" ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
"'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
"snake", "_", "case",
];

// Instantiates the shared segmenter test suite for LatinSegmenter, checking
// TEXT against whichever SEGMENTED / TOKENIZED pair the active feature set
// selects, with Latin as the expected script and English as the expected language.
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
8 changes: 5 additions & 3 deletions charabia/src/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ pub static SEGMENTERS: Lazy<SegmenterMap> = Lazy::new(|| {
// chinese segmenter
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Some(Language::Cmn)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
#[cfg(feature = "chinese-segmentation")]
((Script::Cj, Some(Language::Zho)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
// japanese segmenter
#[cfg(feature = "japanese")]
((Script::Cj, Some(Language::Jpn)), Box::new(JapaneseSegmenter) as Box<dyn Segmenter>),
Expand Down Expand Up @@ -395,7 +397,6 @@ mod test {
($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
use crate::{Token, Language, Script};
use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
use crate::tokenizer::Tokenize;
use super::*;

#[test]
Expand Down Expand Up @@ -425,7 +426,7 @@ Check if the expected Script/Language corresponds to the detected Script/Languag

#[test]
fn segment() {
let segmented_text: Vec<_> = $text.segment_str().collect();
let segmented_text: Vec<_> = $text.segment_str_with_option(None, Some(&[$language])).collect();
assert_eq!(&segmented_text[..], $segmented, r#"
Segmenter chosen by global segment() function, didn't segment the text as expected.
Expand All @@ -436,7 +437,8 @@ Check if the tested segmenter is assigned to the good Script/Language in `SEGMEN

#[test]
fn tokenize() {
let tokens: Vec<_> = $text.tokenize().collect();
let tokenizer = crate::TokenizerBuilder::default().into_tokenizer();
let tokens: Vec<_> = tokenizer.tokenize_with_allow_list($text, Some(&[$language])).collect();
let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();

assert_eq!(&tokenized_text[..], $tokenized, r#"
Expand Down

0 comments on commit 2d90e4c

Please sign in to comment.