From 9a1cba6a83fcd048ffd0c91ee6320cc9a9136d04 Mon Sep 17 00:00:00 2001 From: Lukas Kalbertodt Date: Wed, 26 Jun 2024 11:46:33 +0200 Subject: [PATCH 01/25] Add null byte as hard context separator This allows one to use \0 as artificial separator, for example when concatting lots of small strings into a large string. See this discussion for context: https://github.com/orgs/meilisearch/discussions/744 --- charabia/src/separators.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/charabia/src/separators.rs b/charabia/src/separators.rs index 60a00483..1ca63860 100644 --- a/charabia/src/separators.rs +++ b/charabia/src/separators.rs @@ -11,11 +11,11 @@ /// - Zl Line Separator /// - Zp Paragraph Separator /// - Zs Space Separator -/// plus ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators +/// plus "\0", ". ", ", " and ។ល។" (៘ decomposition) to categorize them as hard separators /// and "`" to understand markdown formatted text #[rustfmt::skip] pub const DEFAULT_SEPARATORS: &[&str] = &[ - ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–", + "\0", ". ", ", ", "_", "‿", "⁀", "⁔", "︳", "︴", "﹍", "﹎", "﹏", "_", "-", "֊", "־", "᐀", "᠆", "‐", "‒", "–", "—", "―", "⸗", "⸚", "⸺", "⸻", "⹀", "〜", "〰", "゠", "︱", "︲", "﹘", "﹣", "-", "𐺭", ")", "]", "}", "༻", "༽", "᚜", "⁆", "⁾", "₎", "⌉", "⌋", "〉", "❩", "❫", "❭", "❯", "❱", "❳", "❵", "⟆", "⟧", "⟩", "⟫", "⟭", "⟯", "⦄", "⦆", "⦈", "⦊", "⦌", "⦎", "⦐", "⦒", "⦔", "⦖", "⦘", "⧙", "⧛", "⧽", @@ -64,6 +64,7 @@ pub const DEFAULT_SEPARATORS: &[&str] = &[ #[rustfmt::skip] pub const CONTEXT_SEPARATORS: &[&str] = &[ + "\0", // Null byte, can be used as artificial separator "᠆", // Mongolian Todo Soft Hyphen, mark the end of a paragraph. "᚛", "᚜", // Oghams, mark start and end of text "!", ". ", ", ", ";", "?", "¡", "§", "¶", "¿", ";", // Latin From b28ac5c3810acbc63b7e089212023863aeb1793d Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 14:42:07 +0200 Subject: [PATCH 02/25] update dependencies --- charabia/Cargo.toml | 12 ++++++------ irg-kvariants/Cargo.toml | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 15e01d5a..05b437b1 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -12,11 +12,11 @@ categories = ["text-processing"] exclude = ["dictionaries/txt/thai/words.txt"] [dependencies] -aho-corasick = "1.1.2" +aho-corasick = "1.1.3" cow-utils = "0.1" csv = "1.3.0" -deunicode = "1.4.2" -either = "1.9.0" +deunicode = "1.6.0" +either = "1.13.0" finl_unicode = { version= "1.2.0", optional = true } fst = "0.4" jieba-rs = { version = "0.6", optional = true } @@ -29,10 +29,10 @@ pinyin = { version = "0.10", default-features = false, features = [ "with_tone", ], optional = true } wana_kana = { version = "3.0.0", optional = true } -unicode-normalization = "0.1.22" +unicode-normalization = "0.1.23" irg-kvariants = "0.1.0" -litemap = "0.7.2" -zerovec = "0.10.1" +litemap = "0.7.3" +zerovec = "0.10.4" [features] default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] diff --git a/irg-kvariants/Cargo.toml b/irg-kvariants/Cargo.toml index 08ae6121..cb1bc69a 100644 --- a/irg-kvariants/Cargo.toml +++ b/irg-kvariants/Cargo.toml @@ -11,8 +11,8 @@ repository = "https://github.com/meilisearch/charabia" [dependencies] csv = "1.3.0" once_cell = "1.19.0" -serde = { version = "1.0.196", features = ["derive"] } +serde = { version = "1.0.203", features = ["derive"] } [build-dependencies] csv = "1.3.0" -serde = { version = "1.0.196", features = ["derive"] } +serde = { version = "1.0.203", features = ["derive"] } From c5ca9206dbff984ec71d4f3f3597e4e6f90e2150 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 14:47:17 +0200 Subject: [PATCH 03/25] update lindera --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 05b437b1..3573efa2 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -24,7 +24,7 @@ once_cell = "1.19.0" serde = "1.0" slice-group-by = "0.3.1" whatlang = "0.16.4" -lindera = { version = "=0.31.0", default-features = false, optional = true } +lindera = { version = "=0.32.2", default-features = false, optional = true } pinyin = { version = "0.10", default-features = false, features = [ "with_tone", ], optional = true } From 3e188098d364fd6596c7c121657a0915c116a057 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 14:51:32 +0200 Subject: [PATCH 04/25] update jieba --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 3573efa2..63d7c679 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -19,7 +19,7 @@ deunicode = "1.6.0" either = "1.13.0" finl_unicode = { version= "1.2.0", optional = true } fst = "0.4" -jieba-rs = { version = "0.6", optional = true } +jieba-rs = { version = "0.7", optional = true } once_cell = "1.19.0" serde = "1.0" slice-group-by = "0.3.1" From 99e8b38840201a813b631f415a86be9d9ed3f739 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 14:53:53 +0200 Subject: [PATCH 05/25] prepare for release --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 63d7c679..2eace07f 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "charabia" -version = "0.8.11" +version = "0.8.12" license = "MIT" authors = ["Many "] edition = "2021" From b6a3943b291386bbc4326937610e2c1d44c84aa8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 15:40:07 +0200 Subject: [PATCH 06/25] update internal dependencies for release --- charabia/Cargo.toml | 2 +- irg-kvariants/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 2eace07f..b7a31676 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -30,7 +30,7 @@ pinyin = { version = "0.10", default-features = false, features = [ ], optional = true } wana_kana = { version = "3.0.0", optional = true } unicode-normalization = "0.1.23" -irg-kvariants = "0.1.0" +irg-kvariants = "0.1.1" litemap = "0.7.3" zerovec = "0.10.4" diff --git a/irg-kvariants/Cargo.toml b/irg-kvariants/Cargo.toml index cb1bc69a..f32d005f 100644 --- a/irg-kvariants/Cargo.toml +++ b/irg-kvariants/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "irg-kvariants" -version = "0.1.0" +version = "0.1.1" edition = "2021" license = "MIT" description = "A simple wrapper around kvariant from hfhchan/irg" From cf3916a40c4d86de5695bf7d56589420270dcd92 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 15:45:35 +0200 Subject: [PATCH 07/25] use local dependencies when doing local testing --- charabia/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index b7a31676..6086c9ce 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -30,7 +30,7 @@ pinyin = { version = "0.10", default-features = false, features = [ ], optional = true } wana_kana = { version = "3.0.0", optional = true } unicode-normalization = "0.1.23" -irg-kvariants = "0.1.1" +irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" } litemap = "0.7.3" zerovec = "0.10.4" From 28d1305da7f3b5e0831514ea8ef3a3b9ea680209 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 10 Jul 2024 08:36:32 +0200 Subject: [PATCH 08/25] Simplify lang detection --- charabia/Cargo.toml | 6 +----- charabia/src/detection/mod.rs | 17 ++++++----------- charabia/src/segmenter/mod.rs | 4 +++- charabia/src/tokenizer.rs | 5 ++--- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 6086c9ce..69562d5b 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -13,15 +13,13 @@ exclude = ["dictionaries/txt/thai/words.txt"] [dependencies] aho-corasick = "1.1.3" -cow-utils = "0.1" csv = "1.3.0" -deunicode = "1.6.0" either = "1.13.0" finl_unicode = { version= "1.2.0", optional = true } fst = "0.4" jieba-rs = { version = "0.7", optional = true } once_cell = "1.19.0" -serde = "1.0" +serde = "1.0.192" slice-group-by = "0.3.1" whatlang = "0.16.4" lindera = { version = "=0.32.2", default-features = false, optional = true } @@ -31,8 +29,6 @@ pinyin = { version = "0.10", default-features = false, features = [ wana_kana = { version = "3.0.0", optional = true } unicode-normalization = "0.1.23" irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" } -litemap = "0.7.3" -zerovec = "0.10.4" [features] default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"] diff --git a/charabia/src/detection/mod.rs b/charabia/src/detection/mod.rs index 0c5781b0..f965be83 100644 --- a/charabia/src/detection/mod.rs +++ b/charabia/src/detection/mod.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - pub use script_language::{Language, Script}; use whatlang::Detector; @@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> { inner: &'o str, pub script: Option