
Disable pinyin normalizer
Fix lindera UniDic download error

Support traditional_to_simplified

Update custom dict
lzw65 committed Jan 16, 2024
1 parent 5f8abfe commit 899655d
Showing 6 changed files with 140,048 additions and 88 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ charabia/target
 /data.ms
 Cargo.lock
 .idea
+.hypothesis
12 changes: 5 additions & 7 deletions charabia/Cargo.toml
@@ -24,12 +24,10 @@ once_cell = "1.17.1"
 serde = "1.0"
 slice-group-by = "0.3.0"
 whatlang = "0.16.2"
-lindera-core = "=0.25.0"
-lindera-dictionary = "=0.25.0"
-lindera-tokenizer = { version = "=0.25.0", default-features = false, optional = true }
-pinyin = { version = "0.9", default-features = false, features = [
-  "with_tone",
-], optional = true }
+lindera-core = "=0.27.1"
+lindera-dictionary = "=0.27.1"
+lindera-tokenizer = { version = "=0.27.1", default-features = false, optional = true }
+character_converter = { version = "2.1.0", optional = true }
 wana_kana = { version = "3.0.0", optional = true }
 unicode-normalization = "0.1.22"
 irg-kvariants = "0.1.0"
@@ -38,7 +36,7 @@ irg-kvariants = "0.1.0"
 default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase"]
 
 # allow chinese specialized tokenization
-chinese = ["dep:pinyin", "dep:jieba-rs"]
+chinese = ["dep:character_converter", "dep:jieba-rs"]
 
 # allow hebrew specialized tokenization
 hebrew = []
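
The chinese feature now pulls in character_converter in place of pinyin. As a minimal sketch of the conversion the reworked normalizer relies on, assuming only the character_converter 2.1.0 function imported in the chinese.rs diff below (this snippet is illustrative and not part of the commit):

// Illustrative only: Traditional characters are mapped to their Simplified
// forms instead of being transliterated to toned pinyin.
use character_converter::traditional_to_simplified;

fn main() {
    // "尊嚴" (Traditional) is expected to come back as "尊严" (Simplified).
    let simplified = traditional_to_simplified("尊嚴").to_string();
    println!("{simplified}");
}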
65 changes: 19 additions & 46 deletions charabia/src/normalizer/chinese.rs
@@ -1,4 +1,4 @@
-use pinyin::ToPinyin;
+use character_converter::traditional_to_simplified;
 
 use super::CharNormalizer;
 use crate::detection::{Language, Script};
@@ -23,14 +23,15 @@ impl CharNormalizer for ChineseNormalizer {
         // Normalize to Pinyin
         // If we don't manage to convert the kvariant, we try to convert the original character.
         // If none of them are converted, we return the kvariant.
-        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
-            Some(converted) => {
-                let with_tone = converted.with_tone();
+        Some(traditional_to_simplified(kvariant.to_string().as_str()).to_string().into())
+        // match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
+        //     Some(converted) => {
+        //         let with_tone = converted.with_tone();
 
-                Some(with_tone.to_string().into())
-            }
-            None => Some(kvariant.into()), // e.g. 杤
-        }
+        //         Some(with_tone.to_string().into())
+        //     }
+        //     None => Some(kvariant.into()), // e.g. 杤
+        // }
     }
 
     fn should_normalize(&self, token: &Token) -> bool {
@@ -65,14 +66,6 @@ mod test {
                 language: Some(Language::Cmn),
                 ..Default::default()
             },
-            Token {
-                lemma: Owned("澚䀾亚㮺刄杤".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                ..Default::default()
-            },
         ]
     }
 
@@ -81,70 +74,50 @@ mod test {
         vec![
             Token {
                 // lowercased
-                lemma: Owned("zūnyán".to_string()),
+                lemma: Owned("尊严".to_string()),
                 char_end: 2,
                 byte_end: 6,
-                char_map: Some(vec![(3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 ..Default::default()
             },
             Token {
                 // lowercased
-                lemma: Owned("shēngérzìyóu".to_string()),
+                lemma: Owned("生而自由".to_string()),
                 char_end: 4,
                 byte_end: 12,
-                char_map: Some(vec![(3, 6), (3, 3), (3, 3), (3, 4)]),
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                ..Default::default()
-            },
-            Token {
-                // It would be "yudǔyàběnrèn" without the kvariant normalization.
-                lemma: Owned("àoqìyàběnrènwàn".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 ..Default::default()
-            },
+            }
         ]
     }
 
     // expected result of the complete Normalizer pieline.
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
             Token {
-                lemma: Owned("zūnyán".to_string()),
+                lemma: Owned("尊严".to_string()),
                 char_end: 2,
                 byte_end: 6,
-                char_map: Some(vec![(3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 kind: TokenKind::Word,
                 ..Default::default()
             },
             Token {
-                lemma: Owned("shēngérzìyóu".to_string()),
+                lemma: Owned("生而自由".to_string()),
                 char_end: 4,
                 byte_end: 12,
-                char_map: Some(vec![(3, 6), (3, 3), (3, 3), (3, 4)]),
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                kind: TokenKind::Word,
-                ..Default::default()
-            },
-            Token {
-                lemma: Owned("àoqìyàběnrènwàn".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 kind: TokenKind::Word,
                 ..Default::default()
-            },
+            }
         ]
     }
 
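
A note on the expected char_map values above: each pair records the byte length of a source character and the byte length of its normalized output. With pinyin output those lengths varied, while a simplified character stays at 3 bytes in UTF-8, which is why the expectations move from pairs like (3, 4) to a uniform (3, 3). A small illustrative check (not part of the commit):

// Illustrative only: UTF-8 byte lengths behind the (3, 4) -> (3, 3) change.
fn main() {
    assert_eq!("尊".len(), 3); // source character: 3 bytes
    assert_eq!("zūn".len(), 4); // old pinyin output for 尊: 4 bytes -> (3, 4)
    assert_eq!("严".len(), 3); // new simplified output for 嚴: 3 bytes -> (3, 3)
}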
12 changes: 6 additions & 6 deletions charabia/src/normalizer/control_char.rs
@@ -106,16 +106,16 @@ mod test {
             Token {
-                lemma: Owned("shēngérzìyóuoo".to_string()),
+                lemma: Owned("生而自由oo".to_string()),
                 char_end: 9,
                 byte_end: 17,
                 script: Script::Cj,
                 char_map: Some(vec![
                     (1, 0),
-                    (3, 6),
                     (3, 3),
                     (3, 3),
-                    (3, 4),
+                    (3, 3),
+                    (3, 3),
                     (1, 0),
                     (1, 1),
                     (1, 1),
@@ -125,16 +125,16 @@ mod test {
                 ..Default::default()
             },
             Token {
-                lemma: Owned("shēngérzìyóuoo".to_string()),
+                lemma: Owned("生而自由oo".to_string()),
                 char_end: 9,
                 byte_end: 17,
                 script: Script::Cj,
                 char_map: Some(vec![
                     (1, 0),
-                    (3, 6),
                     (3, 3),
                     (3, 3),
-                    (3, 4),
+                    (3, 3),
+                    (3, 3),
                     (1, 0),
                     (1, 1),
                     (1, 1),
93 changes: 64 additions & 29 deletions charabia/src/segmenter/chinese.rs
@@ -1,5 +1,8 @@
 use jieba_rs::Jieba;
 use once_cell::sync::Lazy;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::path::Path;
 
 use crate::segmenter::Segmenter;
 
@@ -17,7 +20,39 @@ impl Segmenter for ChineseSegmenter {
     }
 }
 
-static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
+fn read_lines<P>(filename: P) -> Vec<String>
+where
+    P: AsRef<Path>,
+{
+    let path = filename.as_ref();
+    if !path.exists() {
+        println!("****");
+        return vec![];
+    }
+
+    if let Ok(file) = File::open(&path) {
+        let reader = io::BufReader::new(file);
+        let mut lines = Vec::new();
+
+        for line in reader.lines() {
+            if let Ok(line) = line {
+                lines.push(line);
+            }
+        }
+
+        return lines;
+    }
+    return vec![];
+}
+
+static JIEBA: Lazy<Jieba> = Lazy::new(|| {
+    let mut jieba = Jieba::new();
+    let lines = read_lines("./words.txt");
+    for line in lines {
+        jieba.add_word(line.as_str(), Some(99 as usize), None);
+    }
+    jieba
+});
 
 #[cfg(test)]
 mod test {
@@ -65,37 +100,37 @@ mod test {
 
     // Segmented and normalized version of the text.
     const TOKENIZED: &[&str] = &[
-        "rénrén",
-        "shēngérzìyóu",
+        "人人",
+        "生而自由",
         ",",
-        "zài",
-        "zūn",
-        "yán",
-        "hé",
-        "quán",
-        "lì",
-        "shàng",
-        "yīlǜpíngděng",
+        "在",
+        "尊",
+        "严",
+        "和",
+        "权",
+        "利",
+        "上",
+        "一律平等",
         "。",
-        "tā",
-        "men",
-        "fù",
-        "yǒu",
-        "lǐxìng",
-        "hé",
-        "liángxīn",
+        "他",
+        "们",
+        "赋",
+        "有",
+        "理性",
+        "和",
+        "良心",
         ",",
-        "bìng",
-        "yīng",
-        "yǐ",
-        "xiōngdì",
-        "guān",
-        "xì",
-        "de",
-        "jīngshén",
-        "hùxiāng",
-        "duì",
-        "dài",
+        "并",
+        "应",
+        "以",
+        "兄弟",
+        "关",
+        "系",
+        "的",
+        "精神",
+        "互相",
+        "对",
+        "待",
         "。",
     ];
 
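
Note that the token boundaries in TOKENIZED do not change; only the normalized form of each segment moves from pinyin to simplified characters. The custom dictionary loaded into the JIEBA static above affects segmentation only for entries that actually appear in words.txt, whose contents are not shown in this diff. A minimal sketch of how such an entry takes effect, using example words rather than the real dictionary (illustrative, not part of the commit):

// Illustrative sketch of jieba_rs custom-word loading, mirroring the JIEBA
// static above. The registered words here are assumptions, not the real
// contents of words.txt.
use jieba_rs::Jieba;

fn main() {
    let mut jieba = Jieba::new();
    for word in ["生而自由", "一律平等"] {
        // Same call the segmenter uses: a high frequency makes jieba prefer
        // keeping the whole entry as one segment.
        jieba.add_word(word, Some(99), None);
    }
    let tokens = jieba.cut("人人生而自由，在尊严和权利上一律平等。", false);
    println!("{:?}", tokens);
}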
