Skip to content

Commit

Permalink
Count word chars early
Browse files Browse the repository at this point in the history
Avoid calling the iterator repeatedly
  • Loading branch information
ZJaume committed Sep 1, 2024
1 parent f9f10e1 commit 92533cb
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,4 @@ Speed benchmarks with 100k random sentences from [OpenLID](https://github.com/la
| lingua all high preloaded | 56.29 |
| lingua all low preloaded | 23.34 |
| fasttext openlid193 | 8.44 |
| heliport | 4.72 |
| heliport | 2.33 |
18 changes: 12 additions & 6 deletions src/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ impl Identifier {
let mut last_was_space = false;
let mut cjk_num_chars = 0_usize;
let mut mystery_text = String::with_capacity(replaced.len());
let mut mystery_length = 0;

for mystery_char in replaced.chars() {
let charset = match unicode_blocks::find_unicode_block(mystery_char) {
Expand All @@ -147,6 +148,9 @@ impl Identifier {
last_was_space = mystery_char == ' ';
last_was_cjk = false;
}
if !last_was_space {
mystery_length += 1;
}
mystery_text.push(mystery_char);
}

Expand All @@ -170,11 +174,9 @@ impl Identifier {

let mut word_scored;
let mut num_words = 0;
let mut mystery_length = 0;
for word in words {
debug!("Scoring '{}'", word);
num_words += 1;
mystery_length += word.chars().count(); //TODO move this to the cjk count above? .chars() iterator is expensive
self.word_scores.reset();
word_scored = self.score_gram(word, 0);

Expand Down Expand Up @@ -222,14 +224,18 @@ impl Identifier {
// Normalize lang points and apply penalties if more than 50% is CJK
//TODO try to simplify this
// the CJK fix could just finish early?
let cjk_pct;
if mystery_length == 0 {
cjk_pct = 0;
} else {
cjk_pct = 100 / mystery_length * cjk_num_chars
}
for lang in Lang::iter() {
let lang_score_norm = self.lang_points.get(lang) / num_words as f32;
self.lang_points.insert(lang, lang_score_norm);

if (100 / mystery_length * cjk_num_chars) > 50 {
if !lang.is_cjk() {
self.lang_points.insert(lang, Self::PENALTY_VALUE + 1.0);
}
if cjk_pct > 50 && !lang.is_cjk() {
self.lang_points.insert(lang, Self::PENALTY_VALUE + 1.0);
}
}
debug!("Normalized lang points: {:?}", self.lang_points);
Expand Down

0 comments on commit 92533cb

Please sign in to comment.