From fc7be5277961483fc2296b63d6a451ff6016ae57 Mon Sep 17 00:00:00 2001
From: Mostafa
Date: Tue, 5 Aug 2025 23:25:28 +0800
Subject: [PATCH 1/3] fix: use AHashMap to fix compile error

---
 tokenizers/src/models/wordpiece/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index fa5c3e775..61fa44f07 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -175,7 +175,7 @@ impl WordPiece {
     pub fn read_bytes(vocab: &[u8]) -> Result<Vocab> {
         let file = BufReader::new(vocab);
 
-        let mut vocab = HashMap::new();
+        let mut vocab = AHashMap::new();
         for (index, line) in file.lines().enumerate() {
             let line = line?;
             vocab.insert(line.trim_end().to_owned(), index as u32);

From f1bd5b0a00490cc358d5adf38ca1452b918f4869 Mon Sep 17 00:00:00 2001
From: Mostafa
Date: Wed, 6 Aug 2025 17:03:40 +0800
Subject: [PATCH 2/3] feat: whitespace optimizer

---
 tokenizers/benches/whitespace_benchmark.rs  | 103 +++++++++
 tokenizers/src/pre_tokenizers/whitespace.rs | 218 ++++++++++++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 tokenizers/benches/whitespace_benchmark.rs

diff --git a/tokenizers/benches/whitespace_benchmark.rs b/tokenizers/benches/whitespace_benchmark.rs
new file mode 100644
index 000000000..86a1877b8
--- /dev/null
+++ b/tokenizers/benches/whitespace_benchmark.rs
@@ -0,0 +1,103 @@
+#[macro_use]
+extern crate criterion;
+
+use criterion::{Criterion, Throughput};
+use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
+use tokenizers::{OffsetReferential, OffsetType, PreTokenizer, PreTokenizedString};
+
+fn bench_whitespace_comparison(c: &mut Criterion) {
+    let mut group = c.benchmark_group("whitespace-pre-tokenizers");
+
+    // Test data with various characteristics
+    let test_cases = vec![
+        ("simple", "Hello world! How are you doing?"),
+        ("mixed", "This is a test with numbers 123 and symbols @#$% and unicode: café résumé"),
+        ("whitespace_heavy", "Multiple spaces\tand\nnewlines\r\nhere"),
+        ("symbol_heavy", "Hello!@#$%^&*()world?><>{}[]|\\"),
+        ("word_heavy", "This is a very long sentence with many words that should be tokenized properly"),
+        ("unicode_heavy", "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω"),
+        ("mixed_unicode", "Hello 123 αβγ !@# world δέζ ηθι"),
+    ];
+
+    for (name, text) in test_cases {
+        let data_len = text.len() as u64;
+        group.throughput(Throughput::Bytes(data_len));
+
+        // Benchmark original regex-based implementation
+        group.bench_function(&format!("{}-original", name), |b| {
+            b.iter(|| {
+                let mut pretokenized = PreTokenizedString::from(text);
+                let pretok = Whitespace {};
+                pretok.pre_tokenize(&mut pretokenized).unwrap();
+                let _result = pretokenized
+                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                    .into_iter()
+                    .map(|(s, o, _)| (s, o))
+                    .collect::<Vec<_>>();
+            })
+        });
+
+        // Benchmark optimized byte-level implementation
+        group.bench_function(&format!("{}-optimized", name), |b| {
+            b.iter(|| {
+                let mut pretokenized = PreTokenizedString::from(text);
+                let pretok = WhitespaceOptimized {};
+                pretok.pre_tokenize(&mut pretokenized).unwrap();
+                let _result = pretokenized
+                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                    .into_iter()
+                    .map(|(s, o, _)| (s, o))
+                    .collect::<Vec<_>>();
+            })
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_large_text(c: &mut Criterion) {
+    let mut group = c.benchmark_group("whitespace-large-text");
+
+    // Create a large text by repeating patterns
+    let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
+    let large_text: String = base_text.repeat(1000); // ~90KB of text
+    let data_len = large_text.len() as u64;
+
+    group.throughput(Throughput::Bytes(data_len));
+
+    group.bench_function("large-original", |b| {
+        b.iter(|| {
+            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
+            let pretok = Whitespace {};
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let _result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+        })
+    });
+
+    group.bench_function("large-optimized", |b| {
+        b.iter(|| {
+            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
+            let pretok = WhitespaceOptimized {};
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let _result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+        })
+    });
+
+    group.finish();
+}
+
+criterion_group! {
+    name = whitespace_benches;
+    config = Criterion::default().sample_size(20);
+    targets = bench_whitespace_comparison, bench_large_text
+}
+
+criterion_main!(whitespace_benches);
\ No newline at end of file
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 20cfb6519..8c486de8d 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -28,6 +28,141 @@ impl PreTokenizer for Whitespace {
     }
 }
 
+/// Optimized whitespace pre-tokenizer that uses byte-level scanning instead of regex.
+/// This provides better performance but may have slightly different behavior in edge cases
+/// compared to the regex-based implementation.
+#[derive(Clone, Debug, PartialEq, Eq)]
+#[macro_rules_attribute(impl_serde_type!)]
+pub struct WhitespaceOptimized;
+
+impl Default for WhitespaceOptimized {
+    fn default() -> Self {
+        Self
+    }
+}
+
+impl PreTokenizer for WhitespaceOptimized {
+    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
+        pretokenized.split(|_, normalized| {
+            normalized.split(Invert(WhitespacePattern), SplitDelimiterBehavior::Removed)
+        })
+    }
+}
+
+/// Custom pattern implementation for optimized whitespace splitting
+/// This implements the equivalent of the regex r"\w+|[^\w\s]+" but with manual byte scanning
+struct WhitespacePattern;
+
+impl crate::tokenizer::pattern::Pattern for WhitespacePattern {
+    fn find_matches(&self, inside: &str) -> Result<Vec<((usize, usize), bool)>> {
+        if inside.is_empty() {
+            return Ok(vec![((0, 0), false)]);
+        }
+
+        let mut matches = Vec::new();
+        let mut current_start = 0;
+        let mut current_end = 0;
+        let mut current_type = None; // None = whitespace, Some(true) = word, Some(false) = symbol
+
+        let mut i = 0;
+        while i < inside.len() {
+            let char_start = inside[i..].chars().next().unwrap();
+            let char_len = char_start.len_utf8();
+
+            let is_whitespace = char_start.is_whitespace();
+            let is_word_char = char_start.is_alphanumeric() || char_start == '_';
+            let is_symbol = !is_whitespace && !is_word_char;
+
+            match (current_type, is_whitespace, is_word_char, is_symbol) {
+                (None, true, _, _) => {
+                    // Continue in whitespace
+                    i += char_len;
+                }
+                (None, false, true, _) => {
+                    // Transition from whitespace to word
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(true);
+                    i += char_len;
+                }
+                (None, false, false, true) => {
+                    // Transition from whitespace to symbol
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(false);
+                    i += char_len;
+                }
+                (None, false, false, false) => {
+                    // This shouldn't happen since a char is either whitespace, word, or symbol
+                    // But handle it gracefully by treating as symbol
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(false);
+                    i += char_len;
+                }
+                (Some(true), true, _, _) => {
+                    // Transition from word to whitespace - finish word
+                    matches.push(((current_start, current_end), true));
+                    current_type = None;
+                    i += char_len;
+                }
+                (Some(true), false, true, _) => {
+                    // Continue in word
+                    current_end = i + char_len;
+                    i += char_len;
+                }
+                (Some(true), false, false, true) => {
+                    // Transition from word to symbol - finish word, start symbol
+                    matches.push(((current_start, current_end), true));
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(false);
+                    i += char_len;
+                }
+                (Some(true), false, false, false) => {
+                    // This shouldn't happen, but handle as symbol
+                    matches.push(((current_start, current_end), true));
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(false);
+                    i += char_len;
+                }
+                (Some(false), true, _, _) => {
+                    // Transition from symbol to whitespace - finish symbol
+                    matches.push(((current_start, current_end), true));
+                    current_type = None;
+                    i += char_len;
+                }
+                (Some(false), false, true, _) => {
+                    // Transition from symbol to word - finish symbol, start word
+                    matches.push(((current_start, current_end), true));
+                    current_start = i;
+                    current_end = i + char_len;
+                    current_type = Some(true);
+                    i += char_len;
+                }
+                (Some(false), false, false, true) => {
+                    // Continue in symbol
+                    current_end = i + char_len;
+                    i += char_len;
+                }
+                (Some(false), false, false, false) => {
+                    // This shouldn't happen, but handle as symbol
+                    current_end = i + char_len;
+                    i += char_len;
+                }
+            }
+        }
+
+        // Don't forget the last token
+        if let Some(_) = current_type {
+            matches.push(((current_start, current_end), true));
+        }
+
+        Ok(matches)
+    }
+}
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;
@@ -102,4 +237,87 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn optimized_compatibility() {
+        // Test that the optimized version produces the same results as the original
+        let test_cases = vec![
+            "Hello world!",
+            "How are you doing?",
+            "This is a test with numbers 123 and symbols @#$%",
+            "Multiple spaces",
+            "Tabs\tand\nnewlines",
+            "Unicode: café résumé naïve",
+            "Mixed: Hello123!@# world",
+            "Edge cases: a.b,c;d:e",
+            "Empty string:",
+            "Only spaces: ",
+            "Only symbols: !@#$%",
+            "Only words: hello world",
+            "Numbers: 123 456 789",
+            "Underscores: hello_world test_case",
+            "Special chars: αβγ δέζ ηθι",
+        ];
+
+        for test_case in test_cases {
+            let mut original = PreTokenizedString::from(test_case);
+            let mut optimized = PreTokenizedString::from(test_case);
+
+            let original_pretok = Whitespace {};
+            let optimized_pretok = WhitespaceOptimized {};
+
+            original_pretok.pre_tokenize(&mut original).unwrap();
+            optimized_pretok.pre_tokenize(&mut optimized).unwrap();
+
+            let original_splits = original
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            let optimized_splits = optimized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+
+            assert_eq!(
+                original_splits, optimized_splits,
+                "Mismatch for test case: '{}'",
+                test_case
+            );
+        }
+    }
+
+    #[test]
+    fn optimized_edge_cases() {
+        let pretok = WhitespaceOptimized {};
+
+        // Test various edge cases
+        let edge_cases = vec![
+            ("", vec![]),
+            (" ", vec![]),
+            ("   ", vec![]),
+            ("a", vec![("a", (0, 1))]),
+            ("!", vec![("!", (0, 1))]),
+            ("a!", vec![("a", (0, 1)), ("!", (1, 2))]),
+            ("!a", vec![("!", (0, 1)), ("a", (1, 2))]),
+            ("a b", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a  b", vec![("a", (0, 1)), ("b", (3, 4))]),
+            ("a\tb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\nb", vec![("a", (0, 1)), ("b", (2, 3))]),
+            ("a\r\nb", vec![("a", (0, 1)), ("b", (3, 4))]),
+        ];
+
+        for (input, expected) in edge_cases {
+            let mut pretokenized = PreTokenizedString::from(input);
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            let result = pretokenized
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>();
+            assert_eq!(result, expected, "Failed for input: '{}'", input);
+        }
+    }
 }

From 2ac4ff67920c5fd72583ba6126fa10736f5ed6e9 Mon Sep 17 00:00:00 2001
From: Mostafa
Date: Wed, 6 Aug 2025 17:11:46 +0800
Subject: [PATCH 3/3] chore: fix linting issues

---
 tokenizers/benches/whitespace_benchmark.rs  | 26 ++++++++++++++-------
 tokenizers/src/pre_tokenizers/whitespace.rs |  2 +-
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tokenizers/benches/whitespace_benchmark.rs b/tokenizers/benches/whitespace_benchmark.rs
index 86a1877b8..33329618a 100644
--- a/tokenizers/benches/whitespace_benchmark.rs
+++ b/tokenizers/benches/whitespace_benchmark.rs
@@ -3,7 +3,7 @@ extern crate criterion;
 
 use criterion::{Criterion, Throughput};
 use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
-use tokenizers::{OffsetReferential, OffsetType, PreTokenizer, PreTokenizedString};
+use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};
 
 fn bench_whitespace_comparison(c: &mut Criterion) {
     let mut group = c.benchmark_group("whitespace-pre-tokenizers");
@@ -11,10 +11,19 @@ fn bench_whitespace_comparison(c: &mut Criterion) {
     // Test data with various characteristics
     let test_cases = vec![
         ("simple", "Hello world! How are you doing?"),
-        ("mixed", "This is a test with numbers 123 and symbols @#$% and unicode: café résumé"),
-        ("whitespace_heavy", "Multiple spaces\tand\nnewlines\r\nhere"),
+        (
+            "mixed",
+            "This is a test with numbers 123 and symbols @#$% and unicode: café résumé",
+        ),
+        (
+            "whitespace_heavy",
+            "Multiple spaces\tand\nnewlines\r\nhere",
+        ),
         ("symbol_heavy", "Hello!@#$%^&*()world?><>{}[]|\\"),
-        ("word_heavy", "This is a very long sentence with many words that should be tokenized properly"),
+        (
+            "word_heavy",
+            "This is a very long sentence with many words that should be tokenized properly",
+        ),
         ("unicode_heavy", "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω"),
         ("mixed_unicode", "Hello 123 αβγ !@# world δέζ ηθι"),
     ];
@@ -24,7 +33,7 @@ fn bench_whitespace_comparison(c: &mut Criterion) {
         group.throughput(Throughput::Bytes(data_len));
 
         // Benchmark original regex-based implementation
-        group.bench_function(&format!("{}-original", name), |b| {
+        group.bench_function(format!("{}-original", name), |b| {
             b.iter(|| {
                 let mut pretokenized = PreTokenizedString::from(text);
                 let pretok = Whitespace {};
@@ -38,7 +47,7 @@ fn bench_whitespace_comparison(c: &mut Criterion) {
         });
 
         // Benchmark optimized byte-level implementation
-        group.bench_function(&format!("{}-optimized", name), |b| {
+        group.bench_function(format!("{}-optimized", name), |b| {
             b.iter(|| {
                 let mut pretokenized = PreTokenizedString::from(text);
                 let pretok = WhitespaceOptimized {};
@@ -59,7 +68,8 @@ fn bench_large_text(c: &mut Criterion) {
     let mut group = c.benchmark_group("whitespace-large-text");
 
     // Create a large text by repeating patterns
-    let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
+    let base_text =
+        "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
     let large_text: String = base_text.repeat(1000); // ~90KB of text
     let data_len = large_text.len() as u64;
 
@@ -100,4 +110,4 @@ criterion_group! {
     targets = bench_whitespace_comparison, bench_large_text
 }
 
-criterion_main!(whitespace_benches);
\ No newline at end of file
+criterion_main!(whitespace_benches);
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 8c486de8d..01662ee58 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -155,7 +155,7 @@ impl crate::tokenizer::pattern::Pattern for WhitespacePattern {
         }
 
         // Don't forget the last token
-        if let Some(_) = current_type {
+        if current_type.is_some() {
             matches.push(((current_start, current_end), true));
         }
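
Reviewer note (editorial, not part of the patch series): below is a minimal usage
sketch showing how the new pre-tokenizer is meant to slot in next to the
regex-based one, mirroring the benchmark code above. It assumes the three patches
are applied to the `tokenizers` crate; the `collect_splits` helper is
hypothetical, introduced only for this example.

    use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
    use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

    // Hypothetical helper for this sketch: run any pre-tokenizer and collect
    // (substring, byte-offset) pairs, exactly as the benchmarks above do.
    fn collect_splits<P: PreTokenizer>(
        pretok: &P,
        text: &str,
    ) -> tokenizers::Result<Vec<(String, (usize, usize))>> {
        let mut pretokenized = PreTokenizedString::from(text);
        pretok.pre_tokenize(&mut pretokenized)?;
        Ok(pretokenized
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(s, o, _)| (s.to_owned(), o))
            .collect())
    }

    fn main() -> tokenizers::Result<()> {
        let text = "Hello world! café 123";

        // The optimized pre-tokenizer replaces the regex r"\w+|[^\w\s]+" with a
        // single-pass byte-level state machine; both should yield identical splits.
        let original = collect_splits(&Whitespace {}, text)?;
        let optimized = collect_splits(&WhitespaceOptimized {}, text)?;
        assert_eq!(original, optimized);
        println!("{optimized:?}");
        Ok(())
    }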