From 4d151ad25ff116cb47b83a5a83610a81ec14e114 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Wed, 7 Aug 2024 11:02:06 +0800
Subject: [PATCH] perf(bpe): compress the vocab content to reduce space usage
 and improve locality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 .github/workflows/build.yml |  4 ++
 .gitignore                  |  1 +
 Cargo.toml                  |  1 +
 src/bpe/mod.rs              | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
 4 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 34aa06b..518515a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,6 +29,10 @@ jobs:
       - name: Check format
         run: cargo fmt --check
 
+      - name: Download tokenizer.model
+        run: wget -O tokenizer.model https://huggingface.co/TinyLlama/TinyLlama_v1.1/resolve/main/tokenizer.model?download=true
+        # run on windows: wget -Uri https://huggingface.co/TinyLlama/TinyLlama_v1.1/resolve/main/tokenizer.model?download=true -OutFile tokenizer.model
+
       - name: Run test
         run: cargo test
 
diff --git a/.gitignore b/.gitignore
index 4fffb2f..f88cdc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/tokenizer.model
diff --git a/Cargo.toml b/Cargo.toml
index 5408aef..d027341 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,4 @@ authors = ["YdrMaster "]
 
 [dependencies]
 regex = "1.10"
+memchr = "2.7"
diff --git a/src/bpe/mod.rs b/src/bpe/mod.rs
index e3b13df..d5f2316 100644
--- a/src/bpe/mod.rs
+++ b/src/bpe/mod.rs
@@ -1,6 +1,6 @@
 mod algorithm;
 
-use crate::{as_byte_token, utok, Method};
+use crate::{utok, Method};
 use std::{
     collections::{HashMap, HashSet},
     iter::zip,
@@ -82,7 +82,7 @@ impl Bpe {
             }
         });
         // Build the tokenizer
-        Self::new(vocabs, scores, is_byte, 0, offsets.len())
+        Self::new(vocabs, scores, is_byte, 0)
     }
 
     pub fn new<'a>(
@@ -90,33 +90,50 @@ impl Bpe {
         scores: impl IntoIterator<Item = f32>,
         is_byte: impl IntoIterator<Item = bool>,
         unk: utok,
-        vocab_size_hint: usize,
     ) -> Self {
-        let mut text_buf = Vec::with_capacity(vocab_size_hint * 4);
         let mut bytes = Box::new([unk; 256]);
-        // Reorganize the vocab,
-        // separating the strings' contents from their metadata:
-        // all contents go into text_buf for cache friendliness,
-        // and each string's offset into text_buf plus its length go into meta.
-        let meta = vocabs
-            .into_iter()
-            .map(str::as_bytes)
-            .zip(is_byte)
+        let mut total_len = 0;
+        // Collect the vocab contents and the byte tokens, accumulating the total content length
+        let vocabs = zip(vocabs, is_byte)
             .enumerate()
-            .map(|(t, (piece, is_byte))| {
-                let off = text_buf.len();
-                let len = if is_byte {
-                    let b = as_byte_token(piece).unwrap();
-                    text_buf.push(b);
-                    bytes[b as usize] = t as utok;
-                    1
+            .map(|(i, (piece, is_byte))| {
+                let piece = if is_byte {
+                    static BYTES: [u8; 256] = {
+                        let mut bytes = [0u8; 256];
+                        let mut i = 0usize;
+                        while i < 256 {
+                            bytes[i] = i as _;
+                            i += 1;
+                        }
+                        bytes
+                    };
+
+                    let b = crate::as_byte_token(piece.as_bytes()).unwrap() as usize;
+                    bytes[b] = i as utok;
+                    std::slice::from_ref(&BYTES[b])
                 } else {
-                    text_buf.extend_from_slice(piece);
-                    piece.len()
+                    piece.as_bytes()
                 };
-                (off, len)
+                total_len += piece.len();
+                piece
             })
             .collect::<Vec<_>>();
+        // Build the content cache
+        let mut meta = vec![(0usize, 0usize); vocabs.len()];
+        let mut text_buf = Vec::<u8>::with_capacity(total_len);
+        let mut indices = (0..vocabs.len()).collect::<Vec<_>>();
+        // Sort the pieces by content length, longest first: a shorter piece may be a substring of a longer one, so identical content need not be stored twice
+        indices.sort_unstable_by_key(|&i| -(vocabs[i].len() as isize));
+        for i in indices {
+            let v = vocabs[i];
+            // Search the cache for this content; reuse it if found, otherwise append the new content
+            let off = memchr::memmem::find(&text_buf, v).unwrap_or_else(|| {
+                let off = text_buf.len();
+                text_buf.extend(v);
+                off
+            });
+            meta[i] = (off, v.len());
+        }
         // Pin the string contents in place to make the self-references safe
         let _vocab = unsafe { Pin::new_unchecked(text_buf.into_boxed_slice()) };
         // Re-rank the scores, converting them to integers
@@ -142,6 +159,12 @@ impl Bpe {
             .collect::<Vec<_>>();
         sorted_pieces.sort_unstable_by_key(|&i| &*tokens[i as usize]);
 
+        // println!(
+        //     "Building BPE vocab, detected {} tokens, compressed to {} bytes from {total_len} bytes",
+        //     tokens.len(),
+        //     _vocab.len(),
+        // );
+
         Self {
             _vocab,
             tokens,
@@ -251,3 +274,11 @@ fn rank(scores: &[f32]) -> Vec<u32> {
     scores.iter().map(|f| map[&FloatOrd(*f)]).collect()
 }
+
+#[test]
+fn test() {
+    if let Ok(buf) = std::fs::read("tokenizer.model") {
+        let bpe = Bpe::from_tokenizer_model(&buf);
+        let _inaccessible = bpe.inaccessible();
+    }
+}
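
Note on the technique: the patch packs every vocab piece into one shared
buffer and reuses substring matches, so repeated content is stored only
once and lookups stay cache-friendly. Below is a minimal, self-contained
sketch of the same idea, assuming only the memchr crate; `compress` is a
hypothetical helper for illustration, not this crate's API:

// Packs `pieces` into one shared buffer, returning the buffer plus an
// `(offset, len)` pair for each piece, in the original order.
fn compress(pieces: &[&[u8]]) -> (Vec<u8>, Vec<(usize, usize)>) {
    let mut meta = vec![(0usize, 0usize); pieces.len()];
    let mut buf = Vec::<u8>::new();
    // Visit the longest pieces first: a shorter piece can only be found
    // inside the buffer if a longer piece containing it is already there.
    let mut indices = (0..pieces.len()).collect::<Vec<_>>();
    indices.sort_unstable_by_key(|&i| std::cmp::Reverse(pieces[i].len()));
    for i in indices {
        let p = pieces[i];
        // Reuse an existing occurrence, or append the new content.
        let off = memchr::memmem::find(&buf, p).unwrap_or_else(|| {
            let off = buf.len();
            buf.extend_from_slice(p);
            off
        });
        meta[i] = (off, p.len());
    }
    (buf, meta)
}

fn main() {
    let pieces: &[&[u8]] = &[b"low", b"lowest", b"est", b"low"];
    let (buf, meta) = compress(pieces);
    // "lowest" is stored once; "low", "est" and the repeated "low" all
    // resolve to substrings of it, so the buffer holds just 6 bytes.
    assert_eq!(buf, b"lowest");
    for (p, &(off, len)) in pieces.iter().zip(&meta) {
        assert_eq!(&buf[off..off + len], *p);
    }
    println!("packed {} pieces into {} bytes", pieces.len(), buf.len());
}

One caveat of the greedy pass: it only deduplicates a piece that occurs
entirely inside already-stored content; merging partial prefix/suffix
overlaps would be a shortest-common-superstring problem, which the patch
does not attempt.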
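Note on the byte-token path: a byte token's textual form (e.g. "<0xE4>")
is replaced by its actual one-byte content, borrowed from a table of all
256 byte values built at compile time, so no per-token allocation is
needed. A standalone sketch of that trick; `byte_piece` is a hypothetical
helper for illustration, not this crate's API:

// A table of all 256 byte values. A `static` gives it a fixed address,
// so borrowing an element yields a `&'static` reference.
static BYTES: [u8; 256] = {
    let mut bytes = [0u8; 256];
    let mut i = 0;
    while i < 256 {
        bytes[i] = i as u8;
        i += 1;
    }
    bytes
};

/// Views byte value `b` as a one-byte slice without allocating.
fn byte_piece(b: u8) -> &'static [u8] {
    std::slice::from_ref(&BYTES[b as usize])
}

fn main() {
    assert_eq!(byte_piece(b'A'), b"A".as_slice());
    assert_eq!(byte_piece(0xE4)[0], 0xE4);
    println!("byte pieces resolve without allocation");
}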