From 4d151ad25ff116cb47b83a5a83610a81ec14e114 Mon Sep 17 00:00:00 2001
From: YdrMaster
Date: Wed, 7 Aug 2024 11:02:06 +0800
Subject: [PATCH] perf(bpe): compress the vocab content to reduce space usage
 and improve locality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster
---
 .github/workflows/build.yml |  4 ++
 .gitignore                  |  1 +
 Cargo.toml                  |  1 +
 src/bpe/mod.rs              | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
 4 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 34aa06b..518515a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,6 +29,10 @@ jobs:
       - name: Check format
         run: cargo fmt --check
 
+      - name: Download tokenizer.model
+        run: wget -O tokenizer.model https://huggingface.co/TinyLlama/TinyLlama_v1.1/resolve/main/tokenizer.model?download=true
+        # run on windows: wget -Uri https://huggingface.co/TinyLlama/TinyLlama_v1.1/resolve/main/tokenizer.model?download=true -OutFile tokenizer.model
+
       - name: Run test
         run: cargo test
 
diff --git a/.gitignore b/.gitignore
index 4fffb2f..f88cdc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/tokenizer.model
diff --git a/Cargo.toml b/Cargo.toml
index 5408aef..d027341 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,4 @@ authors = ["YdrMaster "]
 
 [dependencies]
 regex = "1.10"
+memchr = "2.7"
diff --git a/src/bpe/mod.rs b/src/bpe/mod.rs
index e3b13df..d5f2316 100644
--- a/src/bpe/mod.rs
+++ b/src/bpe/mod.rs
@@ -1,6 +1,6 @@
 mod algorithm;
 
-use crate::{as_byte_token, utok, Method};
+use crate::{utok, Method};
 use std::{
     collections::{HashMap, HashSet},
     iter::zip,
@@ -82,7 +82,7 @@ impl Bpe {
             }
         });
         // Build the tokenizer
-        Self::new(vocabs, scores, is_byte, 0, offsets.len())
+        Self::new(vocabs, scores, is_byte, 0)
     }
 
     pub fn new<'a>(
@@ -90,33 +90,50 @@ impl Bpe {
         scores: impl IntoIterator<Item = f32>,
         is_byte: impl IntoIterator<Item = bool>,
         unk: utok,
-        vocab_size_hint: usize,
     ) -> Self {
-        let mut text_buf = Vec::with_capacity(vocab_size_hint * 4);
         let mut bytes = Box::new([unk; 256]);
-        // Reorganize the vocab,
-        // separating the strings' contents from their metadata:
-        // all contents go into text_buf for cache friendliness,
-        // and each string's offset into text_buf plus its length go into meta.
-        let meta = vocabs
-            .into_iter()
-            .map(str::as_bytes)
-            .zip(is_byte)
+        let mut total_len = 0;
+        // Collect the vocab contents and the byte tokens, accumulating the total content length
+        let vocabs = zip(vocabs, is_byte)
             .enumerate()
-            .map(|(t, (piece, is_byte))| {
-                let off = text_buf.len();
-                let len = if is_byte {
-                    let b = as_byte_token(piece).unwrap();
-                    text_buf.push(b);
-                    bytes[b as usize] = t as utok;
-                    1
+            .map(|(i, (piece, is_byte))| {
+                let piece = if is_byte {
+                    static BYTES: [u8; 256] = {
+                        let mut bytes = [0u8; 256];
+                        let mut i = 0usize;
+                        while i < 256 {
+                            bytes[i] = i as _;
+                            i += 1;
+                        }
+                        bytes
+                    };
+
+                    let b = crate::as_byte_token(piece.as_bytes()).unwrap() as usize;
+                    bytes[b] = i as utok;
+                    std::slice::from_ref(&BYTES[b])
                 } else {
-                    text_buf.extend_from_slice(piece);
-                    piece.len()
+                    piece.as_bytes()
                 };
-                (off, len)
+                total_len += piece.len();
+                piece
             })
             .collect::<Vec<_>>();
+        // Build the content cache
+        let mut meta = vec![(0usize, 0usize); vocabs.len()];
+        let mut text_buf = Vec::<u8>::with_capacity(total_len);
+        let mut indices = (0..vocabs.len()).collect::<Vec<_>>();
+        // Sort the pieces by content length, longest first: a shorter piece may be a substring of a longer one, so identical content need not be stored twice
+        indices.sort_unstable_by_key(|&i| -(vocabs[i].len() as isize));
+        for i in indices {
+            let v = vocabs[i];
+            // Search the cache for this content; reuse it if found, otherwise append the new content
+            let off = memchr::memmem::find(&text_buf, v).unwrap_or_else(|| {
+                let off = text_buf.len();
+                text_buf.extend(v);
+                off
+            });
+            meta[i] = (off, v.len());
+        }
         // Pin the string contents in place to make the self-references safe
         let _vocab = unsafe { Pin::new_unchecked(text_buf.into_boxed_slice()) };
         // Re-rank the scores, converting them to integers
@@ -142,6 +159,12 @@ impl Bpe {
             .collect::<Vec<_>>();
         sorted_pieces.sort_unstable_by_key(|&i| &*tokens[i as usize]);
 
+        // println!(
+        //     "Building BPE vocab, detected {} tokens, compressed to {} bytes from {total_len} bytes",
+        //     tokens.len(),
+        //     _vocab.len(),
+        // );
+
         Self {
             _vocab,
             tokens,
@@ -251,3 +274,11 @@ fn rank(scores: &[f32]) -> Vec<u32> {
     scores.iter().map(|f| map[&FloatOrd(*f)]).collect()
 }
+
+#[test]
+fn test() {
+    if let Ok(buf) = std::fs::read("tokenizer.model") {
+        let bpe = Bpe::from_tokenizer_model(&buf);
+        let _inaccessible = bpe.inaccessible();
+    }
+}
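
Note on the technique: the patch packs every vocab piece into one shared
buffer and reuses substring matches, so repeated content is stored only
once and lookups stay cache-friendly. Below is a minimal, self-contained
sketch of the same idea, assuming only the memchr crate; `compress` is a
hypothetical helper for illustration, not this crate's API:

// Packs `pieces` into one shared buffer, returning the buffer plus an
// `(offset, len)` pair for each piece, in the original order.
fn compress(pieces: &[&[u8]]) -> (Vec<u8>, Vec<(usize, usize)>) {
    let mut meta = vec![(0usize, 0usize); pieces.len()];
    let mut buf = Vec::<u8>::new();
    // Visit the longest pieces first: a shorter piece can only be found
    // inside the buffer if a longer piece containing it is already there.
    let mut indices = (0..pieces.len()).collect::<Vec<_>>();
    indices.sort_unstable_by_key(|&i| std::cmp::Reverse(pieces[i].len()));
    for i in indices {
        let p = pieces[i];
        // Reuse an existing occurrence, or append the new content.
        let off = memchr::memmem::find(&buf, p).unwrap_or_else(|| {
            let off = buf.len();
            buf.extend_from_slice(p);
            off
        });
        meta[i] = (off, p.len());
    }
    (buf, meta)
}

fn main() {
    let pieces: &[&[u8]] = &[b"low", b"lowest", b"est", b"low"];
    let (buf, meta) = compress(pieces);
    // "lowest" is stored once; "low", "est" and the repeated "low" all
    // resolve to substrings of it, so the buffer holds just 6 bytes.
    assert_eq!(buf, b"lowest");
    for (p, &(off, len)) in pieces.iter().zip(&meta) {
        assert_eq!(&buf[off..off + len], *p);
    }
    println!("packed {} pieces into {} bytes", pieces.len(), buf.len());
}

One caveat of the greedy pass: it only deduplicates a piece that occurs
entirely inside already-stored content; merging partial prefix/suffix
overlaps would be a shortest-common-superstring problem, which the patch
does not attempt.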
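Note on the byte-token path: a byte token's textual form (e.g. "<0xE4>")
is replaced by its actual one-byte content, borrowed from a table of all
256 byte values built at compile time, so no per-token allocation is
needed. A standalone sketch of that trick; `byte_piece` is a hypothetical
helper for illustration, not this crate's API:

// A table of all 256 byte values. A `static` gives it a fixed address,
// so borrowing an element yields a `&'static` reference.
static BYTES: [u8; 256] = {
    let mut bytes = [0u8; 256];
    let mut i = 0;
    while i < 256 {
        bytes[i] = i as u8;
        i += 1;
    }
    bytes
};

/// Views byte value `b` as a one-byte slice without allocating.
fn byte_piece(b: u8) -> &'static [u8] {
    std::slice::from_ref(&BYTES[b as usize])
}

fn main() {
    assert_eq!(byte_piece(b'A'), b"A".as_slice());
    assert_eq!(byte_piece(0xE4)[0], 0xE4);
    println!("byte pieces resolve without allocation");
}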