Skip to content

Commit

Permalink
fixed the decoding
Browse files Browse the repository at this point in the history
  • Loading branch information
cahya-wirawan committed Jun 3, 2024
1 parent 33a2c75 commit 3096b13
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rwkv_tokenizer"
version = "0.3.2"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
15 changes: 9 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ use std::io::{self, BufRead};
use regex::Regex;
use trie::Trie;
use unescape::unescape;
use std::str;

#[derive(Debug)]
#[pyclass]
pub(crate) struct Tokenizer {
tokens: Vec<String>,
tokens: Vec<Vec<u8>>,
trie: Trie
}

Expand All @@ -26,7 +27,7 @@ impl Tokenizer {
let reader = io::BufReader::new(file);

let re = Regex::new(r"(\d+)\s+(b?)(.+)\s+(\d+)").unwrap();
tokenizer.tokens.push("".to_string());
tokenizer.tokens.push(vec![0]);
for line in reader.lines() {
let line = line?;
if let Some(captures) = re.captures(&line) {
Expand All @@ -39,12 +40,13 @@ impl Tokenizer {
if is_byte.len() == 0 {
string = unescape(string.as_str()).unwrap();
sbytes = string.clone().into_bytes();
tokenizer.tokens.push(Vec::from(string.as_bytes()));
} else {
sbytes = hex_to_bytes(string.as_str()).unwrap();
tokenizer.tokens.push(sbytes.clone());
}
assert_eq!(sbytes.len(), length);
tokenizer.trie.insert(&sbytes, id);
tokenizer.tokens.push(string.to_string());
}
else {
println!("Line with issue: {:?}", line)
Expand All @@ -58,11 +60,12 @@ impl Tokenizer {
}

pub(crate) fn decode(&self, vec: Vec<u16>) -> String {
let mut result: String = "".to_owned();
let mut result: Vec<u8> = Vec::new();
for index in vec.iter() {
result.push_str(&*self.tokens[*index as usize]);
let mut current_tokens = self.tokens[*index as usize].clone();
result.append(&mut current_tokens);
}
return result;
return str::from_utf8(&*result).unwrap().to_string();
}
}

Expand Down

0 comments on commit 3096b13

Please sign in to comment.