
Commit cef7f4a

fixed the tokenizer
1 parent 41a0570 commit cef7f4a

File tree

  test.py
  tokenizer/base.py
  tokenizer/csrc/kmer.cpp
  tokenizer/csrc/kmer.h

4 files changed, +37 -50 lines changed


test.py

Lines changed: 28 additions & 28 deletions
@@ -1,44 +1,44 @@
-# import os
-# current_dir = os.path.dirname(os.path.realpath(__file__))
-# os.chdir(current_dir)
-
-# from tokenizer import KMerTokenizer
-
-# tokenizer = KMerTokenizer(k_mers=8)
-
-# with open('training files/file1.txt', 'r', encoding='utf-8') as f:
-#   test_data = f.read().lower()
-#   print("file opened!")
-# f.close()
-# tokenizer.load_model('tokenizer/vocabs/base_4k.json')
-
-# encoded_tokens = tokenizer.encode(test_data)
-# print(encoded_tokens)
-# decoded_tokens = tokenizer.decode(encoded_tokens)
-# print(decoded_tokens)
-# print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
-# print(test_data == decoded_tokens)
-# print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
-# print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
-
 import os
 current_dir = os.path.dirname(os.path.realpath(__file__))
 os.chdir(current_dir)

-from tokenizer import PerChar
-tokenizer = PerChar()
+from tokenizer import KMerTokenizer
+
+tokenizer = KMerTokenizer(k_mers=8)

 with open('training files/file1.txt', 'r', encoding='utf-8') as f:
-  test_data = f.read()
+  test_data = f.read().lower()
   print("file opened!")
 f.close()
+tokenizer.load_model('tokenizer/vocabs/base_4k.json')

 encoded_tokens = tokenizer.encode(test_data)
 print(encoded_tokens)
 decoded_tokens = tokenizer.decode(encoded_tokens)
 print(decoded_tokens)
-
 print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
 print(test_data == decoded_tokens)
 print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
-print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
+print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
+
+# import os
+# current_dir = os.path.dirname(os.path.realpath(__file__))
+# os.chdir(current_dir)
+
+# from tokenizer import PerChar
+# tokenizer = PerChar()
+
+# with open('training files/file1.txt', 'r', encoding='utf-8') as f:
+#   test_data = f.read()
+#   print("file opened!")
+# f.close()
+
+# encoded_tokens = tokenizer.encode(test_data)
+# print(encoded_tokens)
+# decoded_tokens = tokenizer.decode(encoded_tokens)
+# print(decoded_tokens)
+
+# print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
+# print(test_data == decoded_tokens)
+# print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
+# print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")

tokenizer/base.py

Lines changed: 2 additions & 20 deletions
@@ -1,6 +1,5 @@
 import kmer_c
 import json
-from tqdm import tqdm

 class KMerTokenizer:
   def __init__(self, k_mers: int = 4):
@@ -16,7 +15,7 @@ def encode(self, sequence):

   def decode(self, encoded_sequence):
     return self.tokenizer.decode(encoded_sequence)
-
+
   def save_model(self, model_path):
     vocab_file = f"{model_path}/base_{self.k_mers}k.json"
     with open(vocab_file, 'w') as f:
@@ -30,21 +29,4 @@ def load_model(self, path):
     print("loaded the vocab!")

     self.vocab = vocab
-    self.tokenizer.set_vocab(vocab)
-    self.tokenizer.vocab_size = len(vocab)
-
-    self.id_to_token = [None] * self.vocab_size
-    for token, idx in self.vocab.items():
-      self.id_to_token[idx] = token
-
-# if __name__ == "__main__":
-#   tokenizer = KMerTokenizer(k_mers=4)
-#   sequences = ["ATGCGTAC", "GTCAGTAC"]
-#   for sequence in sequences:
-#     print(tokenizer.tokenize_sequence(sequence))
-#     encoded = tokenizer.encode(sequence)
-#     print(encoded)
-#     decoded = tokenizer.decode(encoded)
-#     print(decoded)
-#   tokenizer.save_model("model")
-#   tokenizer.load_model("model/base_4k.json")
+    self.tokenizer.set_vocab(vocab)
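
The deleted lines duplicated bookkeeping that the C++ extension already performs: set_vocab rebuilds the id-to-token table (and, with this commit, the vocab size), so load_model only needs to hand the parsed JSON over. For reference, the inverse mapping being maintained looks like this in plain Python; the vocab entries are invented for the sketch.

# illustration only: the same inverse mapping that set_vocab builds on the C++ side
vocab = {"ATGCGTAC": 0, "GTCAGTAC": 1}   # made-up entries
id_to_token = [None] * len(vocab)
for token, idx in vocab.items():
  id_to_token[idx] = token
assert id_to_token == ["ATGCGTAC", "GTCAGTAC"]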

tokenizer/csrc/kmer.cpp

Lines changed: 6 additions & 2 deletions
@@ -3,7 +3,7 @@
 #include <cmath>
 #include <algorithm>

-KMerTokenizer::KMerTokenizer(int k_mers) : k_mers(k_mers) {}
+KMerTokenizer::KMerTokenizer(int k_mers) : k_mers(k_mers), vocab_size(0) {}

 std::vector<std::string> KMerTokenizer::tokenize_sequence(const std::string &sequence) {
   std::vector<std::string> kmers;
@@ -20,7 +20,10 @@ std::vector<int> KMerTokenizer::encode(const std::string &sequence) {
     if (token_to_id.find(kmer) != token_to_id.end()) {
       encoded_sequence.push_back(token_to_id[kmer]);
     } else {
-      encoded_sequence.push_back(token_to_id.size() + 1);
+      int new_id = token_to_id.size();
+      token_to_id[kmer] = new_id;
+      id_to_token.push_back(kmer);
+      encoded_sequence.push_back(new_id);
     }
   }
   return encoded_sequence;
@@ -44,6 +47,7 @@ void KMerTokenizer::set_vocab(const std::unordered_map<std::string, int> &vocab)
   for (const auto &pair : vocab) {
     id_to_token[pair.second] = pair.first;
   }
+  vocab_size = vocab.size();
 }

 std::unordered_map<std::string, int> KMerTokenizer::get_vocab() {
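
Before this change an unseen k-mer was encoded as token_to_id.size() + 1, an id with no entry in id_to_token, so decode could not map it back. The encoder now assigns the next free id, registers the k-mer in both maps, and emits that id, which keeps repeated encodes consistent and decodable. A rough way to observe the new behaviour from Python, assuming the kmer_c extension is built; the input string is invented for the sketch:

from tokenizer import KMerTokenizer

tok = KMerTokenizer(k_mers=8)
seq = "ATGCGTACGTCAGTAC"    # made-up input, no vocab loaded
first = tok.encode(seq)     # unseen k-mers receive fresh ids
second = tok.encode(seq)    # the same k-mers now hit the vocab lookup path
print(first == second)      # expected: True, ids stay stable across calls
print(tok.decode(first))    # every emitted id now maps back to a stored k-mer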

tokenizer/csrc/kmer.h

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ class KMerTokenizer {
   int k_mers;
   std::unordered_map<std::string, int> token_to_id;
   std::vector<std::string> id_to_token;
+  int vocab_size;
 };

 #endif
