- # import os
- # current_dir = os.path.dirname(os.path.realpath(__file__))
- # os.chdir(current_dir)
-
- # from tokenizer import KMerTokenizer
-
- # tokenizer = KMerTokenizer(k_mers=8)
-
- # with open('training files/file1.txt', 'r', encoding='utf-8') as f:
- #     test_data = f.read().lower()
- #     print("file opened!")
- # f.close()
- # tokenizer.load_model('tokenizer/vocabs/base_4k.json')
-
- # encoded_tokens = tokenizer.encode(test_data)
- # print(encoded_tokens)
- # decoded_tokens = tokenizer.decode(encoded_tokens)
- # print(decoded_tokens)
- # print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
- # print(test_data == decoded_tokens)
- # print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
- # print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
-
  import os
  current_dir = os.path.dirname(os.path.realpath(__file__))
  os.chdir(current_dir)

- from tokenizer import PerChar
- tokenizer = PerChar()
+ from tokenizer import KMerTokenizer
+
+ tokenizer = KMerTokenizer(k_mers=8)

  with open('training files/file1.txt', 'r', encoding='utf-8') as f:
-     test_data = f.read()
+     test_data = f.read().lower()
      print("file opened!")
  f.close()
+ tokenizer.load_model('tokenizer/vocabs/base_4k.json')

  encoded_tokens = tokenizer.encode(test_data)
  print(encoded_tokens)
  decoded_tokens = tokenizer.decode(encoded_tokens)
  print(decoded_tokens)
-
  print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
  print(test_data == decoded_tokens)
  print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
- print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
+ print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")
+
+ # import os
+ # current_dir = os.path.dirname(os.path.realpath(__file__))
+ # os.chdir(current_dir)
+
+ # from tokenizer import PerChar
+ # tokenizer = PerChar()
+
+ # with open('training files/file1.txt', 'r', encoding='utf-8') as f:
+ #     test_data = f.read()
+ #     print("file opened!")
+ # f.close()
+
+ # encoded_tokens = tokenizer.encode(test_data)
+ # print(encoded_tokens)
+ # decoded_tokens = tokenizer.decode(encoded_tokens)
+ # print(decoded_tokens)
+
+ # print(f"seq length: {len(test_data)} \ntokens length: {len(decoded_tokens)}")
+ # print(test_data == decoded_tokens)
+ # print(f"file length: {len(test_data)} \ntokens: {len(encoded_tokens)}")
+ # print(f"compression ration: {(len(test_data) / len(encoded_tokens)):.2f}x")