-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
72 lines (61 loc) · 2.61 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import struct
import argparse
from typing import List
from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model"  # default path to the Llama SentencePiece tokenizer model
class Tokenizer:
    """Wrapper around a SentencePiece model with a binary export helper.

    Provides ``encode``/``decode`` on top of the loaded model and an
    ``export`` method that serializes the vocabulary (token bytes and
    scores) into a ``.bin`` file next to the model file.
    """

    def __init__(self, tokenizer_model=None):
        """Load the SentencePiece model and cache its basic properties.

        Args:
            tokenizer_model: Path to a SentencePiece model file. When falsy
                (``None`` or an empty string), the module-level
                ``TOKENIZER_MODEL`` path is used instead.

        Raises:
            AssertionError: If the resolved path is not an existing file.
        """
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        # Remembered so export() can derive the output file name from it.
        self.model_path = model_path

        # Vocabulary size and special token IDs, as reported by the model.
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Encode a string into a list of token IDs.

        Args:
            s: Text to encode.
            bos: If true, prepend the BOS token ID.
            eos: If true, append the EOS token ID.

        Returns:
            The token IDs produced by the model, optionally wrapped in
            BOS/EOS markers.

        Raises:
            AssertionError: If ``s`` is not a string.
        """
        assert isinstance(s, str)
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            # Append EOS exactly once; the sequence itself is not repeated.
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode a list of token IDs back into a string."""
        return self.sp_model.decode(t)

    def export(self):
        """Write the vocabulary to a binary file next to the model file.

        File layout (native byte order and alignment, as produced by
        ``struct``): one unsigned int holding the maximum token length in
        bytes, then for every token ID in order a float score, an unsigned
        int byte length, and the UTF-8 bytes of the token.

        The output path is the model path with a trailing ``.model``
        extension swapped for ``.bin``; if the model path has a different
        extension, ``.bin`` is appended so the model file is never
        overwritten.
        """
        # Collect all tokens (post-processed) and their scores as floats.
        tokens, scores = [], []
        for i in range(self.n_words):
            # Fetch the raw piece and apply light post-processing.
            piece = self.sp_model.id_to_piece(i)
            score = self.sp_model.get_score(i)
            if i == self.bos_id:
                piece = '\n<s>\n'
            elif i == self.eos_id:
                piece = '\n</s>\n'
            # SentencePiece marks whitespace with U+2581 (LOWER ONE EIGHTH
            # BLOCK), not the ASCII underscore; real underscores must stay.
            piece = piece.replace('\u2581', ' ')
            tokens.append(piece.encode('utf-8'))
            scores.append(score)

        # Record the max token length (0 for an empty vocabulary).
        max_token_length = max((len(tok) for tok in tokens), default=0)

        # The output file shares the model's name but uses a .bin extension.
        root, ext = os.path.splitext(self.model_path)
        tokenizer_bin = (root if ext == '.model' else self.model_path) + '.bin'
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
if __name__ == "__main__":
    # Command-line entry point: load the tokenizer (optionally from a
    # user-supplied model path) and write its binary export.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-t",
        "--tokenizer_model",
        type=str,
        help="optional path to custom tokenizer ",
    )
    options = cli.parse_args()
    Tokenizer(options.tokenizer_model).export()