from collections import defaultdict
from enum import Enum
import json
import tempfile
from bpe_utils import (
    dict_to_defaultdict,
    duplicate_file,
    read_binary_chunk,
    read_utf_8_chunk,
)
import os
import argparse
import numpy as np

default_vocabulary_size = 50_257


class BPETokenizer:
    """
    Byte-Pair Encoding Tokenizer
    """

    def __init__(self, vocabulary: dict = None) -> None:
        self.vocabulary = dict_to_defaultdict(
            vocabulary or {token: token for token in range(256)}
        )
        self.token_frequencies = defaultdict(int)
    def train(
        self,
        dataset_path,
        vocabulary_size: int = default_vocabulary_size,
        in_place: bool = False,
    ):
        """
        Train the tokenizer.

        :param in_place: if True, modify the dataset file itself while training, which
            saves disk space by not copying the dataset first (bool)

        Procedure (the dataset is processed in chunks, so it never has to fit into memory at once):
        - Reset the token-frequency counter
        - Count how often each pair of subsequent tokens occurs; in the text "hey, hello"
          the pair ("h", "e") occurs twice and every other pair occurs once
        - Pick the pair with the most occurrences
        - Combine those two tokens into a new token and add it to the vocabulary
        - Replace every occurrence of the most frequent pair in the dataset (working copy,
          or the original if in_place=True) with the new token index
        - Repeat from the first step until the vocabulary reaches vocabulary_size or
          every pair occurs only once
        (A minimal in-memory sketch of a single merge step, _illustrate_single_merge,
        is included after this class for illustration.)
        """
        if dataset_path is None:
            raise Exception(
                "Please specify a path to a local dataset file so it can be read into the tokenizer in chunks"
            )
        if in_place:
            working_copy_path = dataset_path
        else:
            working_copy_path = "working_copy.txt"
            duplicate_file(dataset_path, working_copy_path)
        # First pass: turn every character into byte tokens and store them in a binary representation
        temp_file, temp_file_path = tempfile.mkstemp()
        with open(working_copy_path, "r", encoding="utf-8") as working_copy, open(
            temp_file, "wb"
        ) as temp_copy:
            for chunk in read_utf_8_chunk(working_copy):
                chunk = chunk.encode(encoding="utf-8")
                for token in chunk:
                    temp_copy.write(
                        token.to_bytes(4, byteorder="little")
                    )  # the token size in bytes (4, 3, 2 or 1) could be derived from vocabulary_size and reused wherever bytes are handled
        os.replace(temp_file_path, working_copy_path)
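        # For example, a toy two-character chunk "he" (an assumed input, for illustration only)
        # is written as the 8 bytes 68 00 00 00 65 00 00 00: tokens 104 and 101 as 4-byte
        # little-endian words.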

        while len(self.vocabulary) < vocabulary_size and (
            len(self.token_frequencies) == 0 or not self.all_pairs_are_unique()
        ):
            self.token_frequencies = defaultdict(int)  # reset the token-frequency counter
            with open(working_copy_path, "rb") as working_copy:
                for chunk in read_binary_chunk(file=working_copy):
                    tokens = np.frombuffer(
                        chunk, dtype=np.uint32
                    )  # adjust here based on vocabulary_size
                    self.count_token_frequencies(tokens)
            most_frequent_pair = self.sort_by_token_frequency(self.token_frequencies)[0]
            new_token_index = len(self.vocabulary)
            self.vocabulary[new_token_index] = (
                int(most_frequent_pair[0][0]),
                int(most_frequent_pair[0][1]),
            )
            temp_file, temp_file_path = tempfile.mkstemp()
            # replace all occurrences of the most frequent pair
            with open(
                working_copy_path,
                "rb",
            ) as working_copy, open(temp_file, "wb") as temp_copy:
                for chunk in read_binary_chunk(working_copy):
                    tokens = np.frombuffer(
                        chunk, dtype=np.uint32  # adjust here based on vocabulary_size
                    )
                    # This merge code is heavily inspired by how Andrej Karpathy implemented it in
                    # https://github.com/karpathy/minbpe/blob/master/minbpe/base.py#L25
                    # Previously I tried slicing arrays and inserting new tokens in the middle,
                    # and handling the indices properly was too much of a headache.
                    new_tokens = []
                    index = 0
                    while index < len(tokens):
                        if (
                            index + 1 < len(tokens)
                            and most_frequent_pair[0][0] == tokens[index]
                            and most_frequent_pair[0][1] == tokens[index + 1]
                        ):
                            new_tokens.append(
                                np.uint32(new_token_index)
                            )  # to be adjusted based on vocabulary_size
                            index += 2
                        else:
                            new_tokens.append(tokens[index])
                            index += 1
                    for token in new_tokens:
                        temp_copy.write(token.tobytes())
            os.replace(temp_file_path, working_copy_path)

    def all_pairs_are_unique(self):
        return self.sort_by_token_frequency(self.token_frequencies)[0][1] == 1

    def count_token_frequencies(self, data):
        # Count every overlapping pair of adjacent tokens in the chunk
        for index in range(len(data) - 1):
            self.token_frequencies[(data[index], data[index + 1])] += 1

    def sort_by_token_frequency(self, data: dict):
        return list(sorted(data.items(), key=lambda item: item[1], reverse=True))

    def tokenize(self, data):
        tokens = list(map(int, data.encode(encoding="utf-8")))
        added_keys = [key for key in self.vocabulary.keys() if int(key) > 255]
        for vocabulary_item in added_keys:
            new_tokens = []
            index = 0
            while index < len(tokens):
                if (
                    index + 1 < len(tokens)
                    and self.vocabulary[vocabulary_item][0] == tokens[index]
                    and self.vocabulary[vocabulary_item][1] == tokens[index + 1]
                ):
                    new_tokens.append(int(vocabulary_item))
                    index += 2
                else:
                    new_tokens.append(tokens[index])
                    index += 1
            tokens = new_tokens
        return tokens

    def detokenize(self, tokens):
        # We assume the input to be an array of tokens from the vocabulary.
        # Procedure (a worked example follows this method):
        # - iterate over the vocabulary keys in reverse order, from the last one down to 255;
        #   255 and below are not processed, since 0-255 are the predefined byte tokens
        # - find every token equal to the current vocabulary key
        # - replace each of them with the pair of consecutive tokens stored as that key's value
        # - repeat until we reach 255
        # - decode the remaining byte tokens into a string and return it
        # Note: keys are looked up as strings because the vocabulary is loaded from JSON.
        for vocabulary_token in sorted(
            list(map(int, self.vocabulary.keys())), reverse=True
        ):
            if vocabulary_token == 255:
                break
            index = 0
            new_tokens = []
            while index < len(tokens):
                if tokens[index] == vocabulary_token:
                    new_tokens.append(self.vocabulary[str(vocabulary_token)][0])
                    new_tokens.append(self.vocabulary[str(vocabulary_token)][1])
                else:
                    new_tokens.append(tokens[index])
                index += 1
            tokens = new_tokens
        return bytes(tokens).decode("utf-8")

    def load_vocabulary(self, vocabulary: dict = None):
        self.vocabulary = vocabulary or {}

    def load(self, vocabulary: dict = None):
        self.vocabulary = vocabulary or {}
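

# A minimal, in-memory sketch (not used by the tokenizer above) of what a single iteration
# of the training loop does: count adjacent token pairs, pick the most frequent one, and
# merge it into a new token. The name _illustrate_single_merge and the toy input below are
# illustrative assumptions, not part of the CLI.
def _illustrate_single_merge(tokens: list, new_token_index: int):
    # Count every overlapping pair of adjacent tokens
    pair_counts = defaultdict(int)
    for i in range(len(tokens) - 1):
        pair_counts[(tokens[i], tokens[i + 1])] += 1
    # Pick the most frequent pair
    most_frequent_pair, count = max(pair_counts.items(), key=lambda item: item[1])
    # Replace every occurrence of that pair with the new token index
    merged = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == most_frequent_pair:
            merged.append(new_token_index)
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return most_frequent_pair, count, merged


# For example, _illustrate_single_merge(list("hey, hello".encode("utf-8")), 256) finds the
# pair (104, 101) (the bytes of "h", "e") occurring twice and returns
# ((104, 101), 2, [256, 121, 44, 32, 256, 108, 108, 111]).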


class BPEAction(Enum):
    train = "train"
    tokenize = "tokenize"
    detokenize = "detokenize"

    def __str__(self) -> str:
        return self.value


default_training_dataset_location = "training.txt"
default_training_output_location = "tokenizer.json"
default_inference_data_location = default_training_output_location
default_tokenize_data_location = "tokenize.txt"
default_detokenize_data_location = "detokenize.txt"


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Byte-Pair Encoding Tokenizer. Default training dataset location: 'training.txt'. Default tokenize data location: 'tokenize.txt', file structure: {'data': 'Hello world, Morty!'}. Default detokenize data location: 'detokenize.txt', file structure: {'data': [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 256]}."
    )
    parser.add_argument("action", type=BPEAction, choices=list(BPEAction))
    parser.add_argument(
        "--training_dataset",
    )
    parser.add_argument("--training_output")
    parser.add_argument("--vocabulary_size", type=int)
    parser.add_argument("--tokenizer_data")
    parser.add_argument("--run_data")
    parser.add_argument(
        "--in_place",
        action="store_true",
        help="Use carefully! Modifies the dataset file in place during training. This corrupts your dataset but saves disk space while training",
    )
    parser.add_argument(
        "text", nargs="?", help="Text to be tokenized when running the tokenizer"
    )
    args = parser.parse_args()

    if args.action == BPEAction.train:
        training_dataset = args.training_dataset or default_training_dataset_location
        if not os.path.exists(training_dataset):
            print("Please provide a training dataset")
            exit(1)
        training_output = args.training_output or default_training_output_location
        output_dir = os.path.dirname(training_output)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        vocabulary_size = args.vocabulary_size or default_vocabulary_size
        tokenizer = BPETokenizer()
        tokenizer.train(
            dataset_path=training_dataset,
            in_place=args.in_place or False,
            vocabulary_size=vocabulary_size,
        )
        save_output = {"vocabulary": tokenizer.vocabulary}
        with open(training_output, "w") as output:
            json.dump(save_output, output, indent=4)
        print(
            f"Tokenizer trained successfully. Vocabulary is now stored in {training_output}"
        )

    if args.action == BPEAction.tokenize:
        tokenizer_data_location = args.tokenizer_data or default_inference_data_location
        if not os.path.exists(tokenizer_data_location):
            print("Please provide tokenizer data as a JSON that contains the vocabulary")
            exit(1)
        run_data_location = args.run_data or default_tokenize_data_location
        if not os.path.exists(run_data_location):
            print("Please provide text to tokenize")
            exit(1)
        with open(run_data_location, "r") as run_data_file:
            inference_content = json.load(run_data_file)
        inference_content = inference_content.get("data", "")
        with open(tokenizer_data_location, "r") as tokenizer_data_file:
            tokenizer_data = json.load(tokenizer_data_file)
        vocabulary = tokenizer_data.get("vocabulary")
        tokenizer = BPETokenizer(vocabulary=vocabulary)
        print(tokenizer.tokenize(inference_content))

    if args.action == BPEAction.detokenize:
        tokenizer_data_location = args.tokenizer_data or default_inference_data_location
        if not os.path.exists(tokenizer_data_location):
            print("Please provide tokenizer data as a JSON that contains the vocabulary")
            exit(1)
        run_data_location = args.run_data or default_detokenize_data_location
        if not os.path.exists(run_data_location):
            print("Please provide tokens to detokenize")
            exit(1)
        with open(run_data_location, "r") as run_data_file:
            inference_content = json.load(run_data_file)
        inference_content = inference_content.get("data", [])
        with open(tokenizer_data_location, "r") as tokenizer_data_file:
            tokenizer_data = json.load(tokenizer_data_file)
        vocabulary = tokenizer_data.get("vocabulary")
        tokenizer = BPETokenizer(vocabulary=vocabulary)
        print(tokenizer.detokenize(inference_content))
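
# Example invocations, assuming a local training.txt and the default file locations
# (these commands only exercise the flags defined above; paths and sizes are illustrative):
#   python bpe_tokenizer.py train --training_dataset training.txt --vocabulary_size 300
#   python bpe_tokenizer.py tokenize --tokenizer_data tokenizer.json --run_data tokenize.txt
#   python bpe_tokenizer.py detokenize --tokenizer_data tokenizer.json --run_data detokenize.txt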