
Commit

feat: from_pretrained enabled with wi17k_base (#6)
* feat: from_pretrained enabled with wi17k_base

* fix: tests_train_bpe_w_special_tokens
Hk669 authored Jun 5, 2024
1 parent 878f9ea commit e5d5e43
Showing 6 changed files with 35 additions and 10 deletions.
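
For context, a minimal usage sketch of the feature this commit enables (it assumes the package exports the tokenizer class as BPETokenizer and that the bundled wi17k_base files sit where from_pretrained looks for them):

# Minimal usage sketch of the from_pretrained flow introduced in this commit.
# Assumes the tokenizer class is exported as BPETokenizer and the wi17k_base
# JSON ships with the package.
from bpetokenizer import BPETokenizer

tokenizer = BPETokenizer.from_pretrained("wi17k_base", verbose=True)

text = "Hello, World!"
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))   # should round-trip back to the original text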
3 changes: 2 additions & 1 deletion bpetokenizer/base.py
@@ -171,6 +171,7 @@ def load(self, file_name, mode="json"):
self.merges = {tuple(map(int, k.strip('()').split(','))): v for k, v in merges.items()}
vocab = data["vocab"]
self.vocab = {int(k): v.encode("utf-8") for k, v in vocab.items()}
self.inverse_vocab = {v.decode("utf-8"): k for k, v in self.vocab.items()}



@@ -197,7 +198,7 @@ def decode(self, ids):
text = bytes_str.decode("utf-8", errors="replace")
return text

def train(self, texts, vocab_size, verbose=False, min_frequency=2):
def train(self, texts, vocab_size, verbose=False, min_frequency=1):
"""
Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256.
params:
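The added line keeps a reverse mapping alongside the vocab, so lookups can go in both directions after loading. A toy illustration of the two shapes (the real entries come from the tokenizer's JSON file; these values are made up):

# id -> bytes, as rebuilt by load()
vocab = {104: "h".encode("utf-8"), 105: "i".encode("utf-8")}
# str -> id, the inverse direction added in this commit
inverse_vocab = {v.decode("utf-8"): k for k, v in vocab.items()}

assert inverse_vocab["h"] == 104
assert vocab[inverse_vocab["i"]] == b"i"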
@@ -1,5 +1,5 @@
{
"version": "1.0.32",
"version": "1.0.4",
"pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
"special_tokens": {
"<PAD>": 17311,
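For reference, a small sketch of how a split pattern like the one stored in this JSON chunks text when compiled with the regex module (the sample string and output are illustrative):

# Sketch: chunking text with the GPT-4-style split pattern via `regex`.
import regex as re

pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
compiled = re.compile(pattern)

print(re.findall(compiled, "Hello, World!"))
# ['Hello', ',', ' World', '!']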
31 changes: 26 additions & 5 deletions bpetokenizer/tokenizer.py
@@ -16,6 +16,7 @@

from .base import Tokenizer, get_stats, merge
import regex as re
import os

# from the openai/tiktoken (used in gpt4 tokenizer)
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # raw string
@@ -32,7 +33,22 @@ def __init__(self, pattern=None, special_tokens=None):
self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}


def train(self, texts, vocab_size, verbose=False, min_frequency=2) -> None:
    @classmethod
    def from_pretrained(cls,
                        tokenizer_name: str,
                        verbose=False):
        tokenizer = cls()
        pretrained_dir = 'bpetokenizer/pretrained'
        tokenizer_file = os.path.join(pretrained_dir, tokenizer_name, f'{tokenizer_name}.json')
        if verbose:
            print(f"loading tokenizer from: {tokenizer_file}")
        if not os.path.exists(tokenizer_file):
            raise FileNotFoundError(f"tokenizer file not found: {tokenizer_file}. Please check the tokenizer name")
        tokenizer.load(tokenizer_file, mode="json")
        return tokenizer


def train(self, texts, vocab_size, verbose=False, min_frequency=1) -> None:
"""
Train the tokenizer on the given texts and vocab size. The vocab size should be greater than 256.
params:
@@ -92,9 +108,12 @@ def encode_ord(self, text) -> list:
text_chunks = re.findall(self.compiled_pattern, text)
ids = []
for chunk in text_chunks:
_bytes = chunk.encode("utf-8")
chunk_ids = self._encode(_bytes)
ids.extend(chunk_ids)
if chunk in self.vocab:
ids.append(self.vocab[chunk])
else:
_bytes = chunk.encode("utf-8")
chunk_ids = self._encode(_bytes)
ids.extend(chunk_ids)
return ids
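
The new branch adds a whole-chunk fast path: when a chunk is found directly in the lookup, its id is appended as-is; otherwise the chunk falls back to byte-level encoding and merging. A toy sketch of that idea (encode_chunks, token_to_id, and byte_level_encode are hypothetical stand-ins; in the tokenizer itself the string-to-id direction lives in inverse_vocab and the fallback is _encode):

# Toy sketch of the whole-chunk fast path behind the encode_ord change.
def encode_chunks(chunks, token_to_id, byte_level_encode):
    ids = []
    for chunk in chunks:
        if chunk in token_to_id:                 # whole chunk is already a known token
            ids.append(token_to_id[chunk])
        else:                                    # fall back to byte-level encoding
            ids.extend(byte_level_encode(chunk.encode("utf-8")))
    return ids

# e.g. with a toy table and a fallback that emits raw byte values:
print(encode_chunks([" the", "zzz"], {" the": 300}, lambda b: list(b)))
# -> [300, 122, 122, 122]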


@@ -164,6 +183,8 @@ def tokens(self, text, verbose=False) -> list:
chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
_tokens.extend(chunk_tokens)
if verbose:
print(f"---\nlength: {len(text_chunks)}\n")
print(f"---\ntext chunks: {text_chunks}\n")
print(f"---\npattern: {self.pattern}\n")
return _tokens
return _tokens

2 changes: 1 addition & 1 deletion bpetokenizer/version.py
@@ -1 +1 @@
__version__ = "1.0.32"
__version__ = "1.0.4"
3 changes: 3 additions & 0 deletions setup.py
@@ -25,6 +25,9 @@
author_email="hrushi669@gmail.com",
license="MIT",
packages=find_packages(include=["bpetokenizer"]),
package_data={
'bpetokenizer': ['pretrained/wi17k_base/wi17k_base.json'],
},
classifiers=[
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
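With the JSON declared as package data, it ships inside the installed package. A small sketch of locating it relative to the installed module (the layout is assumed from the package_data entry above):

# Sketch: resolving the bundled pretrained file next to the installed package.
import os
import bpetokenizer

pretrained_json = os.path.join(
    os.path.dirname(bpetokenizer.__file__),
    "pretrained", "wi17k_base", "wi17k_base.json",
)
print(os.path.exists(pretrained_json))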
4 changes: 2 additions & 2 deletions tests/test_tokenizer.py
@@ -61,8 +61,8 @@ def test_train_bpe_w_special_tokens():
texts = "<|startoftext|> Hello, World! This is a sample text with the special tokens [SPECIAL1] and [SPECIAL2] to test the tokenizer.<|endoftext|>"
tokenizer.train(texts, vocab_size=310, verbose=False)

assert len(tokenizer.vocab) == 281
assert len(tokenizer.merges) == 25
assert len(tokenizer.vocab) == 310
assert len(tokenizer.merges) == 54
assert tokenizer.decode(tokenizer.encode(texts)) == texts
assert tokenizer.inverse_special_tokens == {v: k for k,v in special_tokens.items()}
assert tokenizer.special_tokens == special_tokens
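
The updated numbers follow the usual BPE accounting: the vocab holds the 256 base byte tokens plus one entry per learned merge, while special tokens live in their own mapping. A quick check of that arithmetic:

# Accounting behind the updated expectations:
# 256 base byte tokens + one vocab entry per learned merge.
base_byte_tokens = 256
learned_merges = 54
assert base_byte_tokens + learned_merges == 310   # == len(tokenizer.vocab)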

