feat: visibility of tokens and pattern
Hk669 committed May 29, 2024
1 parent 032cf2d commit 020e8c3
Showing 4 changed files with 32 additions and 6 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -110,8 +110,13 @@ decode_text = tokenizer.decode(ids)
print('---')
print(decode_text)

# you can also print the tokens and the text chunks split by the pattern.
tokens = tokenizer.tokens(encode_text, verbose=True) # with verbose=True, also prints the text chunks and the pattern used to split them
print('---')
print("tokens: ", tokens)

```
refer to the [load_json_vocab](sample/load_json_vocab/) and run the `bpetokenizer_json` to get an overview of `vocab`, `merges`, `special_tokens`.
refer to [load_json_vocab](sample/load_json_vocab/) and run `bpetokenizer_json` to get an overview of `vocab`, `merges`, and `special_tokens`. To view the tokens split by the tokenizer using the pattern, look at [tokens](sample/load_json_vocab/tokens.py).
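For illustration, a minimal sketch of what such a `tokens.py` script might look like; the vocab file name and the `load()` call with its default arguments are assumptions made for this example, not taken from the actual sample.

```python
# hypothetical sketch -- the vocab file name and load() usage are assumptions
from bpetokenizer.tokenizer import BPETokenizer

tokenizer = BPETokenizer()
tokenizer.load("sample_bpetokenizer.json")  # hypothetical path to a previously saved vocab

text = "Hello, world! this is a sample text."
tokens = tokenizer.tokens(text, verbose=True)  # also prints the text chunks and the split pattern
print("tokens: ", tokens)
```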

### Run Tests

9 changes: 8 additions & 1 deletion bpetokenizer/base.py
@@ -3,6 +3,8 @@
and Base class which has the methods to save/load model,
also required to build the BPETokenizer.
"""

import regex as re
from .version import __version__

def get_stats(tokens, counts=None) -> dict:
@@ -62,6 +64,7 @@ class Tokenizer:
def __init__(self):
self.merges = {}
self.pattern = "" # the regex pattern
self.compiled_pattern = re.compile(self.pattern) if self.pattern else ""
self.special_tokens = {}
self.vocab = self._build_vocab() if self.merges else {}

@@ -157,7 +160,11 @@ def load(self, file_name, mode="file"):
with open(file_name, "r", encoding="utf-8") as f:
data = json.load(f)
assert data["version"] == __version__
self.pattern = data[r"pattern"]
pattern = data["pattern"]
pattern_regex = re.compile(r'regex.Regex\("(.+)", flags=(regex\.\w+)\)')
match = pattern_regex.match(pattern)
if match:
self.pattern = match.group(1)
self.special_tokens = data["special_tokens"]
self.inverse_special_tokens = {v: k for k, v in self.special_tokens.items()}
merges = data["merges"]
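For context, this parsing exists because a saved vocab may contain the `repr` of a compiled `regex` pattern rather than the plain pattern string; the extraction recovers the raw pattern so it can be recompiled. A standalone sketch of that extraction, using the same regex the new `load()` code applies (the serialized string below is hand-constructed for illustration, not read from a real vocab file):

```python
import regex as re

# Hand-made example of a pattern serialized via the compiled object's repr,
# shaped like regex.Regex("<pattern>", flags=regex.<FLAG>); not from a real vocab file.
serialized = 'regex.Regex("\\s+|\\w+", flags=regex.V0)'

# same extraction regex used by the new load() code
pattern_regex = re.compile(r'regex.Regex\("(.+)", flags=(regex\.\w+)\)')
match = pattern_regex.match(serialized)
if match:
    raw_pattern = match.group(1)   # the plain pattern string, '\s+|\w+'
    flag_name = match.group(2)     # 'regex.V0'
    print(raw_pattern, flag_name)
```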
20 changes: 17 additions & 3 deletions bpetokenizer/tokenizer.py
@@ -26,7 +26,8 @@ class BPETokenizer(Tokenizer):

def __init__(self, pattern=None, special_tokens=None):
super().__init__()
self.pattern = re.compile(GPT4_SPLIT_PATTERN) if pattern is None else pattern
self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
self.compiled_pattern = re.compile(self.pattern)
self.special_tokens = {} if special_tokens is None else special_tokens
self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}
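
For context, this `__init__` change keeps the raw pattern string in `self.pattern` (easy to serialize) and the compiled form in `self.compiled_pattern`, which `train`, `encode_ord`, and the new `tokens` method below use for chunking. A small illustration of that chunking step, using a simplified stand-in pattern rather than the actual `GPT4_SPLIT_PATTERN`:

```python
import regex as re

# simplified stand-in for GPT4_SPLIT_PATTERN, just for illustration
pattern = r"\s*\p{L}+|\s*\p{N}+|\s*[^\s\p{L}\p{N}]+"
compiled_pattern = re.compile(pattern)

text = "Hello world, 123!"
text_chunks = re.findall(compiled_pattern, text)
print(text_chunks)  # e.g. ['Hello', ' world', ',', ' 123', '!']

# each chunk is then UTF-8 encoded separately, which is why train()
# builds ids as a list of byte lists (List[List[int]])
ids = [list(chunk.encode("utf-8")) for chunk in text_chunks]
```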

@@ -36,7 +37,7 @@ def train(self, texts, vocab_size, verbose=False) -> None:
assert vocab_size >= 256
num_merges = vocab_size - 256

text_chunks = re.findall(self.pattern, texts) # handles the desired pattern of tokens with regex pattern
text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern

ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
merges = {}
@@ -79,7 +80,7 @@ def _encode(self, _bytes) -> list:


def encode_ord(self, text) -> list:
text_chunks = re.findall(self.pattern, text)
text_chunks = re.findall(self.compiled_pattern, text)
ids = []
for chunk in text_chunks:
_bytes = chunk.encode("utf-8")
@@ -144,3 +145,16 @@ def _special_tokens(self, special_tokens) -> None:
self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}


def tokens(self, text, verbose=False) -> list:
text_chunks = re.findall(self.compiled_pattern, text)

_tokens = []
for chunk in text_chunks:
_bytes = chunk.encode("utf-8")
chunk_ids = self._encode(_bytes)
chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
_tokens.extend(chunk_tokens)
if verbose:
print(f"---\ntext chunks: {text_chunks}\n")
print(f"---\npattern: {self.pattern}\n")
return _tokens
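
A quick usage sketch for the new `tokens` method; the training corpus and vocab size below are illustrative choices, not taken from the repository's samples:

```python
from bpetokenizer.tokenizer import BPETokenizer

# illustrative corpus and vocab size; vocab_size must be >= 256
corpus = "low lower lowest newer newest wider widest " * 20
tokenizer = BPETokenizer()
tokenizer.train(corpus, vocab_size=300)

# verbose=True also prints the text chunks and the split pattern used
tokens = tokenizer.tokens("lower and wider", verbose=True)
print("tokens: ", tokens)  # human-readable token strings, one per BPE id
```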
2 changes: 1 addition & 1 deletion bpetokenizer/version.py
@@ -1 +1 @@
__version__ = "1.0.3"
__version__ = "1.0.31"
