From 020e8c3c25d145a94284b14ae24c5d5efa611b5a Mon Sep 17 00:00:00 2001
From: Hk669
Date: Wed, 29 May 2024 14:07:34 +0530
Subject: [PATCH] feat: visibility of tokens and pattern

---
 README.md                 |  7 ++++++-
 bpetokenizer/base.py      |  9 ++++++++-
 bpetokenizer/tokenizer.py | 20 +++++++++++++++++---
 bpetokenizer/version.py   |  2 +-
 4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 826dd58..e526f56 100644
--- a/README.md
+++ b/README.md
@@ -110,8 +110,13 @@ decode_text = tokenizer.decode(ids)
 print('---')
 print(decode_text)
 
+# you can also print the tokens and the text chunks split with the pattern.
+tokens = tokenizer.tokens(encode_text, verbose=True) # verbose=True also prints the text chunks and the pattern used to split them.
+print('---')
+print("tokens: ", tokens)
+
 ```
-refer to the [load_json_vocab](sample/load_json_vocab/) and run the `bpetokenizer_json` to get an overview of `vocab`, `merges`, `special_tokens`.
+refer to [load_json_vocab](sample/load_json_vocab/) and run `bpetokenizer_json` to get an overview of `vocab`, `merges`, and `special_tokens`; to view the tokens the tokenizer splits out with its pattern, look at [tokens](sample/load_json_vocab/tokens.py).
 
 ### Run Tests
 
diff --git a/bpetokenizer/base.py b/bpetokenizer/base.py
index c0ad7da..6c536ed 100644
--- a/bpetokenizer/base.py
+++ b/bpetokenizer/base.py
@@ -3,6 +3,8 @@
 and Base class which has the methods to save/load model,
 also required to build the BPETokenizer.
 """
+
+import regex as re
 from .version import __version__
 
 def get_stats(tokens, counts=None) -> dict:
@@ -62,6 +64,7 @@ class Tokenizer:
 
     def __init__(self):
         self.merges = {}
         self.pattern = "" # the regex pattern
+        self.compiled_pattern = re.compile(self.pattern) if self.pattern else "" # stays "" until a subclass sets a pattern
         self.special_tokens = {}
         self.vocab = self._build_vocab() if self.merges else {}
@@ -157,7 +160,11 @@ def load(self, file_name, mode="file"):
             with open(file_name, "r", encoding="utf-8") as f:
                 data = json.load(f)
                 assert data["version"] == __version__
-                self.pattern = data[r"pattern"]
+                pattern = data["pattern"]
+                pattern_regex = re.compile(r'regex\.Regex\("(.+)", flags=(regex\.\w+)\)') # matches the repr of a compiled regex saved by older versions
+                match = pattern_regex.match(pattern)
+                # fall back to the stored string when it is already a plain pattern
+                self.pattern = match.group(1) if match else pattern
                 self.special_tokens = data["special_tokens"]
                 self.inverse_special_tokens = {v: k for k, v in self.special_tokens.items()}
                 merges = data["merges"]
diff --git a/bpetokenizer/tokenizer.py b/bpetokenizer/tokenizer.py
index 434bfd5..f65fef1 100644
--- a/bpetokenizer/tokenizer.py
+++ b/bpetokenizer/tokenizer.py
@@ -26,7 +26,8 @@ class BPETokenizer(Tokenizer):
 
     def __init__(self, pattern=None, special_tokens=None):
         super().__init__()
-        self.pattern = re.compile(GPT4_SPLIT_PATTERN) if pattern is None else pattern
+        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+        self.compiled_pattern = re.compile(self.pattern)
         self.special_tokens = {} if special_tokens is None else special_tokens
         self.inverse_special_tokens = {} if special_tokens is None else {v: k for k, v in special_tokens.items()}
 
@@ -36,7 +37,7 @@ def train(self, texts, vocab_size, verbose=False) -> None:
         assert vocab_size >= 256
         num_merges = vocab_size - 256
 
-        text_chunks = re.findall(self.pattern, texts) # handles the desired pattern of tokens with regex pattern
+        text_chunks = re.findall(self.compiled_pattern, texts) # handles the desired pattern of tokens with regex pattern
         ids = [list(tokens.encode("utf-8")) for tokens in text_chunks] # List[List[int]]
 
         merges = {}
@@ -79,7 +80,7 @@ def _encode(self, _bytes) -> list:
 
 
     def encode_ord(self, text) -> list:
-        text_chunks = re.findall(self.pattern, text)
+        text_chunks = re.findall(self.compiled_pattern, text)
         ids = []
         for chunk in text_chunks:
             _bytes = chunk.encode("utf-8")
@@ -144,3 +145,16 @@ def _special_tokens(self, special_tokens) -> None:
         self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
 
 
+    def tokens(self, text, verbose=False) -> list:
+        text_chunks = re.findall(self.compiled_pattern, text)
+
+        _tokens = []
+        for chunk in text_chunks:
+            _bytes = chunk.encode("utf-8")
+            chunk_ids = self._encode(_bytes) # BPE-encode the chunk bytes into token ids
+            chunk_tokens = [self.vocab[idx].decode("utf-8", errors="replace") if idx in self.vocab else f"[UNK{idx}]" for idx in chunk_ids]
+            _tokens.extend(chunk_tokens)
+        if verbose:
+            print(f"---\ntext chunks: {text_chunks}\n")
+            print(f"---\npattern: {self.pattern}\n")
+        return _tokens
\ No newline at end of file
diff --git a/bpetokenizer/version.py b/bpetokenizer/version.py
index 679362c..0b5c86e 100644
--- a/bpetokenizer/version.py
+++ b/bpetokenizer/version.py
@@ -1 +1 @@
-__version__ = "1.0.3"
\ No newline at end of file
+__version__ = "1.0.31"
\ No newline at end of file
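Note (not part of the patch): below is a minimal, hedged usage sketch of the tokens() helper this patch adds. The import path follows the file layout shown in the diff (BPETokenizer lives in bpetokenizer/tokenizer.py and defaults to GPT4_SPLIT_PATTERN); the training text and vocab_size are placeholders, and the sketch assumes train() populates the tokenizer's vocab as the rest of the library does.

# hedged sketch: exercise the new BPETokenizer.tokens() method
from bpetokenizer.tokenizer import BPETokenizer  # import path assumed from the diff's file layout

tokenizer = BPETokenizer()  # no pattern given, so GPT4_SPLIT_PATTERN is used
tokenizer.train("low lower lowest newer newest", vocab_size=260)  # placeholder corpus; 260 - 256 = 4 merges

# tokens() re-splits the text with the compiled pattern, encodes each chunk,
# and returns the decoded token strings; verbose=True also prints the text
# chunks and the raw split pattern.
tokens = tokenizer.tokens("lowest newer", verbose=True)
print("tokens:", tokens)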