Commit
Showing 4 changed files with 30 additions and 4 deletions.
@@ -0,0 +1,26 @@
import sys
sys.path.append('../')

from bpetokenizer import BPETokenizer

# initialize the tokenizer
tokenizer = BPETokenizer()

# load the pretrained vocab
tokenizer.load("sample_bpetokenizer.json", mode="json")

text = "<|startoftext|>This method? generates the tokens! which are split, before the tokenization using the pattern: default we use the gpt4 split pattern mentioned in the tiktoken.<|endoftext|>"

# this method returns a list of tokens for the given text.
tokens = tokenizer.tokens(text, verbose=True)  # if verbose, prints the text chunks and the split pattern used.
print('---')
print("tokens: ", tokens)

"""
tokens: ['<|', 'st', 'ar', 't', 'oftext', '|>', 'T', 'h', 'is', ' ', 'm', 'e', 'th', 'o', 'd', '?', ' ', 'g', 'en', 'er', 'a', 't', 'e', 's', ' the', ' token',
's', '!', ' w', 'h', 'i', 'c', 'h', ' a', 'r', 'e', ' s', 'pl', 'i', 't', ',', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' the',
' tokeniz', 'a', 't', 'i', 'on', ' ', 'u', 's', 'ing', ' the', ' ', 'p', 'a', 't', 't', 'er', 'n', ':', ' ', 'd', 'e', 'f', 'a', 'u', 'l', 't', ' w', 'e', ' ',
'u', 'se', ' the', ' ', 'g', 'p', 't', '4', ' s', 'pl', 'i', 't', ' ', 'p', 'a', 't', 't', 'er', 'n', ' ',
'm', 'en', 't', 'i', 'on', 'e', 'd', ' ', 'in', ' the', ' t', 'i', 'k', 't', 'o', 'k', 'en', '.', '<|', 'en', 'd', 'oftext', '|>']
"""
@@ -1,4 +1,4 @@
-bpetokenizer v0.1
+1.0.31

 0
 97 97