Skip to content

Commit a825d1a

Browse files
wujinzhu authored and committed
fix: remove conflicting <|reserved_200018|>; add token-id uniqueness test
1 parent 97e49cb commit a825d1a

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

tests/test_token_ids_unique.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# tests/test_token_ids_unique.py
2+
# Checks that token IDs are unique. We don't check token "names" (dict keys are unique by definition).
3+
4+
import pytest
5+
import tiktoken
6+
from collections import defaultdict
7+
8+
# Encoding names as reported by tiktoken.list_encoding_names(), evaluated once
# when this module is imported; the tests in this file are parametrized over it.
ENCODING_NAMES = tiktoken.list_encoding_names()
9+
10+
@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_special_token_ids_are_unique(enc_name):
    """Verify that each special token id belongs to exactly one token name.

    The encoding's ``_special_tokens`` mapping (name -> id) is inverted into
    id -> [names]; any id that ends up with more than one name is a collision.
    The test is skipped when the encoding has no (or an empty) mapping.
    """
    encoding = tiktoken.get_encoding(enc_name)
    special_tokens = getattr(encoding, "_special_tokens", {})
    if not special_tokens:
        pytest.skip(f"{enc_name}: no special tokens")

    # Invert the mapping so that names sharing an id land in the same bucket.
    names_by_id = defaultdict(list)
    for token_name in special_tokens:
        names_by_id[special_tokens[token_name]].append(token_name)

    # Keep only the buckets holding two or more names.
    dups = {}
    for token_id, token_names in names_by_id.items():
        if len(token_names) > 1:
            dups[token_id] = token_names

    assert not dups, f"{enc_name}: duplicated special token ids: {dups}"
26+
27+
@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_mergeable_token_ids_are_unique(enc_name):
    """Verify that no two mergeable (vocab) tokens share the same token id.

    The encoding's ``_mergeable_ranks`` mapping (token -> id) is inverted into
    id -> [tokens]. Any id with more than one token is reported in the
    assertion message, so a failure names the colliding ids and tokens rather
    than only stating that a duplicate exists.

    Note: some builds may not expose ``_mergeable_ranks`` on the Python side;
    the test is skipped when the attribute is missing or empty.
    """
    enc = tiktoken.get_encoding(enc_name)
    mr = getattr(enc, "_mergeable_ranks", None)
    if not mr:
        pytest.skip(f"{enc_name}: mergeable ranks not exposed")

    # Group tokens by id so collisions can be listed explicitly on failure.
    id2tokens = defaultdict(list)
    for token, tid in mr.items():
        id2tokens[tid].append(token)

    dups = {tid: tokens for tid, tokens in id2tokens.items() if len(tokens) > 1}
    assert not dups, f"{enc_name}: duplicated mergeable token ids: {dups}"
40+

tiktoken_ext/openai_public.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,12 @@ def o200k_harmony():
142142
"<|reserved_200010|>": 200010,
143143
"<|reserved_200011|>": 200011,
144144
"<|call|>": 200012,
145-
} | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
145+
"<|reserved_200013|>": 200013,
146+
"<|reserved_200014|>": 200014,
147+
"<|reserved_200015|>": 200015,
148+
"<|reserved_200016|>": 200016,
149+
"<|reserved_200017|>": 200017,
150+
} | {f"<|reserved_{i}|>": i for i in range(200019, 201088)}
146151
return {
147152
"name": name,
148153
"pat_str": pat_str,

0 commit comments

Comments
 (0)