|  | 
# tests/test_token_ids_unique.py
# Checks that token IDs are unique. We don't check token "names"
# (dict keys are unique by definition).

import pytest
import tiktoken
from collections import defaultdict

# All encoding names known to this tiktoken build; each test below is
# parametrized over this list so every encoding is checked independently.
ENCODING_NAMES = tiktoken.list_encoding_names()
@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_special_token_ids_are_unique(enc_name):
    """
    Special tokens: no two distinct special-token names may map to the
    same token id. Skips encodings that expose no special tokens.
    """
    enc = tiktoken.get_encoding(enc_name)
    special = getattr(enc, "_special_tokens", {})
    if not special:
        pytest.skip(f"{enc_name}: no special tokens")

    # Single pass: remember the first name seen for each id; any repeat
    # id gets collected (with all of its names) into `dups`.
    first_name = {}
    dups = {}
    for name, tid in special.items():
        if tid in first_name:
            dups.setdefault(tid, [first_name[tid]]).append(name)
        else:
            first_name[tid] = name

    assert not dups, f"{enc_name}: duplicated special token ids: {dups}"
@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_mergeable_token_ids_are_unique(enc_name):
    """
    Mergeable (vocab) tokens: every rank value must be a distinct id.
    Some builds may not expose `_mergeable_ranks` on the Python side;
    such encodings are skipped rather than failed.
    """
    enc = tiktoken.get_encoding(enc_name)
    ranks = getattr(enc, "_mergeable_ranks", None)
    if not ranks:
        pytest.skip(f"{enc_name}: mergeable ranks not exposed")

    all_ids = list(ranks.values())
    distinct_ids = set(all_ids)
    assert len(all_ids) == len(distinct_ids), f"{enc_name}: duplicated mergeable token ids"