From ce88d92973a7f988efea1f1fa39073c068181258 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 12:06:06 -0700
Subject: [PATCH 1/5] Mark Python tests that need network access

---
 bindings/python/pytest.ini                       | 4 ++++
 bindings/python/tests/bindings/test_tokenizer.py | 7 +++++++
 2 files changed, 11 insertions(+)
 create mode 100644 bindings/python/pytest.ini

diff --git a/bindings/python/pytest.ini b/bindings/python/pytest.ini
new file mode 100644
index 000000000..8f8fc06a4
--- /dev/null
+++ b/bindings/python/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    network: mark a test that requires network access.
+    asyncio: mark a test that uses async io.
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 28f6b38d4..69cdaae77 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -376,6 +376,7 @@ def test_decode(self):
         stream = DecodeStream(ids=[0, 1, 2])
         assert stream.step(tokenizer, 3) == " john"
 
+    @pytest.mark.network
     def test_decode_stream_fallback(self):
         tokenizer = Tokenizer.from_pretrained("gpt2")
         # tokenizer.decode([255]) fails because its a fallback
@@ -408,6 +409,7 @@ def test_decode_stream_fallback(self):
         out = stream.step(tokenizer, [109])
         assert out == "อั"
 
+    @pytest.mark.network
     def test_decode_skip_special_tokens(self):
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/Llama-3.1-8B-Instruct")
 
@@ -557,11 +559,13 @@ def test_multiprocessing_with_parallelism(self):
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
 
+    @pytest.mark.network
     def test_from_pretrained(self):
         tokenizer = Tokenizer.from_pretrained("bert-base-cased")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
         assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
 
+    @pytest.mark.network
     def test_from_pretrained_revision(self):
         tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
@@ -597,6 +601,7 @@ def test_unigram_byte_fallback(self):
         assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
         assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
 
+    @pytest.mark.network
     def test_encode_special_tokens(self):
         tokenizer = Tokenizer.from_pretrained("t5-base")
         tokenizer.add_tokens(["<eot>"])
@@ -628,6 +633,7 @@ def test_encode_special_tokens(self):
         output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
         assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
 
+    @pytest.mark.network
     def test_splitting(self):
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-new-metaspace")
         tokenizer.pre_tokenizer.split = False
@@ -724,6 +730,7 @@ def test_repr_complete(self):
         )
 
 
+@pytest.mark.network
 class TestAsyncTokenizer:
     """Tests for async methods of the Tokenizer class."""
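[Note, not part of the patch: registering the markers above keeps pytest from
warning about unknown marks, and from erroring under --strict-markers. With the
"network" marker in place, offline environments can deselect the marked tests
with pytest's standard marker expressions; a typical invocation (the test path
is illustrative) might be:

    $ python -m pytest -m "not network" bindings/python/tests
]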
From 019a839070902f7213449fe52756c697e8ef2f95 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:27:47 -0700
Subject: [PATCH 2/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_encoding.py              | 1 +
 bindings/python/tests/bindings/test_models.py                | 2 ++
 bindings/python/tests/bindings/test_processors.py            | 1 +
 bindings/python/tests/bindings/test_tokenizer.py             | 2 ++
 bindings/python/tests/bindings/test_trainers.py              | 3 +++
 bindings/python/tests/documentation/test_pipeline.py         | 2 ++
 bindings/python/tests/documentation/test_quicktour.py        | 1 +
 bindings/python/tests/implementations/test_bert_wordpiece.py | 2 ++
 bindings/python/tests/implementations/test_byte_level_bpe.py | 4 ++++
 bindings/python/tests/implementations/test_char_bpe.py       | 4 ++++
 bindings/python/tests/test_serialization.py                  | 2 ++
 11 files changed, 24 insertions(+)

diff --git a/bindings/python/tests/bindings/test_encoding.py b/bindings/python/tests/bindings/test_encoding.py
index 80b8cc2bb..90ac36343 100644
--- a/bindings/python/tests/bindings/test_encoding.py
+++ b/bindings/python/tests/bindings/test_encoding.py
@@ -5,6 +5,7 @@
 from ..utils import bert_files, data_dir
 
 
+@pytest.mark.network
 class TestEncoding:
     @pytest.fixture(scope="class")
     def encodings(self, bert_files):
diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index 063698384..f467465b2 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -7,6 +7,7 @@
 
 
 class TestBPE:
+    @pytest.mark.network
     def test_instantiate(self, roberta_files):
         assert isinstance(BPE(), Model)
         assert isinstance(BPE(), BPE)
@@ -75,6 +76,7 @@ def test_dropout_zero(self):
 
 
 class TestWordPiece:
+    @pytest.mark.network
     def test_instantiate(self, bert_files):
         assert isinstance(WordPiece(), Model)
         assert isinstance(WordPiece(), WordPiece)
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 3038d8694..4e98bcc1b 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -70,6 +70,7 @@ def test_instantiate(self):
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
 
+    @pytest.mark.network
     def test_processing(self, roberta_files):
         # Deprecated in 0.9
         with pytest.deprecated_call():
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 69cdaae77..fed7c81bb 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -154,6 +154,7 @@ def test_encode(self):
         output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
 
+    @pytest.mark.network
     def test_encode_formats(self, bert_files):
         with pytest.deprecated_call():
             tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
@@ -286,6 +287,7 @@ def test_pair(input, is_pretokenized=False):
         with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
             tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
 
+    @pytest.mark.network
     def test_encode_add_special_tokens(self, roberta_files):
         with pytest.deprecated_call():
             tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index 38b599448..fa05cb27e 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -150,6 +150,7 @@ def test_can_pickle(self):
 
 
 class TestUnigram:
+    @pytest.mark.network
     def test_train(self, train_files):
         tokenizer = SentencePieceUnigramTokenizer()
         tokenizer.train(train_files["small"], show_progress=False)
@@ -158,6 +159,7 @@ def test_train(self, train_files):
         tokenizer.save(filename)
         os.remove(filename)
 
+    @pytest.mark.network
     def test_train_parallelism_with_custom_pretokenizer(self, train_files):
         class GoodCustomPretok:
             def split(self, n, normalized):
@@ -287,6 +289,7 @@ def test_can_modify(self):
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
 
+    @pytest.mark.network
     def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
index 25300ff64..f6b5cb768 100644
--- a/bindings/python/tests/documentation/test_pipeline.py
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -12,6 +12,7 @@ def print(*args, **kwargs):
 
 
 class TestPipeline:
+    @pytest.mark.network
     def test_pipeline(self, doc_wiki_tokenizer):
         try:
             # START reload_tokenizer
@@ -143,6 +144,7 @@ def slow_train():
         bert_tokenizer.save("data/bert-wiki.json")
         # END bert_train_tokenizer
 
+    @pytest.mark.network
     def test_bert_example(self, doc_pipeline_bert_tokenizer):
         try:
             bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py
index a98b0c12e..fb8fb972e 100644
--- a/bindings/python/tests/documentation/test_quicktour.py
+++ b/bindings/python/tests/documentation/test_quicktour.py
@@ -45,6 +45,7 @@ def get_tokenizer_trainer():
         # END init_pretok
         return tokenizer, trainer
 
+    @pytest.mark.network
     def test_quicktour(self, doc_wiki_tokenizer):
         def print(*args, **kwargs):
             pass
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index 4e7c29cf5..4a20f36ed 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -4,6 +4,7 @@
 
 
 class TestBertWordPieceTokenizer:
+    @pytest.mark.network
     def test_basic_encode(self, bert_files):
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
 
@@ -39,6 +40,7 @@ def test_basic_encode(self, bert_files):
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
         assert output.type_ids == [0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, bert_files):
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 441aded7a..5b027b7cc 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -4,6 +4,7 @@
 
 
 class TestByteLevelBPE:
+    @pytest.mark.network
     def test_basic_encode(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
@@ -32,6 +33,7 @@ def test_basic_encode(self, roberta_files):
             (39, 43),
         ]
 
+    @pytest.mark.network
     def test_add_prefix_space(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
@@ -62,6 +64,7 @@ def test_add_prefix_space(self, roberta_files):
             (39, 43),
         ]
 
+    @pytest.mark.network
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"],
@@ -84,6 +87,7 @@ def test_lowerspace(self, roberta_files):
             "Ġdog",
         ]
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 3ce5cf9a3..789e7d483 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -4,6 +4,7 @@
 
 
 class TestCharBPETokenizer:
+    @pytest.mark.network
     def test_basic_encode(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
 
@@ -31,6 +32,7 @@ def test_basic_encode(self, openai_files):
         ]
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_lowercase(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
         output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
@@ -39,11 +41,13 @@ def test_lowercase(self, openai_files):
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
         assert output.type_ids == [0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_decoding(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
         decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
         assert decoded == "my name is john"
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py
index 9da2c3e27..ef8abc804 100644
--- a/bindings/python/tests/test_serialization.py
+++ b/bindings/python/tests/test_serialization.py
@@ -11,12 +11,14 @@
 
 
 class TestSerialization:
+    @pytest.mark.network
     def test_full_serialization_albert(self, albert_base):
         # Check we can read this file.
         # This used to fail because of BufReader that would fail because the
         # file exceeds the buffer capacity
         Tokenizer.from_file(albert_base)
 
+    @pytest.mark.network
     def test_str_big(self, albert_base):
         tokenizer = Tokenizer.from_file(albert_base)
         assert (
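[Note, not part of the patch: with the per-test markers above in place, network
tests could also be skipped automatically rather than via the command line. A
minimal conftest.py sketch, assuming HF_HUB_OFFLINE as the opt-out signal (the
file location and trigger variable are assumptions, not part of this series):

    # bindings/python/tests/conftest.py (hypothetical sketch)
    import os

    import pytest


    def pytest_collection_modifyitems(config, items):
        # When the environment opts out of network access, attach a skip
        # marker to every collected test that carries the "network" mark.
        if not os.environ.get("HF_HUB_OFFLINE"):
            return
        skip_network = pytest.mark.skip(reason="network access disabled")
        for item in items:
            if "network" in item.keywords:
                item.add_marker(skip_network)
]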
From a65dba5614e9172288b84cee8027674e46640bc9 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:39:58 -0700
Subject: [PATCH 3/5] Mark Python tests that need network access

---
 bindings/python/tests/documentation/test_pipeline.py         | 1 +
 bindings/python/tests/documentation/test_quicktour.py        | 1 +
 bindings/python/tests/implementations/test_bert_wordpiece.py | 1 +
 bindings/python/tests/implementations/test_byte_level_bpe.py | 1 +
 bindings/python/tests/implementations/test_char_bpe.py       | 1 +
 bindings/python/tests/test_serialization.py                  | 1 +
 6 files changed, 6 insertions(+)

diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
index f6b5cb768..a94f9cc61 100644
--- a/bindings/python/tests/documentation/test_pipeline.py
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import Tokenizer
 
 from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py
index fb8fb972e..59d5cceec 100644
--- a/bindings/python/tests/documentation/test_quicktour.py
+++ b/bindings/python/tests/documentation/test_quicktour.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import Tokenizer
 
 from ..utils import data_dir, doc_wiki_tokenizer
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index 4a20f36ed..a7fefc1d3 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import BertWordPieceTokenizer
 
 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 5b027b7cc..51410ac8e 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import ByteLevelBPETokenizer
 
 from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 789e7d483..3449e8b2a 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import CharBPETokenizer
 
 from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py
index ef8abc804..c9060f4ad 100644
--- a/bindings/python/tests/test_serialization.py
+++ b/bindings/python/tests/test_serialization.py
@@ -1,5 +1,6 @@
 import json
 import os
+import pytest
 import unittest
 
 import tqdm

From 56e549fe8224b5db237659132823863040481ca7 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:55:47 -0700
Subject: [PATCH 4/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index f467465b2..23839a3e5 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -114,6 +114,7 @@ def test_can_modify(self):
 
 
 class TestWordLevel:
+    @pytest.mark.network
     def test_instantiate(self, roberta_files):
         assert isinstance(WordLevel(), Model)
         assert isinstance(WordLevel(), WordLevel)

From 7e4fbc556e2d0fe91161ef20b6fe01a1e34b1609 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 14:20:45 -0700
Subject: [PATCH 5/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_trainers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index fa05cb27e..40d4acb0c 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -186,6 +186,7 @@ def test_can_pickle(self):
 
     def test_train_with_special_tokens(self):
         filename = "tests/data/dummy-unigram-special_tokens-train.txt"
+        os.makedirs("tests/data", exist_ok=True)
         with open(filename, "w") as f:
             f.write(
                 """
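[Note, not part of the patch: the final hunk creates tests/data in the working
tree before writing the training file. A sketch of an alternative using
pytest's built-in tmp_path fixture, shown only for comparison — the patch
itself keeps the fixed path, and the file contents below are placeholders,
since the original training string is truncated above:

    def test_train_with_special_tokens(self, tmp_path):
        # tmp_path is a unique per-test directory managed by pytest, so
        # nothing needs to be created or cleaned up in the source tree.
        filename = str(tmp_path / "dummy-unigram-special_tokens-train.txt")
        with open(filename, "w") as f:
            f.write("placeholder training text\n")
]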