From ce88d92973a7f988efea1f1fa39073c068181258 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 12:06:06 -0700
Subject: [PATCH 1/5] Mark Python tests that need network access

---
 bindings/python/pytest.ini                       | 4 ++++
 bindings/python/tests/bindings/test_tokenizer.py | 7 +++++++
 2 files changed, 11 insertions(+)
 create mode 100644 bindings/python/pytest.ini

diff --git a/bindings/python/pytest.ini b/bindings/python/pytest.ini
new file mode 100644
index 000000000..8f8fc06a4
--- /dev/null
+++ b/bindings/python/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    network: mark a test that requires network access.
+    asyncio: mark a test that uses async io.
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 28f6b38d4..69cdaae77 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -376,6 +376,7 @@ def test_decode(self):
         stream = DecodeStream(ids=[0, 1, 2])
         assert stream.step(tokenizer, 3) == " john"
 
+    @pytest.mark.network
     def test_decode_stream_fallback(self):
         tokenizer = Tokenizer.from_pretrained("gpt2")
         # tokenizer.decode([255]) fails because its a fallback
@@ -408,6 +409,7 @@ def test_decode_stream_fallback(self):
         out = stream.step(tokenizer, [109])
         assert out == "อั"
 
+    @pytest.mark.network
     def test_decode_skip_special_tokens(self):
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/Llama-3.1-8B-Instruct")
 
@@ -557,11 +559,13 @@ def test_multiprocessing_with_parallelism(self):
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
 
+    @pytest.mark.network
     def test_from_pretrained(self):
         tokenizer = Tokenizer.from_pretrained("bert-base-cased")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
         assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
 
+    @pytest.mark.network
     def test_from_pretrained_revision(self):
         tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
@@ -597,6 +601,7 @@ def test_unigram_byte_fallback(self):
         assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
         assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
 
+    @pytest.mark.network
     def test_encode_special_tokens(self):
         tokenizer = Tokenizer.from_pretrained("t5-base")
         tokenizer.add_tokens(["<eot>"])
@@ -628,6 +633,7 @@ def test_encode_special_tokens(self):
         output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
         assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]
 
+    @pytest.mark.network
     def test_splitting(self):
         tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-new-metaspace")
         tokenizer.pre_tokenizer.split = False
@@ -724,6 +730,7 @@ def test_repr_complete(self):
         )
 
 
+@pytest.mark.network
 class TestAsyncTokenizer:
     """Tests for async methods of the Tokenizer class."""
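[Note, not part of the patch: registering the markers above keeps pytest from
warning about unknown marks, and from erroring under --strict-markers. With the
"network" marker in place, offline environments can deselect the marked tests
with pytest's standard marker expressions; a typical invocation (the test path
is illustrative) might be:

    $ python -m pytest -m "not network" bindings/python/tests
]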
From 019a839070902f7213449fe52756c697e8ef2f95 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:27:47 -0700
Subject: [PATCH 2/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_encoding.py              | 1 +
 bindings/python/tests/bindings/test_models.py                | 2 ++
 bindings/python/tests/bindings/test_processors.py            | 1 +
 bindings/python/tests/bindings/test_tokenizer.py             | 2 ++
 bindings/python/tests/bindings/test_trainers.py              | 3 +++
 bindings/python/tests/documentation/test_pipeline.py         | 2 ++
 bindings/python/tests/documentation/test_quicktour.py        | 1 +
 bindings/python/tests/implementations/test_bert_wordpiece.py | 2 ++
 bindings/python/tests/implementations/test_byte_level_bpe.py | 4 ++++
 bindings/python/tests/implementations/test_char_bpe.py       | 4 ++++
 bindings/python/tests/test_serialization.py                  | 2 ++
 11 files changed, 24 insertions(+)

diff --git a/bindings/python/tests/bindings/test_encoding.py b/bindings/python/tests/bindings/test_encoding.py
index 80b8cc2bb..90ac36343 100644
--- a/bindings/python/tests/bindings/test_encoding.py
+++ b/bindings/python/tests/bindings/test_encoding.py
@@ -5,6 +5,7 @@
 from ..utils import bert_files, data_dir
 
 
+@pytest.mark.network
 class TestEncoding:
     @pytest.fixture(scope="class")
     def encodings(self, bert_files):
diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index 063698384..f467465b2 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -7,6 +7,7 @@
 
 
 class TestBPE:
+    @pytest.mark.network
     def test_instantiate(self, roberta_files):
         assert isinstance(BPE(), Model)
         assert isinstance(BPE(), BPE)
@@ -75,6 +76,7 @@ def test_dropout_zero(self):
 
 
 class TestWordPiece:
+    @pytest.mark.network
     def test_instantiate(self, bert_files):
         assert isinstance(WordPiece(), Model)
         assert isinstance(WordPiece(), WordPiece)
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 3038d8694..4e98bcc1b 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -70,6 +70,7 @@ def test_instantiate(self):
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
 
+    @pytest.mark.network
     def test_processing(self, roberta_files):
         # Deprecated in 0.9
         with pytest.deprecated_call():
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 69cdaae77..fed7c81bb 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -154,6 +154,7 @@ def test_encode(self):
         output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2
 
+    @pytest.mark.network
     def test_encode_formats(self, bert_files):
         with pytest.deprecated_call():
             tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
@@ -286,6 +287,7 @@ def test_pair(input, is_pretokenized=False):
         with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
             tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
 
+    @pytest.mark.network
     def test_encode_add_special_tokens(self, roberta_files):
         with pytest.deprecated_call():
             tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index 38b599448..fa05cb27e 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -150,6 +150,7 @@ def test_can_pickle(self):
 
 
 class TestUnigram:
+    @pytest.mark.network
     def test_train(self, train_files):
         tokenizer = SentencePieceUnigramTokenizer()
         tokenizer.train(train_files["small"], show_progress=False)
@@ -158,6 +159,7 @@ def test_train(self, train_files):
         tokenizer.save(filename)
         os.remove(filename)
 
+    @pytest.mark.network
     def test_train_parallelism_with_custom_pretokenizer(self, train_files):
         class GoodCustomPretok:
             def split(self, n, normalized):
@@ -287,6 +289,7 @@ def test_can_modify(self):
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
 
+    @pytest.mark.network
     def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
index 25300ff64..f6b5cb768 100644
--- a/bindings/python/tests/documentation/test_pipeline.py
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -12,6 +12,7 @@ def print(*args, **kwargs):
 
 
 class TestPipeline:
+    @pytest.mark.network
     def test_pipeline(self, doc_wiki_tokenizer):
         try:
             # START reload_tokenizer
@@ -143,6 +144,7 @@ def slow_train():
         bert_tokenizer.save("data/bert-wiki.json")
         # END bert_train_tokenizer
 
+    @pytest.mark.network
     def test_bert_example(self, doc_pipeline_bert_tokenizer):
         try:
             bert_tokenizer = Tokenizer.from_file("data/bert-wiki.json")
diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py
index a98b0c12e..fb8fb972e 100644
--- a/bindings/python/tests/documentation/test_quicktour.py
+++ b/bindings/python/tests/documentation/test_quicktour.py
@@ -45,6 +45,7 @@ def get_tokenizer_trainer():
         # END init_pretok
         return tokenizer, trainer
 
+    @pytest.mark.network
     def test_quicktour(self, doc_wiki_tokenizer):
         def print(*args, **kwargs):
             pass
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index 4e7c29cf5..4a20f36ed 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -4,6 +4,7 @@
 
 
 class TestBertWordPieceTokenizer:
+    @pytest.mark.network
     def test_basic_encode(self, bert_files):
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
 
@@ -39,6 +40,7 @@ def test_basic_encode(self, bert_files):
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
         assert output.type_ids == [0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, bert_files):
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 441aded7a..5b027b7cc 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -4,6 +4,7 @@
 
 
 class TestByteLevelBPE:
+    @pytest.mark.network
     def test_basic_encode(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
@@ -32,6 +33,7 @@ def test_basic_encode(self, roberta_files):
             (39, 43),
         ]
 
+    @pytest.mark.network
     def test_add_prefix_space(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
@@ -62,6 +64,7 @@ def test_add_prefix_space(self, roberta_files):
             (39, 43),
         ]
 
+    @pytest.mark.network
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(
             roberta_files["vocab"],
@@ -84,6 +87,7 @@ def test_lowerspace(self, roberta_files):
             "Ġdog",
         ]
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 3ce5cf9a3..789e7d483 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -4,6 +4,7 @@
 
 
 class TestCharBPETokenizer:
+    @pytest.mark.network
     def test_basic_encode(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
 
@@ -31,6 +32,7 @@ def test_basic_encode(self, openai_files):
         ]
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_lowercase(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
         output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
@@ -39,11 +41,13 @@ def test_lowercase(self, openai_files):
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
         assert output.type_ids == [0, 0, 0, 0, 1]
 
+    @pytest.mark.network
     def test_decoding(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
         decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
         assert decoded == "my name is john"
 
+    @pytest.mark.network
     def test_multiprocessing_with_parallelism(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py
index 9da2c3e27..ef8abc804 100644
--- a/bindings/python/tests/test_serialization.py
+++ b/bindings/python/tests/test_serialization.py
@@ -11,12 +11,14 @@
 
 
 class TestSerialization:
+    @pytest.mark.network
     def test_full_serialization_albert(self, albert_base):
         # Check we can read this file.
         # This used to fail because of BufReader that would fail because the
         # file exceeds the buffer capacity
         Tokenizer.from_file(albert_base)
 
+    @pytest.mark.network
     def test_str_big(self, albert_base):
         tokenizer = Tokenizer.from_file(albert_base)
         assert (
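[Note, not part of the patch: with the per-test markers above in place, network
tests could also be skipped automatically rather than via the command line. A
minimal conftest.py sketch, assuming HF_HUB_OFFLINE as the opt-out signal (the
file location and trigger variable are assumptions, not part of this series):

    # bindings/python/tests/conftest.py (hypothetical sketch)
    import os

    import pytest


    def pytest_collection_modifyitems(config, items):
        # When the environment opts out of network access, attach a skip
        # marker to every collected test that carries the "network" mark.
        if not os.environ.get("HF_HUB_OFFLINE"):
            return
        skip_network = pytest.mark.skip(reason="network access disabled")
        for item in items:
            if "network" in item.keywords:
                item.add_marker(skip_network)
]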
From a65dba5614e9172288b84cee8027674e46640bc9 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:39:58 -0700
Subject: [PATCH 3/5] Mark Python tests that need network access

---
 bindings/python/tests/documentation/test_pipeline.py         | 1 +
 bindings/python/tests/documentation/test_quicktour.py        | 1 +
 bindings/python/tests/implementations/test_bert_wordpiece.py | 1 +
 bindings/python/tests/implementations/test_byte_level_bpe.py | 1 +
 bindings/python/tests/implementations/test_char_bpe.py       | 1 +
 bindings/python/tests/test_serialization.py                  | 1 +
 6 files changed, 6 insertions(+)

diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py
index f6b5cb768..a94f9cc61 100644
--- a/bindings/python/tests/documentation/test_pipeline.py
+++ b/bindings/python/tests/documentation/test_pipeline.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import Tokenizer
 
 from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py
index fb8fb972e..59d5cceec 100644
--- a/bindings/python/tests/documentation/test_quicktour.py
+++ b/bindings/python/tests/documentation/test_quicktour.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import Tokenizer
 
 from ..utils import data_dir, doc_wiki_tokenizer
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index 4a20f36ed..a7fefc1d3 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import BertWordPieceTokenizer
 
 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index 5b027b7cc..51410ac8e 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import ByteLevelBPETokenizer
 
 from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 789e7d483..3449e8b2a 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -1,3 +1,4 @@
+import pytest
 from tokenizers import CharBPETokenizer
 
 from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py
index ef8abc804..c9060f4ad 100644
--- a/bindings/python/tests/test_serialization.py
+++ b/bindings/python/tests/test_serialization.py
@@ -1,5 +1,6 @@
 import json
 import os
+import pytest
 import unittest
 
 import tqdm

From 56e549fe8224b5db237659132823863040481ca7 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 13:55:47 -0700
Subject: [PATCH 4/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py
index f467465b2..23839a3e5 100644
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -114,6 +114,7 @@ def test_can_modify(self):
 
 
 class TestWordLevel:
+    @pytest.mark.network
     def test_instantiate(self, roberta_files):
         assert isinstance(WordLevel(), Model)
         assert isinstance(WordLevel(), WordLevel)

From 7e4fbc556e2d0fe91161ef20b6fe01a1e34b1609 Mon Sep 17 00:00:00 2001
From: Gordon Messmer
Date: Thu, 2 Oct 2025 14:20:45 -0700
Subject: [PATCH 5/5] Mark Python tests that need network access

---
 bindings/python/tests/bindings/test_trainers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_trainers.py b/bindings/python/tests/bindings/test_trainers.py
index fa05cb27e..40d4acb0c 100644
--- a/bindings/python/tests/bindings/test_trainers.py
+++ b/bindings/python/tests/bindings/test_trainers.py
@@ -186,6 +186,7 @@ def test_can_pickle(self):
 
     def test_train_with_special_tokens(self):
         filename = "tests/data/dummy-unigram-special_tokens-train.txt"
+        os.makedirs("tests/data", exist_ok=True)
         with open(filename, "w") as f:
             f.write(
                 """
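[Note, not part of the patch: the final hunk creates tests/data in the working
tree before writing the training file. A sketch of an alternative using
pytest's built-in tmp_path fixture, shown only for comparison — the patch
itself keeps the fixed path, and the file contents below are placeholders,
since the original training string is truncated above:

    def test_train_with_special_tokens(self, tmp_path):
        # tmp_path is a unique per-test directory managed by pytest, so
        # nothing needs to be created or cleaned up in the source tree.
        filename = str(tmp_path / "dummy-unigram-special_tokens-train.txt")
        with open(filename, "w") as f:
            f.write("placeholder training text\n")
]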