From 7637cb980dbfa49f1892a1472e150c981a6a52b7 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 21 Oct 2025 19:28:46 +0200
Subject: [PATCH 1/7] docs: add prefix space to pyi and rust

---
 bindings/python/tests/bindings/test_processors.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 3038d8694..c72eb3f6e 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
     def test_instantiate(self):
         assert ByteLevel() is not None
         assert ByteLevel(trim_offsets=True) is not None
+        assert ByteLevel(add_prefix_space=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)

From 9ec05c94c27b18603e65d13670326570488c4ecb Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 21 Oct 2025 19:31:12 +0200
Subject: [PATCH 2/7] add prefix space to pyi

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index 5136d02bb..f04806cd6 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -98,8 +98,11 @@ class ByteLevel(PostProcessor):
     Args:
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
     """
-    def __init__(self, trim_offsets=True):
+    def __init__(self, trim_offsets=True, add_prefix_space=True):
         pass
 
     def num_special_tokens_to_add(self, is_pair):

From c9296a331f6b07de9c01e8a1021da6891096977b Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 10:05:09 +0200
Subject: [PATCH 3/7] update docs for processors.rs

---
 bindings/python/src/processors.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 03fa6bdf7..60d4c8ece 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -484,12 +484,16 @@ impl PyRobertaProcessing {
 /// Args:
 ///     trim_offsets (:obj:`bool`):
 ///         Whether to trim the whitespaces from the produced offsets.
+///
+///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+///         Whether the add_prefix_space option was enabled during pre-tokenization. This
+///         is relevant because it defines the way the offsets are trimmed out.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
+    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_state=True)")]
     fn new(
         add_prefix_space: Option<bool>,
         trim_offsets: Option<bool>,

From 24daf3aba16f7b12edd0024a547a9dfbc8d20d77 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 12:56:01 +0100
Subject: [PATCH 4/7] update docs and add tests

---
 .../py_src/tokenizers/processors/__init__.pyi     |  5 +++--
 bindings/python/src/processors.rs                 |  5 +++--
 bindings/python/tests/bindings/test_processors.py | 14 +++++++++++++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index f04806cd6..6a6eb8f1a 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -99,8 +99,9 @@ class ByteLevel(PostProcessor):
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to add a space to the first word if there isn't already one. This
-            lets us treat `hello` exactly like `say hello`.
+            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+            is set to :obj:`True`.
     """
     def __init__(self, trim_offsets=True, add_prefix_space=True):
         pass

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 60d4c8ece..1f960c6d4 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -486,8 +486,9 @@ impl PyRobertaProcessing {
 ///         Whether to trim the whitespaces from the produced offsets.
 ///
 ///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-///         Whether the add_prefix_space option was enabled during pre-tokenization. This
-///         is relevant because it defines the way the offsets are trimmed out.
+///         If :obj:`True`, keeps the first token's offset as is.  If :obj:`False`, increments
+///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]

diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index c72eb3f6e..50cf770f0 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -83,11 +83,23 @@ def test_processing(self, roberta_files):
         assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
 
         # Trims offsets when activated
-        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
 
+        # Trims offsets without adding prefix space at first token
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
+
+        # add_prefix_space without trimming offsets has no effect
+        tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
     def test_manual_reload(self):
         byte_level = ByteLevel()
         state = json.loads(byte_level.__getstate__())

From b6a7b908e31f54c54e0515e786eaf1112c01ec0d Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 14:06:10 +0100
Subject: [PATCH 5/7] update processors.rs

---
 bindings/python/src/processors.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 1f960c6d4..4a25c910c 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -486,7 +486,7 @@ impl PyRobertaProcessing {
 ///         Whether to trim the whitespaces from the produced offsets.
 ///
 ///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-///         If :obj:`True`, keeps the first token's offset as is.  If :obj:`False`, increments
+///         If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
 ///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
 ///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]

From 0ab19d7390c199096218188cd1f0a5291415851a Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 16:29:01 +0100
Subject: [PATCH 6/7] make style

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index 6a6eb8f1a..ba7df03f9 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -98,12 +98,12 @@ class ByteLevel(PostProcessor):
     Args:
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
+
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
-            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
-            is set to :obj:`True`.
+            Whether the add_prefix_space option was enabled during pre-tokenization. This
+            is relevant because it defines the way the offsets are trimmed out.
     """
-    def __init__(self, trim_offsets=True, add_prefix_space=True):
+    def __init__(self, trim_offsets=True, add_prefix_state=True):
         pass
 
     def num_special_tokens_to_add(self, is_pair):

From 295beb2a2905b500a9b4f98e6387d12f36a02f11 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 17:19:21 +0100
Subject: [PATCH 7/7] update init.pyi through stubs.py

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index ba7df03f9..930eba87a 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -100,8 +100,9 @@ class ByteLevel(PostProcessor):
             Whether to trim the whitespaces from the produced offsets.
 
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether the add_prefix_space option was enabled during pre-tokenization. This
-            is relevant because it defines the way the offsets are trimmed out.
+            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+            is set to :obj:`True`.
     """
     def __init__(self, trim_offsets=True, add_prefix_state=True):
        pass
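
A minimal usage sketch of the offset behaviour documented above; it is not part
of the patch series. It assumes a RoBERTa-style vocab.json / merges.txt pair is
available locally (placeholder paths standing in for the roberta_files fixture
used in the tests), and the expected offsets mirror the assertions added in
PATCH 4/7.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
    from tokenizers.processors import ByteLevel

    # Placeholder vocab/merges paths; any byte-level BPE vocabulary behaves the same way.
    tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # No trimming: offsets keep the leading whitespace of each word.
    tokenizer.post_processor = ByteLevel(trim_offsets=False)
    print(tokenizer.encode("My name is John").offsets)
    # [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trimming with add_prefix_space=True keeps the first token's start at 0.
    tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
    print(tokenizer.encode("My name is John").offsets)
    # [(0, 2), (3, 7), (8, 10), (11, 15)]

    # Trimming with add_prefix_space=False shifts the first token's start by 1.
    tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
    print(tokenizer.encode("My name is John").offsets)
    # [(1, 2), (3, 7), (8, 10), (11, 15)]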