Skip to content

Commit b83d7c9

Browse files
authored
DOCS: add add_prefix_space to processors.ByteLevel (#1878)
* docs: add prefix space to pyi and rust * add prefix space to pyi * update docs for processors.rs * update docs and add tests * update processors.rs * make style * update init.pyi through stubs.py
1 parent 0607860 commit b83d7c9

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

bindings/python/py_src/tokenizers/processors/__init__.pyi

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,13 @@ class ByteLevel(PostProcessor):
9898
Args:
9999
trim_offsets (:obj:`bool`):
100100
Whether to trim the whitespaces from the produced offsets.
101+
102+
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
103+
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
104+
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
105+
is set to :obj:`True`.
101106
"""
102-
def __init__(self, trim_offsets=True):
107+
def __init__(self, trim_offsets=True, add_prefix_space=True):
103108
pass
104109

105110
def num_special_tokens_to_add(self, is_pair):

bindings/python/src/processors.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,12 +484,17 @@ impl PyRobertaProcessing {
484484
/// Args:
485485
/// trim_offsets (:obj:`bool`):
486486
/// Whether to trim the whitespaces from the produced offsets.
487+
///
488+
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
489+
/// If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
490+
/// the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
491+
/// is set to :obj:`True`.
487492
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
488493
pub struct PyByteLevel {}
489494
#[pymethods]
490495
impl PyByteLevel {
491496
#[new]
492-
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
497+
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")]
493498
fn new(
494499
add_prefix_space: Option<bool>,
495500
trim_offsets: Option<bool>,

bindings/python/tests/bindings/test_processors.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
6666
def test_instantiate(self):
6767
assert ByteLevel() is not None
6868
assert ByteLevel(trim_offsets=True) is not None
69+
assert ByteLevel(add_prefix_space=True) is not None
6970
assert isinstance(ByteLevel(), PostProcessor)
7071
assert isinstance(ByteLevel(), ByteLevel)
7172
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
8283
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
8384

8485
# Trims offsets when activated
85-
tokenizer.post_processor = ByteLevel(trim_offsets=True)
86+
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
8687
output = tokenizer.encode("My name is John")
8788
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
8889
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
8990

91+
# Trims offsets without adding prefix space at first token
92+
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
93+
output = tokenizer.encode("My name is John")
94+
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
95+
assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
96+
97+
# add_prefix_space without trimming offsets has no effect
98+
tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
99+
output = tokenizer.encode("My name is John")
100+
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
101+
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
102+
90103
def test_manual_reload(self):
91104
byte_level = ByteLevel()
92105
state = json.loads(byte_level.__getstate__())

0 commit comments

Comments
 (0)