From 7637cb980dbfa49f1892a1472e150c981a6a52b7 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 21 Oct 2025 19:28:46 +0200
Subject: [PATCH 1/7] docs: add prefix space to pyi and rust

---
 bindings/python/tests/bindings/test_processors.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 3038d8694..c72eb3f6e 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
     def test_instantiate(self):
         assert ByteLevel() is not None
         assert ByteLevel(trim_offsets=True) is not None
+        assert ByteLevel(add_prefix_space=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)

From 9ec05c94c27b18603e65d13670326570488c4ecb Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 21 Oct 2025 19:31:12 +0200
Subject: [PATCH 2/7] add prefix space to pyi

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index 5136d02bb..f04806cd6 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -98,8 +98,11 @@ class ByteLevel(PostProcessor):
     Args:
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
     """
-    def __init__(self, trim_offsets=True):
+    def __init__(self, trim_offsets=True, add_prefix_space=True):
         pass
 
     def num_special_tokens_to_add(self, is_pair):

From c9296a331f6b07de9c01e8a1021da6891096977b Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 10:05:09 +0200
Subject: [PATCH 3/7] update docs for processors.rs

---
 bindings/python/src/processors.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 03fa6bdf7..60d4c8ece 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -484,12 +484,16 @@ impl PyRobertaProcessing {
 /// Args:
 ///     trim_offsets (:obj:`bool`):
 ///         Whether to trim the whitespaces from the produced offsets.
+///
+///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+///         Whether the add_prefix_space option was enabled during pre-tokenization. This
+///         is relevant because it defines the way the offsets are trimmed out.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
+    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_state=True)")]
     fn new(
         add_prefix_space: Option<bool>,
         trim_offsets: Option<bool>,

From 24daf3aba16f7b12edd0024a547a9dfbc8d20d77 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 12:56:01 +0100
Subject: [PATCH 4/7] update docs and add tests

---
 .../py_src/tokenizers/processors/__init__.pyi     |  5 +++--
 bindings/python/src/processors.rs                 |  5 +++--
 bindings/python/tests/bindings/test_processors.py | 14 +++++++++++++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index f04806cd6..6a6eb8f1a 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -99,8 +99,9 @@ class ByteLevel(PostProcessor):
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to add a space to the first word if there isn't already one. This
-            lets us treat `hello` exactly like `say hello`.
+            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+            is set to :obj:`True`.
     """
     def __init__(self, trim_offsets=True, add_prefix_space=True):
         pass

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 60d4c8ece..1f960c6d4 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -486,8 +486,9 @@ impl PyRobertaProcessing {
 ///         Whether to trim the whitespaces from the produced offsets.
 ///
 ///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-///         Whether the add_prefix_space option was enabled during pre-tokenization. This
-///         is relevant because it defines the way the offsets are trimmed out.
+///         If :obj:`True`, keeps the first token's offset as is.  If :obj:`False`, increments
+///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]

diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index c72eb3f6e..50cf770f0 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -83,11 +83,23 @@ def test_processing(self, roberta_files):
         assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
 
         # Trims offsets when activated
-        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
 
+        # Trims offsets without adding prefix space at first token
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
+
+        # add_prefix_space without trimming offsets has no effect
+        tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
     def test_manual_reload(self):
         byte_level = ByteLevel()
         state = json.loads(byte_level.__getstate__())

From b6a7b908e31f54c54e0515e786eaf1112c01ec0d Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 14:06:10 +0100
Subject: [PATCH 5/7] update processors.rs

---
 bindings/python/src/processors.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 1f960c6d4..4a25c910c 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -486,7 +486,7 @@ impl PyRobertaProcessing {
 ///         Whether to trim the whitespaces from the produced offsets.
 ///
 ///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-///         If :obj:`True`, keeps the first token's offset as is.  If :obj:`False`, increments
+///         If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
 ///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
 ///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]

From 0ab19d7390c199096218188cd1f0a5291415851a Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 16:29:01 +0100
Subject: [PATCH 6/7] make style

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index 6a6eb8f1a..ba7df03f9 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -98,12 +98,12 @@ class ByteLevel(PostProcessor):
     Args:
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
+
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
-            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
-            is set to :obj:`True`.
+            Whether the add_prefix_space option was enabled during pre-tokenization. This
+            is relevant because it defines the way the offsets are trimmed out.
     """
-    def __init__(self, trim_offsets=True, add_prefix_space=True):
+    def __init__(self, trim_offsets=True, add_prefix_state=True):
         pass
 
     def num_special_tokens_to_add(self, is_pair):

From 295beb2a2905b500a9b4f98e6387d12f36a02f11 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Fri, 28 Nov 2025 17:19:21 +0100
Subject: [PATCH 7/7] update init.pyi through stubs.py

---
 bindings/python/py_src/tokenizers/processors/__init__.pyi | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index ba7df03f9..930eba87a 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -100,8 +100,9 @@ class ByteLevel(PostProcessor):
             Whether to trim the whitespaces from the produced offsets.
 
         add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether the add_prefix_space option was enabled during pre-tokenization. This
-            is relevant because it defines the way the offsets are trimmed out.
+            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+            is set to :obj:`True`.
     """
     def __init__(self, trim_offsets=True, add_prefix_state=True):
        pass
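
A minimal usage sketch of the offset behaviour documented above; it is not part
of the patch series. It assumes a RoBERTa-style vocab.json / merges.txt pair is
available locally (placeholder paths standing in for the roberta_files fixture
used in the tests), and the expected offsets mirror the assertions added in
PATCH 4/7.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
    from tokenizers.processors import ByteLevel

    # Placeholder vocab/merges paths; any byte-level BPE vocabulary behaves the same way.
    tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # No trimming: offsets keep the leading whitespace of each word.
    tokenizer.post_processor = ByteLevel(trim_offsets=False)
    print(tokenizer.encode("My name is John").offsets)
    # [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trimming with add_prefix_space=True keeps the first token's start at 0.
    tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
    print(tokenizer.encode("My name is John").offsets)
    # [(0, 2), (3, 7), (8, 10), (11, 15)]

    # Trimming with add_prefix_space=False shifts the first token's start by 1.
    tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
    print(tokenizer.encode("My name is John").offsets)
    # [(1, 2), (3, 7), (8, 10), (11, 15)]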