Skip to content

Commit b83d7c9

Browse files
authored
DOCS: add add_prefix_space to processors.ByteLevel (#1878)
* docs: add prefix space to pyi and rust * add prefix space to pyi * update docs for processors.rs * update docs and add tests * update processors.rs * make style * update init.pyi through stubs.py
1 parent 0607860 commit b83d7c9

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

bindings/python/py_src/tokenizers/processors/__init__.pyi

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,13 @@ class ByteLevel(PostProcessor):
9898
Args:
9999
trim_offsets (:obj:`bool`):
100100
Whether to trim the whitespaces from the produced offsets.
101+
102+
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
103+
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
104+
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
105+
is set to :obj:`True`.
101106
"""
102-
def __init__(self, trim_offsets=True):
107+
def __init__(self, trim_offsets=True, add_prefix_space=True):
103108
pass
104109

105110
def num_special_tokens_to_add(self, is_pair):

bindings/python/src/processors.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,12 +484,17 @@ impl PyRobertaProcessing {
484484
/// Args:
485485
/// trim_offsets (:obj:`bool`):
486486
/// Whether to trim the whitespaces from the produced offsets.
487+
///
488+
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
489+
/// If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
490+
/// the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
491+
/// is set to :obj:`True`.
487492
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
488493
pub struct PyByteLevel {}
489494
#[pymethods]
490495
impl PyByteLevel {
491496
#[new]
492-
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
497+
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")]
493498
fn new(
494499
add_prefix_space: Option<bool>,
495500
trim_offsets: Option<bool>,

bindings/python/tests/bindings/test_processors.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
6666
def test_instantiate(self):
6767
assert ByteLevel() is not None
6868
assert ByteLevel(trim_offsets=True) is not None
69+
assert ByteLevel(add_prefix_space=True) is not None
6970
assert isinstance(ByteLevel(), PostProcessor)
7071
assert isinstance(ByteLevel(), ByteLevel)
7172
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
8283
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
8384

8485
# Trims offsets when activated
85-
tokenizer.post_processor = ByteLevel(trim_offsets=True)
86+
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
8687
output = tokenizer.encode("My name is John")
8788
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
8889
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
8990

91+
# Trims offsets without adding prefix space at first token
92+
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
93+
output = tokenizer.encode("My name is John")
94+
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
95+
assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
96+
97+
# add_prefix_space without trimming offsets has no effect
98+
tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
99+
output = tokenizer.encode("My name is John")
100+
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
101+
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
102+
90103
def test_manual_reload(self):
91104
byte_level = ByteLevel()
92105
state = json.loads(byte_level.__getstate__())

0 commit comments

Comments
 (0)