diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi
index 5136d02bb..930eba87a 100644
--- a/bindings/python/py_src/tokenizers/processors/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -98,8 +98,13 @@ class ByteLevel(PostProcessor):
     Args:
         trim_offsets (:obj:`bool`):
             Whether to trim the whitespaces from the produced offsets.
+
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+            the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+            is set to :obj:`True`.
     """
-    def __init__(self, trim_offsets=True):
+    def __init__(self, trim_offsets=True, add_prefix_space=True):
         pass
 
     def num_special_tokens_to_add(self, is_pair):
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 03fa6bdf7..4a25c910c 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -484,12 +484,17 @@ impl PyRobertaProcessing {
 /// Args:
 ///     trim_offsets (:obj:`bool`):
 ///         Whether to trim the whitespaces from the produced offsets.
+///
+///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+///         If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
+    #[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")]
     fn new(
         add_prefix_space: Option<bool>,
         trim_offsets: Option<bool>,
diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py
index 3038d8694..50cf770f0 100644
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
     def test_instantiate(self):
         assert ByteLevel() is not None
         assert ByteLevel(trim_offsets=True) is not None
+        assert ByteLevel(add_prefix_space=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
         assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
 
         # Trims offsets when activated
-        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
 
+        # Trims offsets without adding prefix space at first token
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
+
+        # add_prefix_space without trimming offsets has no effect
+        tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
     def test_manual_reload(self):
         byte_level = ByteLevel()
         state = json.loads(byte_level.__getstate__())
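For reviewers who want to try the new keyword locally, here is a minimal usage sketch mirroring the updated test. It assumes a build of the bindings from this branch; the `vocab.json`/`merges.txt` paths are placeholders standing in for the RoBERTa files the `roberta_files` fixture provides.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
from tokenizers.processors import ByteLevel

# Placeholder paths: point these at the RoBERTa vocab/merges used by the tests.
tokenizer = Tokenizer(BPE.from_file("vocab.json", "merges.txt"))
tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

# Default behaviour: trim whitespace from the offsets and keep the
# first token's offset as is.
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
print(tokenizer.encode("My name is John").offsets)
# [(0, 2), (3, 7), (8, 10), (11, 15)]

# New in this PR: with add_prefix_space=False, the first token's start
# offset is incremented by 1 (only meaningful with trim_offsets=True).
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
print(tokenizer.encode("My name is John").offsets)
# [(1, 2), (3, 7), (8, 10), (11, 15)]
```

As the last test case documents, with `trim_offsets=False` the processor leaves offsets untouched, so `add_prefix_space` is deliberately a no-op there.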