Skip to content
Merged
7 changes: 6 additions & 1 deletion bindings/python/py_src/tokenizers/processors/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,13 @@ class ByteLevel(PostProcessor):
Args:
trim_offsets (:obj:`bool`):
Whether to trim the whitespaces from the produced offsets.

add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
is set to :obj:`True`.
"""
def __init__(self, trim_offsets=True):
def __init__(self, trim_offsets=True, add_prefix_space=True):
pass

def num_special_tokens_to_add(self, is_pair):
Expand Down
7 changes: 6 additions & 1 deletion bindings/python/src/processors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,17 @@ impl PyRobertaProcessing {
/// Args:
/// trim_offsets (:obj:`bool`):
/// Whether to trim the whitespaces from the produced offsets.
///
/// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
/// If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
/// the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
/// is set to :obj:`True`.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
#[new]
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
#[pyo3(signature = (add_prefix_space = None, trim_offsets = None, use_regex = None, **_kwargs), text_signature = "(self, trim_offsets=True, add_prefix_space=True)")]
fn new(
add_prefix_space: Option<bool>,
trim_offsets: Option<bool>,
Expand Down
15 changes: 14 additions & 1 deletion bindings/python/tests/bindings/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class TestByteLevelProcessing:
def test_instantiate(self):
assert ByteLevel() is not None
assert ByteLevel(trim_offsets=True) is not None
assert ByteLevel(add_prefix_space=True) is not None
assert isinstance(ByteLevel(), PostProcessor)
assert isinstance(ByteLevel(), ByteLevel)
assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
Expand All @@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

# Trims offsets when activated
tokenizer.post_processor = ByteLevel(trim_offsets=True)
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
output = tokenizer.encode("My name is John")
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]

# Trims offsets without adding prefix space at first token
tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
output = tokenizer.encode("My name is John")
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]

# add_prefix_space without trimming offsets has no effect
tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
output = tokenizer.encode("My name is John")
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

def test_manual_reload(self):
byte_level = ByteLevel()
state = json.loads(byte_level.__getstate__())
Expand Down
Loading