
Commit 0c25725

Update Tokenizer.explain for special cases with whitespace (#13086)
* Update Tokenizer.explain for special cases with whitespace

  Update `Tokenizer.explain` to skip special case matches if the exact text has not been matched due to intervening whitespace. Enable fuzzy `Tokenizer.explain` tests with additional whitespace normalization.

* Add unit test for special cases with whitespace, xfail fuzzy tests again
1 parent ff9ddb6 commit 0c25725
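
For orientation, here is a minimal sketch of the scenario the commit targets, adapted from the new unit test further down. Using English().vocab in place of the test's en_vocab fixture is my own substitution; the rest follows the test.

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# Special case whose rule text (":]") contains no space, applied to input
# text that does contain one (": ]").
rules = {":]": [{"ORTH": ":]"}]}
tokenizer = Tokenizer(English().vocab, rules=rules)

text = ": ]"
tokens = [t.text for t in tokenizer(text)]                # actual tokenization
explain_tokens = [t[1] for t in tokenizer.explain(text)]  # explain() breakdown

# The new test asserts these match; previously explain() could report the
# ":]" special case even though the tokenizer never merges across the space.
assert tokens == explain_tokens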

2 files changed (+26, -4 lines)

spacy/tests/tokenizer/test_explain.py

Lines changed: 16 additions & 1 deletion
@@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     assert tokens == explain_tokens
 
 
+def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
+    rules = {":]": [{"ORTH": ":]"}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+    )
+    text = ": ]"
+    tokens = [t.text for t in tokenizer(text)]
+    explain_tokens = [t[1] for t in tokenizer.explain(text)]
+    assert tokens == explain_tokens
+
+
 @hypothesis.strategies.composite
 def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
     """

@@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
     """
 
     tokenizer: Tokenizer = spacy.blank(lang).tokenizer
-    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    # Tokenizer.explain is not intended to handle whitespace or control
+    # characters in the same way as Tokenizer
+    sentence = re.sub(r"\s+", " ", sentence).strip()
+    tokens = [t.text for t in tokenizer(sentence)]
     debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
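
The fuzzy-test change above normalizes hypothesis-generated input before tokenizing. A quick illustration of what that regex does (the sample string is invented):

import re

# Invented sample with a tab, a newline and runs of spaces, standing in for a
# hypothesis-generated sentence.
sentence = " foo\tbar \n baz  "

# Same normalization the updated fuzzy test applies: collapse whitespace runs
# to single spaces and trim the ends.
normalized = re.sub(r"\s+", " ", sentence).strip()
print(repr(normalized))  # 'foo bar baz'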

spacy/tokenizer.pyx

Lines changed: 10 additions & 3 deletions
@@ -730,9 +730,16 @@ cdef class Tokenizer:
             if i in spans_by_start:
                 span = spans_by_start[i]
                 exc = [d[ORTH] for d in special_cases[span.label_]]
-                for j, orth in enumerate(exc):
-                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
-                i += len(span)
+                # The phrase matcher can overmatch for tokens separated by
+                # spaces in the text but not in the underlying rule, so skip
+                # cases where the texts aren't identical
+                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
+                    final_tokens.append(tokens[i])
+                    i += 1
+                else:
+                    for j, orth in enumerate(exc):
+                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                    i += len(span)
             else:
                 final_tokens.append(tokens[i])
                 i += 1
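
Outside the Cython class, the guard added above boils down to a text-equality check. A toy restatement (the function name and sample data are mine, not part of the diff):

# Toy restatement of the guard: only report SPECIAL-n entries when the matched
# span's text equals the concatenation of the rule's ORTH strings; otherwise
# keep the tokens that were already produced.
def explain_special(span_text, rule_orths, fallback_tokens):
    if span_text != "".join(rule_orths):
        # Overmatch, e.g. ": ]" matched against the rule ":]": skip the
        # special case and keep the ordinary tokens.
        return fallback_tokens
    return [(f"SPECIAL-{j + 1}", orth) for j, orth in enumerate(rule_orths)]

print(explain_special(":]", [":]"], [("TOKEN", ":]")]))
# -> [('SPECIAL-1', ':]')]
print(explain_special(": ]", [":]"], [("TOKEN", ":"), ("TOKEN", "]")]))
# -> [('TOKEN', ':'), ('TOKEN', ']')]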

0 commit comments
