Skip to content

Commit

Permalink
Fix trailing whitespace token handling (#64)
Browse files (browse the repository at this point in the history)
  • Loading branch information
adrianeboyd authored Mar 2, 2021
1 parent ab388a8 commit a87c723
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
4 changes: 3 additions & 1 deletion spacy_stanza/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ def __call__(self, text):
)
offset = 0
for i, word in enumerate(words):
- if word.isspace() and word != snlp_tokens[i + offset].text:
+ if word.isspace() and (
+ i + offset >= len(snlp_tokens) or word != snlp_tokens[i + offset].text
+ ):
# insert a space token
pos.append("SPACE")
tags.append("_SP")
Expand Down
15 changes: 13 additions & 2 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_spacy_stanza_english():
assert doc.ents[1].label_ == "GPE"

# Test whitespace alignment
- doc = nlp(" Barack Obama was born\n\nin Hawaii.")
+ doc = nlp(" Barack Obama was born\n\nin Hawaii.\n")
assert [t.pos_ for t in doc] == [
"SPACE",
"PROPN",
Expand All @@ -69,6 +69,7 @@ def test_spacy_stanza_english():
"ADP",
"PROPN",
"PUNCT",
"SPACE",
]
assert [t.dep_ for t in doc] == [
"",
Expand All @@ -83,14 +84,24 @@ def test_spacy_stanza_english():
"case",
"root",
"punct",
"",
]
- assert [t.head.i for t in doc] == [0, 7, 2, 1, 4, 7, 6, 7, 8, 10, 10, 10]
+ assert [t.head.i for t in doc] == [0, 7, 2, 1, 4, 7, 6, 7, 8, 10, 10, 10, 12]
assert len(doc.ents) == 2
assert doc.ents[0].text == "Barack Obama"
assert doc.ents[0].label_ == "PERSON"
assert doc.ents[1].text == "Hawaii"
assert doc.ents[1].label_ == "GPE"

# Test trailing whitespace handling
doc = nlp("a ")
doc = nlp("a ")
doc = nlp("a \n")
doc = nlp("\n ")
doc = nlp("\t ")
doc = nlp("a\n ")
doc = nlp("a \t ")

# Test serialization
reloaded_nlp = spacy_stanza.load_pipeline(lang).from_bytes(nlp.to_bytes())
assert reloaded_nlp.config.to_str() == nlp.config.to_str()
Expand Down

0 comments on commit a87c723

Please sign in to comment.