Skip to content

Commit

Permalink
Show warning if entities don't map to tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Mar 19, 2020
1 parent a305ecb commit 771b4e6
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
2 changes: 1 addition & 1 deletion spacy_stanza/about.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Package metadata for spacy-stanza, exposed as module-level dunder constants.
__title__ = "spacy-stanza"
# NOTE(review): __version__ is assigned twice in a row, so only the second
# value ("0.2.1") is in effect once this block has run. The first line looks
# like the pre-change side of the version bump - confirm and drop it.
__version__ = "0.2.0"
__version__ = "0.2.1"
__summary__ = "Use the latest Stanza (StanfordNLP) research models directly in spaCy"
__uri__ = "https://explosion.ai"
__author__ = "Ines Montani"
Expand Down
15 changes: 12 additions & 3 deletions spacy_stanza/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy
import re
import warnings


class StanzaLanguage(Language):
Expand Down Expand Up @@ -171,9 +172,17 @@ def __call__(self, text):
ents = []
for ent in snlp_doc.entities:
ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
if ent_span:
ents.append(ent_span)
doc.ents = ents
ents.append(ent_span)
if not all(ents):
warnings.warn(
f"Can't set named entities because the character offsets don't "
f"map to valid tokens produced by the Stanza tokenizer:\n"
f"Words: {words}\n"
f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
stacklevel=4,
)
else:
doc.ents = ents
# Overwrite lemmas separately to prevent them from being overwritten by spaCy
lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
doc.from_array([LEMMA], lemma_array)
Expand Down
19 changes: 12 additions & 7 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,14 @@
import pytest


@pytest.fixture
def lang():
    """Return the language code "en" for tests that take a ``lang`` argument."""
    code = "en"
    return code


def tags_equal(act, exp):
    """Check that every actual tag matches its expected tag(s), position by position.

    Args:
        act: Iterable of actual tags.
        exp: Iterable of expectations, one per actual tag. Each entry is
            either a single string (the tag must equal it) or a collection of
            acceptable values (the tag must be contained in it).

    Returns:
        True if both inputs have the same length and every actual tag
        satisfies its expectation, otherwise False.
    """
    # Materialise first so one-shot iterables can be measured and then walked.
    act, exp = list(act), list(exp)
    # zip() stops at the shorter input, so without this guard surplus actual
    # tags (or surplus expectations) would go unchecked and still "pass".
    if len(act) != len(exp):
        return False
    return all(a == e if isinstance(e, str) else a in e for a, e in zip(act, exp))


def test_spacy_stanza(lang):
stanza.download(lang)
def test_spacy_stanza_english():
lang = "en"
stanza.download()
snlp = stanza.Pipeline(lang=lang)
nlp = StanzaLanguage(snlp)
assert nlp.lang == "stanza_" + lang
Expand Down Expand Up @@ -61,6 +57,15 @@ def test_spacy_stanza(lang):
assert doc.ents[1].label_ == "GPE"


def test_spacy_stanza_german():
    """Check that processing a German sentence emits the entity-mapping warning.

    Downloads the German Stanza models, wraps the pipeline in
    ``StanzaLanguage`` and expects a ``UserWarning`` while the text is being
    processed.
    """
    lang = "de"
    stanza.download(lang)
    snlp = stanza.Pipeline(lang=lang)
    nlp = StanzaLanguage(snlp)
    # Match on the message prefix so an unrelated UserWarning raised during
    # processing cannot satisfy the check; the resulting Doc is not needed.
    with pytest.warns(UserWarning, match="Can't set named entities"):
        nlp("Auf dem Friedhof an der Straße Am Rosengarten")


def test_get_defaults():
    """Check which defaults class ``get_defaults`` returns for two language codes.

    "en" is expected to give ``EnglishDefaults``; the nonsense code
    "xvkfokdfo" is expected to give ``BaseDefaults``.
    """
    expected_by_code = (
        ("en", EnglishDefaults),
        ("xvkfokdfo", BaseDefaults),
    )
    for code, expected in expected_by_code:
        assert get_defaults(code) == expected

0 comments on commit 771b4e6

Please sign in to comment.