Skip to content

Commit

Permalink
Merge pull request #54 from recski/dev_recski
Browse files: browse the repository at this point in the history
upgrade to stanza 1.3.0, incl. update of fix_ssplit processor
  • Loading branch information
adaamko authored Feb 1, 2022
2 parents 81169a2 + d93ae4c commit c82e338
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 9 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def run(self):
'dict-recursive-update',
'networkx',
'penman',
- 'stanza==1.1.1',
+ 'stanza==1.3.0',
'nltk',
"graphviz"
],
Expand Down
2 changes: 1 addition & 1 deletion tuw_nlp/grammar/text_to_4lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, lang, nlp_cache, cache_dir=None):
nlp = stanza.Pipeline(
'en', package="craft")
assert lang, "TextTo4lang does not have lang set"

self.lang = lang

self.nlp = CachedStanzaPipeline(nlp, nlp_cache)
Expand Down
12 changes: 5 additions & 7 deletions tuw_nlp/text/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from stanza.pipeline.processor import Processor, register_processor

from tuw_nlp.text.patterns.de import ABBREV, MONTH
- from tuw_nlp.text.patterns.misc import CHAR_PATT


@register_processor("fix_ssplit")
Expand Down Expand Up @@ -54,13 +53,12 @@ def process(self, document):
if requires_space is False:
char_offset -= 1

- start_char, end_char = (
- int(c) + char_offset
- for c in CHAR_PATT.match(token.misc).groups())
-
  sens[-1].append({
- doc.ID: (token_id + 1, ), doc.TEXT: token.text,
- doc.MISC: f'start_char={start_char}|end_char={end_char}'})
+ doc.ID: (token_id + 1, ),
+ doc.TEXT: token.text,
+ doc.MISC: token.misc,
+ doc.START_CHAR: token.start_char + char_offset,
+ doc.END_CHAR: token.end_char + char_offset})

token_id += 1

Expand Down

0 comments on commit c82e338

Please sign in to comment.