-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Jason Liartis
committed
Dec 13, 2024
1 parent
2ef6888
commit e2936a7
Showing
8 changed files
with
397 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import torch | ||
import os | ||
|
||
|
||
# Patch the lookup dictionary stored in Stanza's Dutch lemmatizer checkpoint
# (nl/lemma/alpino_charlm.pt) with hand-picked entries and write the result
# to alpino_charlm_customized.pt in the same directory. The original
# checkpoint file is left untouched.
STANZA_RESOURCES_DIR = os.getenv('STANZA_RESOURCES_DIR')
if STANZA_RESOURCES_DIR is None:
    # Without this guard os.path.join(None, ...) fails with an opaque
    # "expected str, bytes or os.PathLike object, not NoneType" TypeError.
    raise RuntimeError(
        'The STANZA_RESOURCES_DIR environment variable is not set; it must '
        'point to the directory that contains nl/lemma/alpino_charlm.pt'
    )

# NOTE(review): torch.load unpickles the checkpoint, so only run this against
# a checkpoint obtained from a trusted source. Recent PyTorch releases also
# changed the default of `weights_only` -- confirm the checkpoint still loads
# with the installed version.
model = torch.load(
    os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm.pt'),
    map_location='cpu',
)
# model['dicts'] holds two dictionaries; only the first one is modified here.
word_dict, composite_dict = model['dicts']

# Entries to add to (or override in) word_dict, presumably mapping a surface
# form to the lemma the lemmatizer should return for it. Order is kept
# identical to the original sequence of assignments.
LEMMA_OVERRIDES = {
    'woudje': 'woudje',
    'diaspora': 'diaspora',
    'aidspatiënt': 'aidspatiënt',
    'aidspatiënten': 'aidspatiënt',
    'bosneger': 'bosneger',
    'cultuurchristen': 'cultuurchristen',
    'empowerment': 'empowerment',
    'jappenkamp': 'jappenkamp',
    'gekleurd': 'gekleurd',
    'kroezelkop': 'kroezelkop',
    'laaggeschoold': 'laaggeschoold',
    'leefgebied': 'leefgebied',
    'melaatse': 'melaatse',
    'muzulvrouw': 'muzulvrouw',
    'omaatje': 'omaatje',
    'oudje': 'oudje',
    'paaps': 'paaps',
    'santenboetiek': 'santenboetiek',
    'sloppenbewoner': 'sloppenbewoner',
    'superstitie': 'superstitie',
    'toverbeeld': 'toverbeeld',
    'tropen': 'tropen',
    'westers': 'westers',
    'papistisch': 'papist',
    'spleetogen': 'spleetoog',
    'tzigane': 'tzigaan',
    'zwartje': 'zwartje',
    'évolués': 'evolué',
    'évolué': 'evolué',
    'gehoorgestoord': 'gehoorgestoord',
    'gehoorgestoorde': 'gehoorgestoord',
    # NOTE(review): the two sibling forms above map to 'gehoorgestoord';
    # confirm that keeping the plural form as its own lemma is intended.
    'gehoorgestoorden': 'gehoorgestoorden',
    'inlandse': 'inlander',
    # NOTE(review): 'karloot' looks like a possible typo -- verify the
    # intended lemma for 'kalotten'.
    'kalotten': 'karloot',
    'magische': 'magie',
}
for surface_form, lemma in LEMMA_OVERRIDES.items():
    word_dict[surface_form] = lemma

# word_dict was mutated in place, so saving `model` persists the new entries.
torch.save(model, os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm_customized.pt'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import itertools | ||
import os | ||
|
||
import stanza | ||
from stanza.pipeline.processor import register_processor, Processor | ||
from stanza.models.common.doc import Word | ||
|
||
from src.utils import comp_split_dutch | ||
|
||
# Directory that contains the nl-NL.dic dictionary used by the splitter.
# Read once at import time; may be None when the variable is not set.
VOCABULARIES_PATH = os.getenv('VOCABULARIES_PATH')


@register_processor('dutch_compound_noun_splitter')
class GermanCompNounSplitterProcessor(Processor):
    """Stanza processor that splits compound nouns into their parts.

    Registered under the name ``dutch_compound_noun_splitter`` and backed by
    the ``nl-NL.dic`` dictionary (the class name says "German", but the
    registered name and the dictionary are Dutch).

    Every word tagged ``NOUN`` or ``PROPN`` is passed to
    ``comp_split_dutch.dissect``. When that yields more than one part, the
    word is replaced inside its token by one new ``Word`` per part; word ids
    in the sentence are renumbered accordingly.

    NOTE(review): ``__init__`` does not call ``Processor.__init__``, so any
    state the base class normally sets up is absent -- confirm this is
    intentional.
    """

    # Annotation names consumed / produced by this processor. 'delayedlemma'
    # is presumably another custom processor registered elsewhere -- verify.
    _requires = {'tokenize', 'pos', 'delayedlemma'}
    _provides = {'splitter'}

    def __init__(self, device, config, pipeline):
        """Load the Dutch dictionary from ``VOCABULARIES_PATH``.

        ``device`` and ``config`` are accepted for interface compatibility but
        are not used: the dictionary path is derived from ``VOCABULARIES_PATH``
        and the device passed on is always ``'cpu'``.

        Raises:
            RuntimeError: if the VOCABULARIES_PATH environment variable was
                not set when this module was imported.
        """
        if VOCABULARIES_PATH is None:
            # Without this guard os.path.join(None, ...) raises an opaque
            # TypeError that does not mention the missing variable.
            raise RuntimeError(
                'The VOCABULARIES_PATH environment variable is not set; it '
                'must point to the directory that contains nl-NL.dic'
            )
        model_path = os.path.join(VOCABULARIES_PATH, './nl-NL.dic')
        self._set_up_model({'model_path': model_path}, pipeline, 'cpu')

    def _set_up_model(self, config, pipeline, device):
        """Read the dictionary at ``config['model_path']`` into ``self._ahocs``.

        ``pipeline`` and ``device`` are unused.
        """
        input_file = config['model_path']
        # TODO: save the ahocs in preprocessing and just load it here
        self._ahocs = comp_split_dutch.read_dictionary_from_file(input_file)

    def _dissect(self, text):
        """Return the compound parts of ``text``; fall back to ``[text]`` on error."""
        try:
            return comp_split_dutch.dissect(text, self._ahocs, make_singular=True)
        except Exception as e:
            # Best effort: a word that cannot be dissected is kept whole.
            print(e)
            return [text]

    def process(self, doc):
        """Split compound nouns in ``doc`` in place and return ``doc``.

        For every sentence the words are renumbered from 1. Words that are not
        ``NOUN``/``PROPN`` are kept unchanged apart from their id. Nouns that
        are not split keep their object but get lower-cased text. Nouns that
        are split are replaced by new ``Word`` objects (lower-cased text and
        lemma; upos/xpos/feats copied from the original word; character
        offsets taken from the enclosing token).
        """
        for sent in doc.sentences:
            word_id = 1
            for token in sent.tokens:
                new_word_list = []
                for word in token.words:
                    if word.upos not in ('NOUN', 'PROPN'):
                        word.id = word_id
                        word_id += 1
                        new_word_list.append(word)
                        continue

                    dissection = self._dissect(word.text)
                    if len(dissection) <= 1:
                        # Not a compound: keep the word, normalise its case.
                        word.text = word.text.lower()
                        word.id = word_id
                        word_id += 1
                        new_word_list.append(word)
                        continue

                    for part in dissection:
                        new_word = Word(sent, {
                            'id': word_id,
                            'text': part.lower(),
                            'lemma': part.lower(),
                            'upos': word.upos,
                            'xpos': word.xpos,
                            'feats': word.feats,
                            'start_char': token.start_char,
                            'end_char': token.end_char,
                        })
                        # Link the part back to its token; a freshly built
                        # Word otherwise has no parent.
                        new_word.parent = token
                        word_id += 1
                        new_word_list.append(new_word)
                token.words = new_word_list
                # A token's id is the tuple of the ids of the words it spans.
                token.id = tuple(word.id for word in new_word_list)
            sent.words = list(itertools.chain.from_iterable(token.words for token in sent.tokens))
        # Refresh the document-level counters after changing the word lists.
        doc._count_words()
        return doc
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.