
Commit

implemented dutch splitter
Jason Liartis committed Dec 13, 2024
1 parent 2ef6888 commit e2936a7
Showing 8 changed files with 397 additions and 8 deletions.
44 changes: 44 additions & 0 deletions model_preprocessing/dutch_model.py
@@ -0,0 +1,44 @@
import torch
import os


STANZA_RESOURCES_DIR = os.getenv('STANZA_RESOURCES_DIR')

model = torch.load(os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm.pt'), map_location='cpu')
word_dict, composite_dict = model['dicts']

word_dict['woudje'] = 'woudje'
word_dict['diaspora'] = 'diaspora'
word_dict['aidspatiënt'] = word_dict['aidspatiënten'] = 'aidspatiënt'
word_dict['bosneger'] = 'bosneger'
word_dict['cultuurchristen'] = 'cultuurchristen'
word_dict['empowerment'] = 'empowerment'
word_dict['jappenkamp'] = 'jappenkamp'
word_dict['gekleurd'] = 'gekleurd'
word_dict['kroezelkop'] = 'kroezelkop'
word_dict['laaggeschoold'] = 'laaggeschoold'
word_dict['leefgebied'] = 'leefgebied'
word_dict['melaatse'] = 'melaatse'
word_dict['muzulvrouw'] = 'muzulvrouw'
word_dict['omaatje'] = 'omaatje'
word_dict['oudje'] = 'oudje'
word_dict['paaps'] = 'paaps'
word_dict['santenboetiek'] = 'santenboetiek'
word_dict['sloppenbewoner'] = 'sloppenbewoner'
word_dict['superstitie'] = 'superstitie'
word_dict['toverbeeld'] = 'toverbeeld'
word_dict['tropen'] = 'tropen'
word_dict['westers'] = 'westers'
word_dict['papistisch'] = 'papist'
word_dict['spleetogen'] = 'spleetoog'
word_dict['tzigane'] = 'tzigaan'
word_dict['zwartje'] = 'zwartje'
word_dict['évolués'] = word_dict['évolué'] = 'evolué'
word_dict['gehoorgestoord'] = 'gehoorgestoord'
word_dict['gehoorgestoorde'] = 'gehoorgestoord'
word_dict['gehoorgestoorden'] = 'gehoorgestoorden'
word_dict['inlandse'] = 'inlander'
word_dict['kalotten'] = 'karloot'
word_dict['magische'] = 'magie'

torch.save(model, os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm_customized.pt'))
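
For reference, a minimal sketch of how the customized lemma model saved above might be loaded into a Stanza pipeline. This assumes Stanza's per-processor lemma_model_path override and that STANZA_RESOURCES_DIR points at the same resources directory used by the preprocessing script; the example sentence is illustrative only, not part of this commit.

import os
import stanza

STANZA_RESOURCES_DIR = os.getenv('STANZA_RESOURCES_DIR')

# Point the Dutch lemmatizer at the customized model written by dutch_model.py.
nlp = stanza.Pipeline(
    lang='nl',
    processors='tokenize,pos,lemma',
    lemma_model_path=os.path.join(
        STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm_customized.pt'),
)

doc = nlp('De aidspatiënten kregen empowerment.')
print([(word.text, word.lemma) for word in doc.sentences[0].words])
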
2 changes: 1 addition & 1 deletion src/api_modules/main_module.py
@@ -10,7 +10,7 @@
from src.utils.settings import STANZA_MODELS_KWARGS, STARTUP_LANGUAGES, PROCESSED_TERMS_FILEPATHS
# need to import in order to log these as stanza processors
# ORDER MATTERS for some godforsaken reason
-from src.custom_processors import standardize, delayed_lemmatizer, german_compound_noun_splitter
+from src.custom_processors import standardize, dutch_compound_noun_splitter, delayed_lemmatizer, german_compound_noun_splitter


in_memory_models = OrderedDict({
73 changes: 73 additions & 0 deletions src/custom_processors/dutch_compound_noun_splitter.py
@@ -0,0 +1,73 @@
import itertools
import os

import stanza
from stanza.pipeline.processor import register_processor, Processor
from stanza.models.common.doc import Word

from src.utils import comp_split_dutch

VOCABULARIES_PATH = os.getenv('VOCABULARIES_PATH')


@register_processor('dutch_compound_noun_splitter')
class DutchCompNounSplitterProcessor(Processor):
_requires = {'tokenize', 'pos', 'delayedlemma'}
_provides = {'splitter'}

def __init__(self, device, config, pipeline):
model_path = os.path.join(VOCABULARIES_PATH, './nl-NL.dic')
self._set_up_model({'model_path': model_path}, pipeline, 'cpu')

def _set_up_model(self, config, pipeline, device):
input_file = config['model_path']
# TODO: save the ahocs in preprocessing and just load it here
self._ahocs = comp_split_dutch.read_dictionary_from_file(input_file)

def process(self, doc):
for sent in doc.sentences:
word_id = 1
for token in sent.tokens:
new_word_list = []
for word in token.words:
if word.upos not in ['NOUN', 'PROPN']:
word.id = word_id
word_id += 1
new_word_list.append(word)
else:
try:
dissection = comp_split_dutch.dissect(
word.text, self._ahocs, make_singular=True)
except Exception as e:
print(e)
dissection = [word.text]
# print(dissection)
if len(dissection) <= 1:
word.text = word.text.lower()
word.id = word_id
word_id += 1
new_word_list.append(word)
else:
upos = word.upos
xpos = word.xpos
feats = word.feats
for part in dissection:
new_word_dict = {
'id': word_id,
'text': part.lower(),
'lemma': part.lower(),
'upos': upos,
'xpos': xpos,
'feats': feats,
'start_char': token.start_char,
'end_char': token.end_char
}
word_id += 1
new_word = Word(sent, new_word_dict)
new_word_list.append(new_word)
token.words = new_word_list
token.id = tuple(word.id for word in new_word_list)
sent.words = list(itertools.chain.from_iterable(token.words for token in sent.tokens))
doc._count_words()
return doc
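
A hedged sketch of how the new dutch_compound_noun_splitter processor could be attached to a pipeline. Registration happens at import time via @register_processor, so the custom-processor modules must be imported first; the VOCABULARIES_PATH value, the processor order, and the example sentence are assumptions for illustration, not part of this commit.

import os
import stanza

# VOCABULARIES_PATH is read at import time by the splitter module, so set it first.
os.environ.setdefault('VOCABULARIES_PATH', '/path/to/vocabularies')  # assumed location of nl-NL.dic

# Importing these modules registers the custom processors with Stanza.
from src.custom_processors import standardize, delayed_lemmatizer, dutch_compound_noun_splitter  # noqa: E402,F401

nlp = stanza.Pipeline(
    lang='nl',
    # Processor names come from this commit; the exact order is an assumption.
    processors='tokenize,pos,delayedlemma,dutch_compound_noun_splitter',
)

doc = nlp('De sloppenbewoners verlieten hun leefgebied.')
for word in doc.sentences[0].words:
    print(word.text, word.lemma, word.upos)
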

6 changes: 3 additions & 3 deletions src/custom_processors/german_compound_noun_splitter.py
@@ -5,7 +5,7 @@
from stanza.pipeline.processor import register_processor, Processor
from stanza.models.common.doc import Word

-from src.utils import comp_split
+from src.utils import comp_split_german

VOCABULARIES_PATH = os.getenv('VOCABULARIES_PATH')

@@ -22,7 +22,7 @@ def __init__(self, device, config, pipeline):
def _set_up_model(self, config, pipeline, device):
input_file = config['model_path']
# TODO: save the ahocs in preprocessing and just load it here
-self._ahocs = comp_split.read_dictionary_from_file(input_file)
+self._ahocs = comp_split_german.read_dictionary_from_file(input_file)

def process(self, doc):
for sent in doc.sentences:
@@ -36,7 +36,7 @@ def process(self, doc):
new_word_list.append(word)
else:
try:
-dissection = comp_split.dissect(
+dissection = comp_split_german.dissect(
word.text, self._ahocs, make_singular=True)
except Exception as e:
print(e)
