
Commit

implemented dutch splitter
Jason Liartis committed Dec 13, 2024
1 parent 2ef6888 commit e2936a7
Showing 8 changed files with 397 additions and 8 deletions.
44 changes: 44 additions & 0 deletions model_preprocessing/dutch_model.py
@@ -0,0 +1,44 @@
import torch
import os


STANZA_RESOURCES_DIR = os.getenv('STANZA_RESOURCES_DIR')

model = torch.load(os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm.pt'), map_location='cpu')
word_dict, composite_dict = model['dicts']

word_dict['woudje'] = 'woudje'
word_dict['diaspora'] = 'diaspora'
word_dict['aidspatiënt'] = word_dict['aidspatiënten'] = 'aidspatiënt'
word_dict['bosneger'] = 'bosneger'
word_dict['cultuurchristen'] = 'cultuurchristen'
word_dict['empowerment'] = 'empowerment'
word_dict['jappenkamp'] = 'jappenkamp'
word_dict['gekleurd'] = 'gekleurd'
word_dict['kroezelkop'] = 'kroezelkop'
word_dict['laaggeschoold'] = 'laaggeschoold'
word_dict['leefgebied'] = 'leefgebied'
word_dict['melaatse'] = 'melaatse'
word_dict['muzulvrouw'] = 'muzulvrouw'
word_dict['omaatje'] = 'omaatje'
word_dict['oudje'] = 'oudje'
word_dict['paaps'] = 'paaps'
word_dict['santenboetiek'] = 'santenboetiek'
word_dict['sloppenbewoner'] = 'sloppenbewoner'
word_dict['superstitie'] = 'superstitie'
word_dict['toverbeeld'] = 'toverbeeld'
word_dict['tropen'] = 'tropen'
word_dict['westers'] = 'westers'
word_dict['papistisch'] = 'papist'
word_dict['spleetogen'] = 'spleetoog'
word_dict['tzigane'] = 'tzigaan'
word_dict['zwartje'] = 'zwartje'
word_dict['évolués'] = word_dict['évolué'] = 'evolué'
word_dict['gehoorgestoord'] = 'gehoorgestoord'
word_dict['gehoorgestoorde'] = 'gehoorgestoord'
word_dict['gehoorgestoorden'] = 'gehoorgestoorden'
word_dict['inlandse'] = 'inlander'
word_dict['kalotten'] = 'karloot'
word_dict['magische'] = 'magie'

torch.save(model, os.path.join(STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm_customized.pt'))
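
For reference, a minimal sketch of how the customized lemma model saved above might be loaded into a Stanza pipeline. This assumes Stanza's per-processor lemma_model_path override and that STANZA_RESOURCES_DIR points at the same resources directory used by the preprocessing script; the example sentence is illustrative only, not part of this commit.

import os
import stanza

STANZA_RESOURCES_DIR = os.getenv('STANZA_RESOURCES_DIR')

# Point the Dutch lemmatizer at the customized model written by dutch_model.py.
nlp = stanza.Pipeline(
    lang='nl',
    processors='tokenize,pos,lemma',
    lemma_model_path=os.path.join(
        STANZA_RESOURCES_DIR, 'nl/lemma/alpino_charlm_customized.pt'),
)

doc = nlp('De aidspatiënten kregen empowerment.')
print([(word.text, word.lemma) for word in doc.sentences[0].words])
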
2 changes: 1 addition & 1 deletion src/api_modules/main_module.py
@@ -10,7 +10,7 @@
from src.utils.settings import STANZA_MODELS_KWARGS, STARTUP_LANGUAGES, PROCESSED_TERMS_FILEPATHS
# need to import in order to log these as stanza processors
# ORDER MATTERS for some godforsaken reason
-from src.custom_processors import standardize, delayed_lemmatizer, german_compound_noun_splitter
+from src.custom_processors import standardize, dutch_compound_noun_splitter, delayed_lemmatizer, german_compound_noun_splitter


in_memory_models = OrderedDict({
73 changes: 73 additions & 0 deletions src/custom_processors/dutch_compound_noun_splitter.py
@@ -0,0 +1,73 @@
import itertools
import os

import stanza
from stanza.pipeline.processor import register_processor, Processor
from stanza.models.common.doc import Word

from src.utils import comp_split_dutch

VOCABULARIES_PATH = os.getenv('VOCABULARIES_PATH')


@register_processor('dutch_compound_noun_splitter')
class DutchCompNounSplitterProcessor(Processor):
_requires = {'tokenize', 'pos', 'delayedlemma'}
_provides = {'splitter'}

def __init__(self, device, config, pipeline):
model_path = os.path.join(VOCABULARIES_PATH, './nl-NL.dic')
self._set_up_model({'model_path': model_path}, pipeline, 'cpu')

def _set_up_model(self, config, pipeline, device):
input_file = config['model_path']
# TODO: save the ahocs in preprocessing and just load it here
self._ahocs = comp_split_dutch.read_dictionary_from_file(input_file)

def process(self, doc):
for sent in doc.sentences:
word_id = 1
for token in sent.tokens:
new_word_list = []
for word in token.words:
if word.upos not in ['NOUN', 'PROPN']:
word.id = word_id
word_id += 1
new_word_list.append(word)
else:
try:
dissection = comp_split_dutch.dissect(
word.text, self._ahocs, make_singular=True)
except Exception as e:
print(e)
dissection = [word.text]
# print(dissection)
if len(dissection) <= 1:
word.text = word.text.lower()
word.id = word_id
word_id += 1
new_word_list.append(word)
else:
upos = word.upos
xpos = word.xpos
feats = word.feats
for part in dissection:
new_word_dict = {
'id': word_id,
'text': part.lower(),
'lemma': part.lower(),
'upos': upos,
'xpos': xpos,
'feats': feats,
'start_char': token.start_char,
'end_char': token.end_char
}
word_id += 1
new_word = Word(sent, new_word_dict)
new_word_list.append(new_word)
token.words = new_word_list
token.id = tuple(word.id for word in new_word_list)
sent.words = list(itertools.chain.from_iterable(token.words for token in sent.tokens))
doc._count_words()
return doc
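
A hedged sketch of how the new dutch_compound_noun_splitter processor could be attached to a pipeline. Registration happens at import time via @register_processor, so the custom-processor modules must be imported first; the VOCABULARIES_PATH value, the processor order, and the example sentence are assumptions for illustration, not part of this commit.

import os
import stanza

# VOCABULARIES_PATH is read at import time by the splitter module, so set it first.
os.environ.setdefault('VOCABULARIES_PATH', '/path/to/vocabularies')  # assumed location of nl-NL.dic

# Importing these modules registers the custom processors with Stanza.
from src.custom_processors import standardize, delayed_lemmatizer, dutch_compound_noun_splitter  # noqa: E402,F401

nlp = stanza.Pipeline(
    lang='nl',
    # Processor names come from this commit; the exact order is an assumption.
    processors='tokenize,pos,delayedlemma,dutch_compound_noun_splitter',
)

doc = nlp('De sloppenbewoners verlieten hun leefgebied.')
for word in doc.sentences[0].words:
    print(word.text, word.lemma, word.upos)
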

6 changes: 3 additions & 3 deletions src/custom_processors/german_compound_noun_splitter.py
@@ -5,7 +5,7 @@
from stanza.pipeline.processor import register_processor, Processor
from stanza.models.common.doc import Word

-from src.utils import comp_split
+from src.utils import comp_split_german

VOCABULARIES_PATH = os.getenv('VOCABULARIES_PATH')

@@ -22,7 +22,7 @@ def __init__(self, device, config, pipeline):
def _set_up_model(self, config, pipeline, device):
input_file = config['model_path']
# TODO: save the ahocs in preprocessing and just load it here
-self._ahocs = comp_split.read_dictionary_from_file(input_file)
+self._ahocs = comp_split_german.read_dictionary_from_file(input_file)

def process(self, doc):
for sent in doc.sentences:
@@ -36,7 +36,7 @@ def process(self, doc):
new_word_list.append(word)
else:
try:
-dissection = comp_split.dissect(
+dissection = comp_split_german.dissect(
word.text, self._ahocs, make_singular=True)
except Exception as e:
print(e)
