Commit b6015fb
compounds updates, sas updates
JanOdijk committed Jan 1, 2025
1 parent 69cb882 commit b6015fb
Showing 21 changed files with 371 additions and 80 deletions.
4 changes: 2 additions & 2 deletions src/sastadev/__main__.py
@@ -179,7 +179,7 @@
post_process, query_exists, query_inform)
from sastadev.readcsv import writecsv
from sastadev.readmethod import itemseppattern, read_method
from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader
from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader, silverf1col
from sastadev.sas_impact import getcomparisoncounts, mksas_impactrows, sas_impact
from sastadev.sastatypes import (AltCodeDict, ExactResultsDict, FileName,
GoldTuple, MatchesDict, MethodName, QId,
@@ -1284,7 +1284,7 @@ def main():
# silverscoresbyutt = getscoresbyutt(allresults.coreresults, silverscores)

byuttrows = mkscoresbyuttrows(allresults, goldscores, silverscores, themethod)
not100count = len([row for row in byuttrows if row[9] != 100])
not100count = len([row for row in byuttrows if row[silverf1col] != 100])
scoresbyuttoutfullname = os.path.join(resultspath, corefilename + byuttscoressuffix + '.xlsx')
wb = mkworkbook(scoresbyuttoutfullname, [byuttheader], byuttrows, freeze_panes=(1,0) )
allbyuttscores = sas_impact(allresults, silverscores, themethod)
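The change above replaces the hard-coded column index 9 with silverf1col, imported from resultsbyutterance, so the F1 check keeps working if the by-utterance row layout changes. A minimal sketch of the pattern; the header names below are assumptions, not the real byuttheader contents:

byuttheader = ['sample', 'uttid', 'goldscore', 'silver F1']
silverf1col = byuttheader.index('silver F1')  # named index instead of a magic 9

byuttrows = [['TD01', 1, 95, 100], ['TD01', 2, 88, 75]]
not100count = len([row for row in byuttrows if row[silverf1col] != 100])
print(not100count)  # 1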
2 changes: 2 additions & 0 deletions src/sastadev/basicreplacements.py
@@ -52,6 +52,7 @@
vzdevoicing = 'v/z devoicing'
missingapostrophe = apomiss
finalschwadrop = 'Final schwa dropped'
schwadrop = 'schwa dropped'
u4schwa = 'u used for the schwa sound'
inithdrop = 'Initial h drop'
finaldevoicing = 'Final devoicing'
@@ -254,6 +255,7 @@ def combine(strlist: List[str]) -> str:
('lus', 'lust', pron, infpron, codared, dp),
('mij', 'mijn', pron, infpron, codared, dp),
('drinken', 'voedsel', avoidambiguity, wwnambiguity, wwnambiguity, dp ),
('heelboel', 'heleboel', pron, infpron, schwadrop, dp),
('jou', 'jouw', pron, infpron, codared, -dp), # Td 22, 30 ik wil ook keer naar jou huis find criterion
# ('kijke', 'kijk', pron, infpron, emphasis, dp), # TD05, 32 moved to disambuguationdict
# ('geel', 'mooi', avoidambiguity, adjnambiguity, dp), #TD05, 24
9 changes: 6 additions & 3 deletions src/sastadev/cleanCHILDEStokens.py
@@ -167,9 +167,12 @@ def cleantext(utt: str, repkeep: bool, tokenoutput: bool = False, verbose=False)
resultwordlist = [t.word for t in newtokens]
resultstring = smartjoin(resultwordlist)
resultposlist = [t.pos for t in newtokens]
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none,
penalty=0)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation',
backplacement=bpl_none, penalty=0)
newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list',
source='CHAT/Tokenisation', backplacement=bpl_none, penalty=0)
#newmeta4 = Meta('cleantext', 'done')
metadata += [newmeta1, newmeta2, newmeta3]
resultmetadata = metadata
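The three tokenisation Meta entries above now pass an explicit penalty=0. Assuming candidate corrections are ranked by summing the penalties of their metadata, this keeps pure bookkeeping entries from counting against a candidate; a small illustration of that assumption:

meta_penalties = [0, 0, 0, 15]  # three tokenisation metas plus one real correction
total = sum(meta_penalties)     # the tokenisation bookkeeping contributes nothing
print(total)                    # 15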
35 changes: 32 additions & 3 deletions src/sastadev/compounds.py
@@ -1,7 +1,7 @@
'''
The module *compounds*:
* initialisez the compound dictionary *compounds*, which is a multidimensional Python dictionary
* initialises the compound dictionary *compounds*, which is a multidimensional Python dictionary
Dict[str, Dict[int, str]], which maps a string (for a lemma in CELEX orthography HeadDiaNew)
and a column number to the value of the cell with this column number in the CSV file from which it is derived:
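A small usage sketch of the structure this docstring describes; the lemma and cell values below are invented, the real ones come from the CELEX-derived CSV file:

from typing import Dict

# Dict[str, Dict[int, str]]: lemma (CELEX HeadDiaNew orthography) -> column number -> cell value
compounds: Dict[str, Dict[int, str]] = {
    'appelboom': {0: 'appelboom', 1: 'N+N', 2: 'appel+boom'},  # invented cell values
}

if 'appelboom' in compounds:
    print(compounds['appelboom'][2])  # the cell in column 2 for this lemma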
@@ -31,8 +31,13 @@
from collections import defaultdict
from typing import Dict, List

from sastadev.CHAT_Annotation import CHAT_wordnoncompletion, CHAT_replacement
from sastadev.conf import settings
from sastadev.correctionlabels import contextcorrection
from sastadev.sasta_explanation import explanationasreplacementname
from sastadev.sastatypes import SynTree
from sastadev.smartcompoundcomparison import issmartcompound
from sastadev.stringfunctions import string2list
from sastadev.treebankfunctions import getattval

underscore = "_"
@@ -51,6 +56,11 @@
dictfile = open(dictfilename, 'r', encoding='utf8')

getwordsxpath = ".//node[@pt]"
correctionsmetaxpath = f""".//xmeta[@name = "{explanationasreplacementname}" or
@name = "{CHAT_replacement}" or
@name = "{CHAT_wordnoncompletion}" or
@name = "{contextcorrection}"
]"""


def getcompounds(syntree: SynTree) -> List[SynTree]:
@@ -65,14 +75,33 @@ def getcompounds(syntree: SynTree) -> List[SynTree]:
'''
results = []
tlist = syntree.xpath(getwordsxpath)
corrections = syntree.xpath(correctionsmetaxpath)
for t in tlist:
w = getattval(t, 'word')
lemma = getattval(t, 'lemma')
pt = getattval(t, 'pt')
if pt == 'n' and iscompound(lemma):
results.append(t)
if pt == 'n':
if lemma in compounds:
results.append(t)
else:
correction = getcorrection(t, corrections)
if issmartcompound(w, correction, lemma):
results.append(t)
return results


def getcorrection(t: SynTree, corrections) -> str:
w = getattval(t, 'word')
position = getattval(t, 'begin')
for correction in corrections:
annotationposlist = string2list(correction.attrib["annotationposlist"])
annotationwordlist = string2list(correction.attrib["annotationwordlist"], quoteignore=True)
if annotationposlist == [position]:
result = annotationwordlist[0]
return result
return w


# I do not know how to type this, because the nesting can be arbitrarily deep


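The matching in getcorrection above compares the token's begin position with each correction's annotationposlist and applies a correction only when it covers exactly that one token. A self-contained restatement of that logic, with plain lists standing in for the parsed xmeta attributes (string2list does the parsing in sastadev itself):

from typing import List, Tuple

def getcorrection_sketch(word: str, position: str,
                         corrections: List[Tuple[List[str], List[str]]]) -> str:
    # corrections: (annotationposlist, annotationwordlist) pairs, already parsed
    for poslist, wordlist in corrections:
        if poslist == [position]:   # the correction covers exactly this token
            return wordlist[0]      # return the corrected form
    return word                     # no applicable correction: keep the original word

print(getcorrection_sketch('heelboel', '5', [(['5'], ['heleboel'])]))  # heleboel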
12 changes: 5 additions & 7 deletions src/sastadev/context.py
@@ -4,6 +4,7 @@
import os
from sastadev.conf import settings
from sastadev.constants import intreebanksfolder, outtreebanksfolder
from sastadev.datasets import infiguresdatasets
from sastadev.filefunctions import getbasename
from sastadev.lexicon import known_word
from sastadev.sastatypes import TreeBank, SynTree
@@ -147,14 +148,10 @@ def nottargetchild(stree: SynTree) -> bool:


def main():
# read auristrain DLD03 in as test treebank
# filename = 'DLD03.xml'
# filename = 'DLD11.xml'
dataset = 'auristrain'
table = []
datasets = ['auristrain', 'vkltarsp', 'vklstap', 'vklasta', 'vklstapfase2', 'vklastafase2', 'auristest']
datasets = infiguresdatasets
for dataset in datasets:
fullpath = os.path.join(settings.DATAROOT, dataset, outtreebanksfolder)
fullpath = os.path.join(settings.DATAROOT, dataset.name, outtreebanksfolder)
filenames = os.listdir(fullpath)
# filenames= ['TD21.xml']
for filename in filenames:
@@ -183,7 +180,8 @@
# print(f'Preceding context: {comma.join(prevbestwords)}')
postbestwords = findbestwords(wrongword, postcontext, lambda x: True)
# print(f'Post context: {comma.join(postbestwords)}')
row = [dataset, sample, wrongword, comma.join(prevbestwords), comma.join(postbestwords), origutt]
row = [dataset.name, sample, wrongword, comma.join(prevbestwords),
comma.join(postbestwords), origutt]
table.append(row)

header = ['dataset', 'sample', 'wrongword', 'prev', 'post', 'origutt']
4 changes: 4 additions & 0 deletions src/sastadev/correctionlabels.py
@@ -0,0 +1,4 @@


contextcorrection = 'Context Correction'
repetition = 'Repetition'
18 changes: 9 additions & 9 deletions src/sastadev/corrector.py
@@ -16,6 +16,7 @@
from sastadev.correctionparameters import CorrectionParameters
from sastadev.cleanCHILDEStokens import cleantokens
from sastadev.conf import settings
from sastadev.correctionlabels import contextcorrection, repetition
from sastadev.dedup import (cleanwordofnort, find_duplicates2,
find_janeenouduplicates, find_simpleduplicates,
find_substringduplicates2, getfilledpauses,
@@ -30,7 +31,7 @@
from sastadev.iedims import getjeforms
from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het,
informlexicon, isa_namepart, isa_inf, isa_vd, known_word, nochildword,
tswnouns, validword, vuwordslexicon, wordsunknowntoalpinolexicondict)
tswnouns, validnotalpinocompoundword, validword, vuwordslexicon, wordsunknowntoalpinolexicondict)
from sastadev.macros import expandmacros
from sastadev.metadata import (Meta, bpl_word_delprec, bpl_indeze, bpl_node, bpl_none, bpl_word,
bpl_wordlemma, defaultbackplacement,
@@ -68,7 +69,6 @@
asta = 'asta'

hyphen = '-'
repetition = 'Repetition'

replacepattern = '{} [: {} ]'
metatemplate = '##META {} {} = {}'
@@ -92,7 +92,7 @@
#: The constant *wrongdet_excluded_words* contains words that lead to incorrect
#: replacement of uter determiners (e.g. *die zijn* would be replaced by *dat zijn*) and
#: therefore have to be excluded from determiner replacement.
wrongdet_excluded_words = ['zijn', 'dicht', 'met', 'ik', 'mee', 'wat', 'alles', 'niet']
wrongdet_excluded_words = ['zijn', 'dicht', 'met', 'ik', 'mee', 'wat', 'alles', 'niet', 'spelen']

#: The constant *e2een_excluded_nouns* contains words that lead to incorrect
#: replacement of e or schwa and
@@ -1115,8 +1115,8 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
name='Character Case', value='Lower case', cat='Orthography')

# dehyphenate
if not validword(token.word, methodname) and hyphen in token.word:
newwords = fullworddehyphenate(token.word, lambda x: validword(x, methodname))
if not validnotalpinocompoundword(token.word, methodname) and hyphen in token.word:
newwords = fullworddehyphenate(token.word, lambda x: validnotalpinocompoundword(x, methodname))
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Dehyphenation', value='Dehyphenation', cat='Pronunciation',
backplacement=bpl_word)
@@ -1132,7 +1132,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
# aha oho uhu ehe
ahapattern = r'([aeouy])h\1'
ahare = re.compile(ahapattern)
if not validword(token.word, methodname) and ahare.search(token.word):
if not validnotalpinocompoundword(token.word, methodname) and ahare.search(token.word):
newwords = [ahare.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
@@ -1323,7 +1323,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
penalty = basepenalties[CONTEXT]
newwords = [newcandidate]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Context Correction', value='Unknown word', cat='lexicon',
name=contextcorrection, value='Unknown word', cat='lexicon',
source=f'{SASTA}/{CONTEXT}', backplacement=bpl_word, penalty=penalty)

# find document specific replacements
@@ -1440,7 +1440,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
'moppie', 'punkie', 'saffie', 'stekkie', 'wijfie']


if (not validword(token.word, methodname) or token.word in knowniedimwords) and \
if (not validnotalpinocompoundword(token.word, methodname) or token.word in knowniedimwords) and \
(token.word.endswith('ie') or token.word.endswith('ies')):
newwords = getjeforms(token.word)
for newword in newwords:
@@ -1754,7 +1754,7 @@ def getwrongdetalternatives(tokensmd: TokenListMD, tree: SynTree, uttid: UttId)
backplacement=bpl_node)
metadata.append(meta)
correctiondone = True
elif token.word in dets[het] and dehet == de and infl in ['e']:
elif token.word in dets[het] and ((dehet == de and infl in ['e']) or infl in ['m', 'dm']):
# newcurtoken = replacement(token, swapdehet(token))
newcurtokenword = swapdehet(token.word)
newcurtoken = Token(newcurtokenword, token.pos)
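Several gates in this file switch from validword to validnotalpinocompoundword. Its definition is not part of this diff; a plausible reading, stated purely as an assumption, is that a word Alpino only recognises as a self-composed compound should not block corrections such as dehyphenation. A hedged sketch with stub helpers (isalpinocompound is hypothetical; the real predicate lives in sastadev.lexicon):

KNOWN = {'heleboel', 'appel', 'boom'}

def validword(word: str, methodname: str) -> bool:
    return word in KNOWN                # stub lexicon lookup, for illustration only

def isalpinocompound(word: str) -> bool:
    return word == 'appelboom'          # hypothetical compound check

def validnotalpinocompoundword(word: str, methodname: str) -> bool:
    # assumed semantics: known word, but an Alpino-only compound analysis does not count
    return validword(word, methodname) and not isalpinocompound(word)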
14 changes: 10 additions & 4 deletions src/sastadev/correcttreebank.py
@@ -235,17 +235,23 @@ def smartreplace(node: SynTree, word: str, mn: MethodName) -> SynTree:
newnode = find1(wordtree, './/node[@pt]')
newnodept = getattval(newnode, 'pt')
nodept = getattval(node, 'pt')
nodelemma = getattval(node, 'lemma')
newnodelemma = getattval(newnode, 'lemma')
if isvalidword(word, mn) and \
issamewordclass(node, newnode) and \
not isrobustnoun(newnode) and \
newnodelemma not in nochildwords:
result = newnode
result.attrib['begin'] = getattval(node, 'begin')
result.attrib['end'] = getattval(node, 'end')
result.attrib['rel'] = getattval(node, 'rel')
if nodept == 'ww' and '_' in nodelemma and newnodelemma in nodelemma and '_' not in newnodelemma:
# e.g. nodelemma == 'op_hebben', newnodelemma='hebben'
cpseppos = nodelemma.find('_')
prt = nodelemma[:cpseppos]
result.set('lemma', f'{prt}_{newnodelemma}')
result.set('begin', getattval(node, 'begin'))
result.set('end', getattval(node, 'end'))
result.set('rel', getattval(node, 'rel'))
if 'index' in node.attrib:
result.attrib['index'] = getattval(node, 'index')
result.set('index', getattval(node, 'index'))
if infpvpair(newnode, node):
adaptpv(result)
else:
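The new branch in smartreplace repairs the lemma of a separable-particle verb when the replacement parse dropped the particle; the diff's own example is nodelemma 'op_hebben' with newnodelemma 'hebben'. The string manipulation in isolation:

nodelemma = 'op_hebben'     # lemma on the node being replaced (particle verb)
newnodelemma = 'hebben'     # lemma Alpino assigned to the replacement word

if '_' in nodelemma and newnodelemma in nodelemma and '_' not in newnodelemma:
    cpseppos = nodelemma.find('_')
    prt = nodelemma[:cpseppos]          # 'op'
    print(f'{prt}_{newnodelemma}')      # op_hebben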
5 changes: 4 additions & 1 deletion src/sastadev/data/macros/sastamacros1.txt
@@ -278,7 +278,7 @@ hwwwithsvp = """(contains(@lemma,"_kunnen") or
contains(@lemma,"_zijn")
) """

hwwwithsvpexception = """(@lemma="terug_komen" or @lemma="vast_zitten" or @lemma="weg_gaan" or @lemma="aan_hebben")"""
hwwwithsvpexception = """(@lemma="terug_komen" or @lemma="vast_zitten" or @lemma="weg_gaan" or @lemma="aan_hebben" or @lemma="op_hebben")"""

complement = """((@rel="obj1" or @rel="obj2") or
(@rel="predc" and not(%predcB%)) or
@@ -831,6 +831,9 @@ intrmodalverb = """( @pt="ww" and


nodimlemma = """(@lemma="meisje" or @lemma="koek" or @lemma="beet")"""
dimword = """(ends-with(@word, "je") or ends-with(@word, "jes") or ends-with(@word, "ie") or ends-with(@word, "ies") or ends-with(@word, "ke") or ends-with(@word, "kes") ) """

mvword = """(ends-with(@word, "en") or ends-with(@word, "e") or ends-with(@word, "s"))"""

hequery = """((@lemma="hè" or @lemma="he") and @end != 1 and
( (@end = ancestor::node[@cat="top"]/@end) or
Binary file modified src/sastadev/data/methods/TARSP_Index_Current.xlsx
Binary file not shown.
7 changes: 6 additions & 1 deletion src/sastadev/data/nochildwords/nochildwords.txt
@@ -3,4 +3,9 @@ pee
's
pele
pelen
toeten
toeten
heelboel
pantoet
wintik
afpoelen
int
64 changes: 64 additions & 0 deletions src/sastadev/datasets.py
@@ -0,0 +1,64 @@
from dataclasses import dataclass
import os
from sastadev.conf import settings
from sastadev.sastatypes import MethodName
from sastadev.xlsx import getxlsxdata
from typing import List

space = ' '
MethodVariant = str


datasetfilename = 'DatasetOverview.xlsx'
datasetfolder = settings.DATAROOT
datasetfullname = os.path.join(datasetfolder, datasetfilename)


def robustint(x) -> int:
if x == '' or x == space:
result = 0
else:
result = int(x)
return result


@dataclass
class DataSet:
name: str
methodname: MethodName
use: str
infigures: bool
variant: MethodVariant
samples: int
bronzecount: int
source_org: str
sourcepersons: str
description: str


def row2dataset(row: List[str]) -> DataSet:
rawname = row[0]
lcname = rawname.strip()
rawmethodname = row[1]
methodname = rawmethodname.strip().lower()
infigures = "yes" in row[3].lower()
rawvariant = row[4]
variant = rawvariant.strip().lower()

result = DataSet(name=lcname, methodname=methodname, use=row[2], infigures=infigures, variant=variant,
samples=robustint(row[5]), bronzecount=robustint(row[6]), source_org=row[7], sourcepersons=row[8],
description=row[9])
return result


def getalldatasets():
datasets = []
header, data = getxlsxdata(datasetfullname)
for row in data:
newdataset = row2dataset(row)
datasets.append(newdataset)
return datasets

alldatasets = getalldatasets()
infiguresdatasets = [d for d in alldatasets if d.infigures]
dsname2method = {d.name: d.methodname for d in alldatasets}
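A short usage sketch for the new module; it assumes DatasetOverview.xlsx is present under settings.DATAROOT, and 'auristrain' is only an example name taken from context.py:

from sastadev.datasets import alldatasets, infiguresdatasets, dsname2method

# only the datasets flagged for inclusion in figures
for ds in infiguresdatasets:
    print(ds.name, ds.methodname, ds.variant, ds.samples)

# map a dataset name to its method, e.g. when selecting a query set
print(dsname2method.get('auristrain'))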
4 changes: 2 additions & 2 deletions src/sastadev/external_functions.py
@@ -28,7 +28,7 @@
from sastadev.dedup import correct, mlux, onvolledig, samplesize
from sastadev.imperatives import wond4, wond5plus, wondx, wx, wxy, wxyz, wxyz5
from sastadev.methods import allok, astalemmafilter
from sastadev.queryfunctions import hequery, VzN, vobij, voslashbij, vudivers, xneg_neg, xneg_x
from sastadev.queryfunctions import hequery, tarsp_mvzn, tarsp_verkl, VzN, vobij, voslashbij, vudivers, xneg_neg, xneg_x
from sastadev.stapforms import makestapform
from sastadev.STAPpostfunctions import GL5LVU, GLVU, BB_totaal
from sastadev.Sziplus import sziplus6, vr5plus
@@ -68,7 +68,7 @@ def oldgetfname(f: Callable) -> str:
# Initialisation
thetarspfunctions = [getcompounds, hequery, sziplus6, xenx, vr5plus, wx, wxy, wxyz, wxyz5, wondx, wond4, wond5plus,
tarsp_screening, vutotaal, gofase, gtotaal, pf2, pf3, pf4, pf5, pf6, pf7, pf, xneg_x, xneg_neg,
mktarspform, VzN, vobij, voslashbij, vudivers]
mktarspform, tarsp_mvzn, tarsp_verkl, VzN, vobij, voslashbij, vudivers]

thestapfunctions = [BB_totaal, GLVU, GL5LVU, makestapform]
