spellingcorrection updates, evaluation updates

UUDigitalHumanitieslab · Oct 9, 2024 · 8b954c7 · 8b954c7
1 parent 08743c8
commit 8b954c7
Show file tree

Hide file tree

Showing 35 changed files with 1,177 additions and 476 deletions.
diff --git a/src/sastadev/CHAT_Annotation.py b/src/sastadev/CHAT_Annotation.py
@@ -9,9 +9,11 @@
 CHAT = 'CHAT'
 
 CHAT_explanation = 'Explanation'
-CHAT_wordnoncompletion = 'Noncompletion of a Word'
+CHAT_reformulation = 'Reformulation'
+CHAT_repetition = 'Repetition'
 CHAT_replacement = 'Replacement'
-
+CHAT_retracing = 'Retracing'
+CHAT_wordnoncompletion = 'Noncompletion of a Word'
 
 monadic = 1
 dyadic = 2
@@ -78,7 +80,7 @@ def refunction(x):
 specialformpat = wordpat + r'(?:@z:\w\w\w|@\w\w?\w?)'
 fullspecialformpat = fullre(specialformpat)
 specialformre = re.compile(fullspecialformpat)
-repkeepannotations = ['Repetition', 'Retracing', 'Reformulation']
+repkeepannotations = [CHAT_repetition, CHAT_retracing, CHAT_reformulation]
 
 
 def getreplacement(repkeep, annotation):
@@ -817,7 +819,7 @@ def result(x, y):
                     CHAT_ComplexRegex(
                         (r'\[=', anybutrb, r'\]'), (keep, eps), False),
                     complexmetafunction),
-    CHAT_Annotation('Replacement', '8.3:69', '10.3:73',
+    CHAT_Annotation(CHAT_replacement, '8.3:69', '10.3:73',
                     CHAT_ComplexRegex(
                         (r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True),
                     complexmetafunction_replbpl),
@@ -832,13 +834,13 @@ def result(x, y):
                     CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
     CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic),
                     simplescopedmetafunction),
-    CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_repetition, '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
                     simplescopedmetafunction),
     CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76',
                     CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'), (keep, eps), True), complexmetafunction),
-    CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_retracing, '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
                     simplescopedmetafunction),
-    CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_reformulation, '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
                     simplescopedmetafunction),
     CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77',
                     CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),

diff --git a/src/sastadev/__main__.py b/src/sastadev/__main__.py
@@ -155,11 +155,12 @@
                                  ResultsKey, mkresultskey, scores2counts,
                                  showreskey)
 from sastadev.conf import settings
-from sastadev.constants import (bronzefolder, bronzesuffix, checksuffix, checkeditedsuffix,
+from sastadev.constants import (analysissuffix, bronzefolder, bronzesuffix, byuttscoressuffix, checksuffix, checkeditedsuffix,
                                 formsfolder, intreebanksfolder,
                                 loggingfolder, outtreebanksfolder, permprefix, platinumsuffix,
                                 platinumeditedsuffix,
                                 resultsfolder, silverfolder, silverpermfolder, silversuffix)
+from sastadev.context import getcontextdict
 from sastadev.correctionparameters import CorrectionParameters
 from sastadev.correcttreebank import (correcttreebank, corr0, corrn, errorwbheader, validcorroptions)
 from sastadev.counterfunctions import counter2liststr
@@ -178,6 +179,8 @@
                             post_process, query_exists, query_inform)
 from sastadev.readcsv import writecsv
 from sastadev.readmethod import itemseppattern, read_method
+from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader
+from sastadev.sas_impact import getcomparisoncounts, mksas_impactrows, sas_impact
 from sastadev.sastatypes import (AltCodeDict, ExactResultsDict, FileName,
                                  GoldTuple, MatchesDict, MethodName, QId,
                                  QIdCount, QueryDict, ResultsCounter,
@@ -197,7 +200,7 @@
 from sastadev.treebankfunctions import (find1, getattval, getnodeendmap, getuttid,
                                         getxmetatreepositions, getxsid,
                                         getyield, showtree)
-from sastadev.xlsx import mkworkbook
+from sastadev.xlsx import mkworkbook, add_worksheet
 
 
 start_time = time.time()
@@ -1204,8 +1207,10 @@ def main():
         else:
             mergedsamplecorrections = {}
 
+        contextdict = getcontextdict(treebank2, lambda x: True)
 
-        correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections, thissamplecorrections)
+        correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections,
+                                                    thissamplecorrections, treebank2, contextdict)
 
         treebank, errordict, allorandalts = correcttreebank(treebank2, targets,  correctionparameters, corr=corr)
 
@@ -1274,12 +1279,27 @@ def main():
     silverscores = exact2results(exactsilverscores)  # ongoing
     silvercounts = scores2counts(silverscores)
 
+    # scores by utterance
+    # bronzescoresbyutt = getscoresbyutt(allresults.coreresults, goldscores)
+    # silverscoresbyutt = getscoresbyutt(allresults.coreresults, silverscores)
+
+    byuttrows = mkscoresbyuttrows(allresults, goldscores, silverscores, themethod)
+    not100count = len([row for row in byuttrows if row[9] != 100])
+    scoresbyuttoutfullname = os.path.join(resultspath, corefilename + byuttscoressuffix + '.xlsx')
+    wb = mkworkbook(scoresbyuttoutfullname, [byuttheader], byuttrows, freeze_panes=(1,0) )
+    allbyuttscores = sas_impact(allresults, silverscores, themethod)
+    sasheader, sasimpactrows = mksas_impactrows(allbyuttscores, not100count)
+    add_worksheet(wb,[sasheader], sasimpactrows, sheetname='SAS_impact', freeze_panes=(1,0))
+    wb.close()
+
+
+
     # netx is now obsolete
     # platinumresults: Dict[ResultsKey, Counter] = reduceresults(platinumresults, samplesizetuple, options.methodname)
 
     (base, ext) = os.path.splitext(options.infilename)
     outputfullname = os.path.join(
-        resultspath, corefilename + "_analysis" + tsvext + txtext)
+        resultspath, corefilename + analysissuffix + tsvext + txtext)
     outfile = open(outputfullname, 'w', encoding='utf8')
 
     outxlsx = os.path.join(resultspath, corefilename + "_analysis" + xlsxext)

diff --git a/src/sastadev/alpinocompound.py b/src/sastadev/alpinocompound.py
@@ -0,0 +1,39 @@
+from sastadev.treebankfunctions import find1, getattval
+from sastadev.conf import settings
+from sastadev.lexicon import known_word
+
+
+# TODO: move these definitions to the lexicon module
+comma = ','
+compoundsep = '_'
+
+alpinoparse = settings.PARSE_FUNC
+
+
+def isalpinocompound(wrd: str) -> bool:
+    """Return True if Alpino parses *wrd* as a compound noun whose parts all pass known_word.
+
+    Parse failures, noun mismatches and unknown parts are logged as errors and yield False.
+    """
+    fullstr = f'geen {wrd}'   # geen makes it a noun and can combine with uter and neuter, count and mass, sg and plural
+    tree = alpinoparse(fullstr)
+    if tree is None:
+        settings.LOGGER.error(f'Parsing {fullstr} failed')
+        return False
+    nounnode = find1(tree, './/node[@pt="n"]')
+    if nounnode is None:
+        settings.LOGGER.error(f'No noun found in {fullstr} parse')
+        return False
+    nounwrd = getattval(nounnode, 'word')
+    if nounwrd != wrd:
+        settings.LOGGER.error(f'Wrong noun ({nounwrd}) found in {fullstr} parse')
+        return False
+    nounlemma = getattval(nounnode, 'lemma')
+    if compoundsep not in nounlemma:
+        return False
+    unknownparts = [part for part in nounlemma.split(compoundsep) if not known_word(part)]
+    if unknownparts:
+        settings.LOGGER.error(f'Unknown words ({comma.join(unknownparts)}) found in {fullstr} parse')
+        return False
+    return True
+
diff --git a/src/sastadev/basicreplacements.py b/src/sastadev/basicreplacements.py
@@ -138,7 +138,7 @@ def combine(strlist: List[str]) -> str:
 #: .. autodata:: sastadev.basicreplacements::innureplacements
 #:      :no-value:
 #:
-basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, dp),
+basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, -2 * dp),
                                                 ('isse', 'is', pron, infpron, addschwa, mp(10)),
                                                 ('ooke', 'ook', pron, infpron, addschwa, dp),
                                                 ('t', "'t", orth, spellerr, apomiss, dp),
@@ -234,7 +234,7 @@ def combine(strlist: List[str]) -> str:
                                                 ('əs', 'eens', pron, infpron, reduction, dp),
                                                 ('moetə', 'moeten', pron, infpron, infpron, dp),
                                                 ('moetə' , 'moet', pron, infpron, infpron, dp),
-                                                ('pot', 'kapot', pron, infpron, sylldrop, dp),
+                                                ('pot', 'kapot', pron, infpron, sylldrop, -2 * dp),
                                                 ('almaal', 'allemaal', pron, infpron, sylldrop, dp),
                                                 ('knorrens', 'varkens', lexical, substitution, onom,dp),
                                                 ('potte', 'kapot', pron, infpron, combine([sylldrop, emphasis]), dp),
@@ -325,8 +325,8 @@ def combine(strlist: List[str]) -> str:
      ('as-t-ie', ['als', 'ie'], pron, infpron, t_ie, dp),
      ("dit's", ["dit", "is"], pron, infpron, contract, dp),
      ("dat's", ["dat", "is"], pron, infpron, contract, dp),
-     ("datte", ['dat', 'ie'], pron, infpron, contract, mp(120)),
-     ("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(120)),
+     ("datte", ['dat', 'ie'], pron, infpron, contract, mp(220)),
+     ("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(220)),
      ("isda", ['is', 'dat'], pron, infpron, contract, dp + 2),
      ("tisda", ['het',  'is', 'dat'], pron, infpron, contract, mp(120)),
      ("'savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
@@ -446,7 +446,8 @@ def getdisambiguationdict() -> Dict[str, Tuple[TokenTreePredicate, str]]:
             disambiguationdict[w] = cond, repl
     return disambiguationdict
 
-parsereplacementslist =  [('smarties', 'toffees', alpino_unknown_word, -2*dp),
-                          ('Smarties', 'toffees', alpino_unknown_word, -2*dp)]
+# the list below has been replaced by the wordsunknowntoalpino lexicon
+#parsereplacementslist =  [('smarties', 'toffees', alpino_unknown_word, -2*dp),
+#                          ('Smarties', 'toffees', alpino_unknown_word, -2*dp)]
 
-parsereplacements = {el[0]:el for el in parsereplacementslist}
+# parsereplacements = {el[0]:el for el in parsereplacementslist}
diff --git a/src/sastadev/childesspellingcorrector.py b/src/sastadev/childesspellingcorrector.py
@@ -138,6 +138,7 @@ def adult_correctspelling(word: str, correctionsdict,max = None, threshold=okthr
 
 def tryme():
     words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal' , ' beete' , 'kamm', 'daaistoel', 'oelen', 'tein']
+    words = ['poppe']
     for word in words:
         result = children_correctspelling(word, children_correctionsdict, max=5)
         print(f'{word}: {result}' )

diff --git a/src/sastadev/constants.py b/src/sastadev/constants.py
@@ -29,6 +29,7 @@
 analysissuffix = '_analysis'
 analysistsvsuffix = '_analysis.tsv'
 bronzesuffix = '_bronze'
+byuttscoressuffix = '_scoresbyutt'
 silversuffix = '_silver'
 correctionrefsuffix = '_correctionref'
 permprefix = 'perm_'