adult v. children spelling correction

UUDigitalHumanitieslab · Sep 20, 2024 · 50a615a · 50a615a
1 parent f25faa1
commit 50a615a
Show file tree

Hide file tree

Showing 9 changed files with 80 additions and 19 deletions.
diff --git a/src/sastadev/__main__.py b/src/sastadev/__main__.py
@@ -166,7 +166,8 @@
 from sastadev.external_functions import str2functionmap
 from sastadev.goldcountreader import get_goldcounts
 from sastadev.history import (donefiles, donefilesfullname, gathercorrections, mergecorrections, putcorrections,
-                              putdonefilenames, samplecorrections, samplecorrectionsfullname)
+                              putdonefilenames, children_samplecorrections, children_samplecorrectionsfullname,
+                              adult_samplecorrections, adult_samplecorrectionsfullname)
 from sastadev.macros import expandmacros
 from sastadev.methods import Method, supported_methods, treatmethod
 from sastadev.mismatches import exactmismatches, literalmissedmatches
@@ -1175,6 +1176,15 @@ def main():
 
         thissamplecorrections = {}
         if options.dohistory:
+            # Select the history-corrections store (file name and loaded dictionary) for the
+            # chosen method: tarsp/stap use the children store, asta uses the adult store.
+            if options.methodname.lower() in {'tarsp', 'stap'}:
+                samplecorrectionsfullname = children_samplecorrectionsfullname
+                samplecorrections = children_samplecorrections
+            elif options.methodname.lower() in {'asta'}:
+                samplecorrectionsfullname = adult_samplecorrectionsfullname
+                samplecorrections = adult_samplecorrections
+            else: # should not occur
+                # Report the same attribute that was tested above (methodname), so that this
+                # branch logs the offending value instead of failing on a different attribute.
+                settings.LOGGER.error(f'Illegal method name used: {options.methodname}')
+                exit(-1)
             if corr != corr0:
                 reducedtreebankfullname = os.path.relpath(options.infilename, start=settings.DATAROOT)
                 if reducedtreebankfullname not in donefiles:

diff --git a/src/sastadev/astaforms.py b/src/sastadev/astaforms.py
@@ -438,6 +438,6 @@ def getlemmafreqs(allresults, lexicalreskey) -> Dict[str, int]:
         if qid == lemmaqid:
             lemma = reskey[1]
             for position in allresults.exactresults[reskey]:
-                if position in allresults.exactresults[lexicalreskey]:
+                if lexicalreskey in allresults.exactresults and position in allresults.exactresults[lexicalreskey]:
                     dict[lemma] += 1
     return dict
diff --git a/src/sastadev/childesspellingcorrector.py b/src/sastadev/childesspellingcorrector.py
@@ -46,11 +46,8 @@ def getchildesfrq() -> Tuple[FrqDict, FrqDict, FrqDict]:
 
 # function to read the stored corrections into a dictionary
 
-def getstoredcorrections() -> Dict[str, List[Tuple[str, int]]]:
+def getstoredcorrections(correctionsfullname) -> Dict[str, List[Tuple[str, int]]]:
     correctionsdict = {}
-    correctionsfilename = 'storedcorrections.txt'
-    correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections')
-    correctionsfullname = os.path.join(correctionspath, correctionsfilename)
 
     idata = readcsv(correctionsfullname)
     for i, row in idata:
@@ -70,7 +67,7 @@ def getpenalty(score, total):
 
 #  a function for spelling correction
 
-def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[str, int]]:
+def children_correctspelling(word: str, correctionsdict, max = None, threshold=okthreshold) -> List[Tuple[str, int]]:
     if word in correctionsdict:
         return correctionsdict[word]
     else:
@@ -115,12 +112,40 @@ def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[
 
     return result
 
+
+def adult_correctspelling(word: str, correctionsdict, max=None, threshold=okthreshold) -> List[Tuple[str, int]]:
+    """Return spelling corrections for *word* as (correction, penalty) pairs.
+
+    If *word* is a key of *correctionsdict*, the stored entry is returned as is
+    (*max* is not applied to it). Otherwise the candidates proposed by
+    ``spell.candidates`` are ordered by descending ``spell.word_usage_frequency``
+    and each frequency is converted into a penalty by ``getpenalty``, relative to
+    the sum of all candidate frequencies.
+
+    :param word: the word to correct
+    :param correctionsdict: previously stored corrections, keyed by word
+    :param max: if not None, keep at most this many corrections
+    :param threshold: accepted for signature compatibility; not used in this function
+    :return: list of (correction, penalty) pairs, most frequent candidate first
+    """
+    # stored corrections take precedence over the spell checker
+    if word in correctionsdict:
+        return correctionsdict[word]
+
+    candidates = spell.candidates(word)
+    # the spell checker may yield None instead of a collection of candidates
+    scored = [] if candidates is None else [(cand, spell.word_usage_frequency(cand)) for cand in candidates]
+    scored.sort(key=lambda pair: pair[1], reverse=True)
+
+    totalfrq = sum(frq for _, frq in scored)
+    penalized = [(cand, getpenalty(frq, totalfrq)) for (cand, frq) in scored]
+
+    # NOTE: the result is not stored back into correctionsdict, nor written to file, here
+
+    return penalized if max is None else penalized[:max]
+
+
 def tryme():
+    # Print the corrections proposed for a few sample words; this function is called
+    # when the module is run as a script.
+    # First the children's corrector with the children's stored corrections.
     words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal' , ' beete' , 'kamm', 'daaistoel', 'oelen', 'tein']
     for word in words:
-        result = correctspelling(word, max=5)
+        result = children_correctspelling(word, children_correctionsdict, max=5)
         print(f'{word}: {result}' )
 
+    # Then the adult corrector with the adult stored corrections.
+    words = ['motariek', 'silase']
+    for word in words:
+        result = adult_correctspelling(word, adult_correctionsdict, max=5)
+        print(f'{word}: {result}' )
 
 
 
@@ -129,8 +154,19 @@ def tryme():
 # read the childes frequency dict in, for targets and others and combine them also
 trgfrqdict, othfrqdict, allfrqdict = getchildesfrq()
 
-# read the stored corrections into a dictionary
-correctionsdict = getstoredcorrections()
+# directory that holds the stored corrections files for both children and adults
+correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections')
+
+# read the stored corrections for children into a dictionary
+children_correctionsfilename = 'children_storedcorrections.txt'
+children_correctionsfullname = os.path.join(correctionspath, children_correctionsfilename)
+children_correctionsdict = getstoredcorrections(children_correctionsfullname)
+
+# read the stored corrections for adults into a dictionary
+adult_correctionsfilename = 'adult_storedcorrections.txt'
+adult_correctionsfullname = os.path.join(correctionspath, adult_correctionsfilename)
+adult_correctionsdict = getstoredcorrections(adult_correctionsfullname)
+
+
 
 if __name__ == '__main__':
     tryme()
diff --git a/src/sastadev/corrector.py b/src/sastadev/corrector.py
@@ -9,7 +9,8 @@
 from sastadev.alpino import getdehetwordinfo
 from sastadev.basicreplacements import (basicexpansions, basicreplacementpairs, basicreplacements,
                                         getdisambiguationdict, parsereplacements)
-from sastadev.childesspellingcorrector import correctspelling, allfrqdict
+from sastadev.childesspellingcorrector import (adult_correctionsdict, adult_correctspelling,
+                                               children_correctionsdict, children_correctspelling,  allfrqdict)
 from sastadev.correctionparameters import CorrectionParameters
 from sastadev.cleanCHILDEStokens import cleantokens
 from sastadev.conf import settings
@@ -22,7 +23,8 @@
 from sastadev.find_ngram import (Ngram, findmatches, ngram1, ngram2, ngram7,
                                  ngram10, ngram11, ngram16, ngram17)
 from sastadev.history import (childescorrections, childescorrectionsexceptions, mergecorrections, putcorrections,
-                              samplecorrections,  samplecorrectionsfullname)
+                              children_samplecorrections,  children_samplecorrectionsfullname,
+                              adult_samplecorrections,  adult_samplecorrectionsfullname)
 from sastadev.iedims import getjeforms
 from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het,
                               informlexicon, isa_namepart, isa_inf, isa_vd, known_word,
@@ -1264,7 +1266,7 @@ def getalternativetokenmds(tokenmd: TokenMD,  tokens: List[Token], tokenctr: int
         not known_word(token.word) and \
         token.word in correctionparameters.allsamplecorrections and \
             token.word not in childescorrectionsexceptions:
-        cc = samplecorrections[token.word]
+        cc = correctionparameters.allsamplecorrections[token.word]
         sumfrq = sum([hc.frequency for hc in cc])
         for hc in cc:
             relfrq = hc.frequency / sumfrq
@@ -1453,10 +1455,17 @@ def getalternativetokenmds(tokenmd: TokenMD,  tokens: List[Token], tokenctr: int
 
 
     if correctionparameters.options.dospellingcorrection  and \
-            correctionparameters.method in {'tarsp', 'stap'} and not known_word(token.word) and applyspellingcorrectionisok(token.word) and \
+             not known_word(token.word) and applyspellingcorrectionisok(token.word) and \
             not schwandropfound and not postviefound and not token.word[0].isupper() and not deduplicated and \
             not(token.word.endswith('ie') or token.word.endswith('ies')) and token.word[-3:] not in vvs:
-        corrtuples = correctspelling(token.word, max=5)
+        if correctionparameters.method in {'tarsp', 'stap'}:
+            corrtuples = children_correctspelling(token.word, children_correctionsdict, max=5)
+        elif correctionparameters.method in {'asta'}:
+            corrtuples = []
+            # put off because it causes a lot of errors: the X-words should all have been removed
+            # corrtuples = adult_correctspelling(token.word, adult_correctionsdict, max=5)
+        else:
+            corrtuples = []
         for corr, penalty in corrtuples:
             if corr != token.word and known_word(corr):
                 newtokenmds = updatenewtokenmds(newtokenmds, token, [corr], beginmetadata,

diff --git a/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt
@@ -35,3 +35,5 @@ Sesamstraat
 koffie
 Smarties
 omie
+m'n
+z'n
diff --git a/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt
@@ -1,4 +1,5 @@
 goed	100,
+ha	100,
 he	120
 hè	120
 hoor	003

diff --git a/src/sastadev/history.py b/src/sastadev/history.py
@@ -14,7 +14,8 @@
 childescorrectionspath = os.path.join(settings.SD_DIR, 'data', 'childescorrections')
 
 childescorrectionsfullname = os.path.join(childescorrectionspath, 'childescorrections.txt')
-samplecorrectionsfullname = os.path.join(childescorrectionspath, 'samplecorrections.txt')
+children_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'children_samplecorrections.txt')
+adult_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'adult_samplecorrections.txt')
 donefilesfullname = os.path.join(childescorrectionspath, 'donefiles.txt')
 
 @dataclass
@@ -142,7 +143,8 @@ def mergecorrections(corrections1: HistoryCorrectionDict, corrections2: HistoryC
                                [tpl[0] for tpl in innereplacements] + \
                                [tpl[0] for tpl in innureplacements]
 
-samplecorrections = getcorrections(samplecorrectionsfullname)
+children_samplecorrections = getcorrections(children_samplecorrectionsfullname)
+adult_samplecorrections = getcorrections(adult_samplecorrectionsfullname)
 donefiles = getdonefilenames(donefilesfullname)
 
 junk = 0
diff --git a/src/sastadev/methods.py b/src/sastadev/methods.py
@@ -89,7 +89,7 @@ def astalemmafilter(query: Query, xrs: ExactResultsDict, xr: ExactResult) -> boo
     for (qid, val) in xrs:
         if qid == lemmaqid:
             if xr in xrs[(qid, val)]:
-                result1 = xr in xrs[lexreskey] or xr in xrs[nreskey]
+                result1 = (lexreskey in xrs and xr in xrs[lexreskey]) or (nreskey in xrs and xr in xrs[nreskey])
                 result = query.process == pre_process or result1
                 return result
 

diff --git a/src/sastadev/smartcompoundcomparison.py b/src/sastadev/smartcompoundcomparison.py
@@ -81,7 +81,8 @@ def main():
          'sinterklaas_paardje', True),
         ('kippes', 'kippies', 'kip_pies', True),
         ('diehoek', 'driehoek', 'drie_hoek', True),
-        ('jantauto', 'brandweerauto', 'brandweer_auto', True)
+        ('jantauto', 'brandweerauto', 'brandweer_auto', True),
+        ('koekklok', 'koekoeksklok', 'koekoek_klok', True)
 
     ]
     # testlist = [('risstengeltjes', 'rietstengeltjes', 'riet_stengel', True)]
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,3 +35,5 @@ Sesamstraat @@
     koffie
     Smarties
     omie
+    m'n
+    z'n