From 50a615a7ea71c3cae622cf9533cd383fb6e2234a Mon Sep 17 00:00:00 2001 From: Jan Odijk Date: Fri, 20 Sep 2024 17:56:41 +0200 Subject: [PATCH] adult v. children spelling correction --- src/sastadev/__main__.py | 12 ++++- src/sastadev/astaforms.py | 2 +- src/sastadev/childesspellingcorrector.py | 52 ++++++++++++++++--- src/sastadev/corrector.py | 19 +++++-- .../additionalwordslexicon.txt | 2 + .../filledpauseslexicon/vuwordslexicon.txt | 1 + src/sastadev/history.py | 6 ++- src/sastadev/methods.py | 2 +- src/sastadev/smartcompoundcomparison.py | 3 +- 9 files changed, 80 insertions(+), 19 deletions(-) diff --git a/src/sastadev/__main__.py b/src/sastadev/__main__.py index eaab6cb..e07d7d8 100644 --- a/src/sastadev/__main__.py +++ b/src/sastadev/__main__.py @@ -166,7 +166,8 @@ from sastadev.external_functions import str2functionmap from sastadev.goldcountreader import get_goldcounts from sastadev.history import (donefiles, donefilesfullname, gathercorrections, mergecorrections, putcorrections, - putdonefilenames, samplecorrections, samplecorrectionsfullname) + putdonefilenames, children_samplecorrections, children_samplecorrectionsfullname, + adult_samplecorrections, adult_samplecorrectionsfullname) from sastadev.macros import expandmacros from sastadev.methods import Method, supported_methods, treatmethod from sastadev.mismatches import exactmismatches, literalmissedmatches @@ -1175,6 +1176,15 @@ def main(): thissamplecorrections = {} if options.dohistory: + if options.methodname.lower() in {'tarsp', 'stap'}: + samplecorrectionsfullname = children_samplecorrectionsfullname + samplecorrections = children_samplecorrections + elif options.methodname.lower() in {'asta'}: + samplecorrectionsfullname = adult_samplecorrectionsfullname + samplecorrections = adult_samplecorrections + else: # should not occur + settings.LOGGER.error(f'Illegal method name used: {options.method}') + exit(-1) if corr != corr0: reducedtreebankfullname = os.path.relpath(options.infilename, start=settings.DATAROOT) if reducedtreebankfullname not in donefiles: diff --git a/src/sastadev/astaforms.py b/src/sastadev/astaforms.py index ce42741..6a1a5bd 100644 --- a/src/sastadev/astaforms.py +++ b/src/sastadev/astaforms.py @@ -438,6 +438,6 @@ def getlemmafreqs(allresults, lexicalreskey) -> Dict[str, int]: if qid == lemmaqid: lemma = reskey[1] for position in allresults.exactresults[reskey]: - if position in allresults.exactresults[lexicalreskey]: + if lexicalreskey in allresults.exactresults and position in allresults.exactresults[lexicalreskey]: dict[lemma] += 1 return dict diff --git a/src/sastadev/childesspellingcorrector.py b/src/sastadev/childesspellingcorrector.py index 078ecc1..9888ba6 100644 --- a/src/sastadev/childesspellingcorrector.py +++ b/src/sastadev/childesspellingcorrector.py @@ -46,11 +46,8 @@ def getchildesfrq() -> Tuple[FrqDict, FrqDict, FrqDict]: # function to read the stored corrections into a dictionary -def getstoredcorrections() -> Dict[str, List[Tuple[str, int]]]: +def getstoredcorrections(correctionsfullname) -> Dict[str, List[Tuple[str, int]]]: correctionsdict = {} - correctionsfilename = 'storedcorrections.txt' - correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections') - correctionsfullname = os.path.join(correctionspath, correctionsfilename) idata = readcsv(correctionsfullname) for i, row in idata: @@ -70,7 +67,7 @@ def getpenalty(score, total): # a function for spelling correction -def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[str, int]]: +def children_correctspelling(word: str, correctionsdict, max = None, threshold=okthreshold) -> List[Tuple[str, int]]: if word in correctionsdict: return correctionsdict[word] else: @@ -115,12 +112,40 @@ def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[ return result + +def adult_correctspelling(word: str, correctionsdict,max = None, threshold=okthreshold) -> List[Tuple[str, int]]: + if word in correctionsdict: + return correctionsdict[word] + else: + corrections = spell.candidates(word) + if corrections is not None: + corrtuples = [(corr, spell.word_usage_frequency(corr)) for corr in corrections] + else: + corrtuples = [] + + sortedcorrtuples = sorted(corrtuples, key=lambda x: x[1], reverse=True) + allfrqsum = sum(corrtuple[1]for corrtuple in sortedcorrtuples) + + result = [(corr, getpenalty(score, allfrqsum)) for (corr, score) in sortedcorrtuples] + + if max is not None: + result = result[:max] + + # store the result in the dictionary; write dictionary to file + + return result + + def tryme(): words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal' , ' beete' , 'kamm', 'daaistoel', 'oelen', 'tein'] for word in words: - result = correctspelling(word, max=5) + result = children_correctspelling(word, children_correctionsdict, max=5) print(f'{word}: {result}' ) + words = ['motariek', 'silase'] + for word in words: + result = adult_correctspelling(word, adult_correctionsdict, max=5) + print(f'{word}: {result}' ) @@ -129,8 +154,19 @@ def tryme(): # read the childes frequency dict in, for targets and others and combine them also trgfrqdict, othfrqdict, allfrqdict = getchildesfrq() -# read the stored corrections into a dictionary -correctionsdict = getstoredcorrections() +# read the stored corrections for children into a dictionary +children_correctionsfilename = 'children_storedcorrections.txt' +correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections') +children_correctionsfullname = os.path.join(correctionspath, children_correctionsfilename) +children_correctionsdict = getstoredcorrections(children_correctionsfullname) + +# read the stored corrections for adults into a dictionary +adult_correctionsfilename = 'adult_storedcorrections.txt' +correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections') +adult_correctionsfullname = os.path.join(correctionspath, adult_correctionsfilename) +adult_correctionsdict = getstoredcorrections(adult_correctionsfullname) + + if __name__ == '__main__': tryme() \ No newline at end of file diff --git a/src/sastadev/corrector.py b/src/sastadev/corrector.py index c4e342d..33c8dda 100644 --- a/src/sastadev/corrector.py +++ b/src/sastadev/corrector.py @@ -9,7 +9,8 @@ from sastadev.alpino import getdehetwordinfo from sastadev.basicreplacements import (basicexpansions, basicreplacementpairs, basicreplacements, getdisambiguationdict, parsereplacements) -from sastadev.childesspellingcorrector import correctspelling, allfrqdict +from sastadev.childesspellingcorrector import (adult_correctionsdict, adult_correctspelling, + children_correctionsdict, children_correctspelling, allfrqdict) from sastadev.correctionparameters import CorrectionParameters from sastadev.cleanCHILDEStokens import cleantokens from sastadev.conf import settings @@ -22,7 +23,8 @@ from sastadev.find_ngram import (Ngram, findmatches, ngram1, ngram2, ngram7, ngram10, ngram11, ngram16, ngram17) from sastadev.history import (childescorrections, childescorrectionsexceptions, mergecorrections, putcorrections, - samplecorrections, samplecorrectionsfullname) + children_samplecorrections, children_samplecorrectionsfullname, + adult_samplecorrections, adult_samplecorrectionsfullname) from sastadev.iedims import getjeforms from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het, informlexicon, isa_namepart, isa_inf, isa_vd, known_word, @@ -1264,7 +1266,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int not known_word(token.word) and \ token.word in correctionparameters.allsamplecorrections and \ token.word not in childescorrectionsexceptions: - cc = samplecorrections[token.word] + cc = correctionparameters.allsamplecorrections[token.word] sumfrq = sum([hc.frequency for hc in cc]) for hc in cc: relfrq = hc.frequency / sumfrq @@ -1453,10 +1455,17 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int if correctionparameters.options.dospellingcorrection and \ - correctionparameters.method in {'tarsp', 'stap'} and not known_word(token.word) and applyspellingcorrectionisok(token.word) and \ + not known_word(token.word) and applyspellingcorrectionisok(token.word) and \ not schwandropfound and not postviefound and not token.word[0].isupper() and not deduplicated and \ not(token.word.endswith('ie') or token.word.endswith('ies')) and token.word[-3:] not in vvs: - corrtuples = correctspelling(token.word, max=5) + if correctionparameters.method in {'tarsp', 'stap'}: + corrtuples = children_correctspelling(token.word, children_correctionsdict, max=5) + elif correctionparameters.method in {'asta'}: + corrtuples = [] + # put off because it causes a lot of errors: the X-words should all have been removed + # corrtuples = adult_correctspelling(token.word, adult_correctionsdict, max=5) + else: + corrtuples = [] for corr, penalty in corrtuples: if corr != token.word and known_word(corr): newtokenmds = updatenewtokenmds(newtokenmds, token, [corr], beginmetadata, diff --git a/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt index 27c3b8d..2e7fb1c 100644 --- a/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt +++ b/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt @@ -35,3 +35,5 @@ Sesamstraat koffie Smarties omie +m'n +z'n \ No newline at end of file diff --git a/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt index c9a87b5..0a269ff 100644 --- a/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt +++ b/src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt @@ -1,4 +1,5 @@ goed 100, +ha 100, he 120 hè 120 hoor 003 diff --git a/src/sastadev/history.py b/src/sastadev/history.py index 441f177..b3980a3 100644 --- a/src/sastadev/history.py +++ b/src/sastadev/history.py @@ -14,7 +14,8 @@ childescorrectionspath = os.path.join(settings.SD_DIR, 'data', 'childescorrections') childescorrectionsfullname = os.path.join(childescorrectionspath, 'childescorrections.txt') -samplecorrectionsfullname = os.path.join(childescorrectionspath, 'samplecorrections.txt') +children_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'children_samplecorrections.txt') +adult_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'adult_samplecorrections.txt') donefilesfullname = os.path.join(childescorrectionspath, 'donefiles.txt') @dataclass @@ -142,7 +143,8 @@ def mergecorrections(corrections1: HistoryCorrectionDict, corrections2: HistoryC [tpl[0] for tpl in innereplacements] + \ [tpl[0] for tpl in innureplacements] -samplecorrections = getcorrections(samplecorrectionsfullname) +children_samplecorrections = getcorrections(children_samplecorrectionsfullname) +adult_samplecorrections = getcorrections(adult_samplecorrectionsfullname) donefiles = getdonefilenames(donefilesfullname) junk = 0 \ No newline at end of file diff --git a/src/sastadev/methods.py b/src/sastadev/methods.py index 52c9724..3d6357e 100644 --- a/src/sastadev/methods.py +++ b/src/sastadev/methods.py @@ -89,7 +89,7 @@ def astalemmafilter(query: Query, xrs: ExactResultsDict, xr: ExactResult) -> boo for (qid, val) in xrs: if qid == lemmaqid: if xr in xrs[(qid, val)]: - result1 = xr in xrs[lexreskey] or xr in xrs[nreskey] + result1 = (lexreskey in xrs and xr in xrs[lexreskey]) or (nreskey in xrs and xr in xrs[nreskey]) result = query.process == pre_process or result1 return result diff --git a/src/sastadev/smartcompoundcomparison.py b/src/sastadev/smartcompoundcomparison.py index 218dbaa..bc15baa 100644 --- a/src/sastadev/smartcompoundcomparison.py +++ b/src/sastadev/smartcompoundcomparison.py @@ -81,7 +81,8 @@ def main(): 'sinterklaas_paardje', True), ('kippes', 'kippies', 'kip_pies', True), ('diehoek', 'driehoek', 'drie_hoek', True), - ('jantauto', 'brandweerauto', 'brandweer_auto', True) + ('jantauto', 'brandweerauto', 'brandweer_auto', True), + ('koekklok', 'koekoeksklok', 'koekoek_klok', True) ] # testlist = [('risstengeltjes', 'rietstengeltjes', 'riet_stengel', True)]