diff --git a/src/sastadev/CHAT_Annotation.py b/src/sastadev/CHAT_Annotation.py
index f343598..d5ddde0 100644
--- a/src/sastadev/CHAT_Annotation.py
+++ b/src/sastadev/CHAT_Annotation.py
@@ -9,9 +9,11 @@
 CHAT = 'CHAT'
 CHAT_explanation = 'Explanation'
-CHAT_wordnoncompletion = 'Noncompletion of a Word'
+CHAT_reformulation = 'Reformulation'
+CHAT_repetition = 'Repetition'
 CHAT_replacement = 'Replacement'
-
+CHAT_retracing = 'Retracing'
+CHAT_wordnoncompletion = 'Noncompletion of a Word'
 
 monadic = 1
 dyadic = 2
@@ -78,7 +80,7 @@ def refunction(x):
 specialformpat = wordpat + r'(?:@z:\w\w\w|@\w\w?\w?)'
 fullspecialformpat = fullre(specialformpat)
 specialformre = re.compile(fullspecialformpat)
-repkeepannotations = ['Repetition', 'Retracing', 'Reformulation']
+repkeepannotations = [CHAT_repetition, CHAT_retracing, CHAT_reformulation]
 
 
 def getreplacement(repkeep, annotation):
@@ -817,7 +819,7 @@ def result(x, y):
                     CHAT_ComplexRegex(
                         (r'\[=', anybutrb, r'\]'), (keep, eps), False),
                     complexmetafunction),
-    CHAT_Annotation('Replacement', '8.3:69', '10.3:73',
+    CHAT_Annotation(CHAT_replacement, '8.3:69', '10.3:73',
                     CHAT_ComplexRegex(
                         (r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True),
                     complexmetafunction_replbpl),
@@ -832,13 +834,13 @@ def result(x, y):
                     CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
     CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic),
                     simplescopedmetafunction),
-    CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_repetition, '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
                     simplescopedmetafunction),
     CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76', CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'),
                                                                                      (keep, eps), True),
                     complexmetafunction),
-    CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_retracing, '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
                     simplescopedmetafunction),
-    CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
+    CHAT_Annotation(CHAT_reformulation, '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
                     simplescopedmetafunction),
     CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77',
                     CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
diff --git a/src/sastadev/__main__.py b/src/sastadev/__main__.py
index e07d7d8..7c1e3b7 100644
--- a/src/sastadev/__main__.py
+++ b/src/sastadev/__main__.py
@@ -155,11 +155,12 @@
                                  ResultsKey, mkresultskey, scores2counts,
                                  showreskey)
 from sastadev.conf import settings
-from sastadev.constants import (bronzefolder, bronzesuffix, checksuffix, checkeditedsuffix,
+from sastadev.constants import (analysissuffix, bronzefolder, bronzesuffix,
+                                byuttscoressuffix, checksuffix, checkeditedsuffix,
                                 formsfolder, intreebanksfolder, loggingfolder,
                                 outtreebanksfolder, permprefix, platinumsuffix, platinumeditedsuffix,
                                 resultsfolder, silverfolder, silverpermfolder,
                                 silversuffix)
+from sastadev.context import getcontextdict
 from sastadev.correctionparameters import CorrectionParameters
 from sastadev.correcttreebank import (correcttreebank, corr0, corrn,
                                       errorwbheader, validcorroptions)
 from sastadev.counterfunctions import counter2liststr
@@ -178,6 +179,8 @@
                                       post_process, query_exists,
                                       query_inform)
 from sastadev.readcsv import writecsv
 from sastadev.readmethod import itemseppattern, read_method
+from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader
+from sastadev.sas_impact import getcomparisoncounts, mksas_impactrows, sas_impact
 from sastadev.sastatypes import (AltCodeDict, ExactResultsDict, FileName,
                                  GoldTuple, MatchesDict, MethodName, QId,
                                  QIdCount, QueryDict, ResultsCounter,
@@ -197,7 +200,7 @@
 from sastadev.treebankfunctions import (find1, getattval, getnodeendmap,
                                         getuttid, getxmetatreepositions,
                                         getxsid, getyield, showtree)
-from sastadev.xlsx import mkworkbook
+from sastadev.xlsx import mkworkbook, add_worksheet
 
 start_time = time.time()
 
@@ -1204,8 +1207,10 @@ def main():
         else:
             mergedsamplecorrections = {}
 
+        contextdict = getcontextdict(treebank2, lambda x: True)
-        correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections, thissamplecorrections)
+        correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections,
+                                                    thissamplecorrections, treebank2, contextdict)
 
         treebank, errordict, allorandalts = correcttreebank(treebank2, targets, correctionparameters, corr=corr)
 
@@ -1274,12 +1279,27 @@ def main():
         silverscores = exact2results(exactsilverscores)  # ongoing
         silvercounts = scores2counts(silverscores)
 
+        # scores by utterance
+        # bronzescoresbyutt = getscoresbyutt(allresults.coreresults, goldscores)
+        # silverscoresbyutt = getscoresbyutt(allresults.coreresults, silverscores)
+
+        byuttrows = mkscoresbyuttrows(allresults, goldscores, silverscores, themethod)
+        not100count = len([row for row in byuttrows if row[9] != 100])
+        scoresbyuttoutfullname = os.path.join(resultspath, corefilename + byuttscoressuffix + '.xlsx')
+        wb = mkworkbook(scoresbyuttoutfullname, [byuttheader], byuttrows, freeze_panes=(1, 0))
+        allbyuttscores = sas_impact(allresults, silverscores, themethod)
+        sasheader, sasimpactrows = mksas_impactrows(allbyuttscores, not100count)
+        add_worksheet(wb, [sasheader], sasimpactrows, sheetname='SAS_impact', freeze_panes=(1, 0))
+        wb.close()
+
+
+        # next is now obsolete
         # platinumresults: Dict[ResultsKey, Counter] = reduceresults(platinumresults, samplesizetuple, options.methodname)
 
         (base, ext) = os.path.splitext(options.infilename)
         outputfullname = os.path.join(
-            resultspath, corefilename + "_analysis" + tsvext + txtext)
+            resultspath, corefilename + analysissuffix + tsvext + txtext)
         outfile = open(outputfullname, 'w', encoding='utf8')
 
         outxlsx = os.path.join(resultspath, corefilename + "_analysis" + xlsxext)
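The scores-by-utterance export above follows the workbook pattern used elsewhere in sastadev: mkworkbook creates a workbook with a first sheet, add_worksheet appends a further sheet to the same workbook, and the caller closes it. A minimal sketch; the file name, header and rows are invented for illustration, and the signatures are inferred from the calls in this diff rather than from the xlsx module itself:

    from sastadev.xlsx import mkworkbook, add_worksheet

    header = ['uttid', 'score']        # hypothetical header
    rows = [['1', 100], ['2', 80]]     # hypothetical data
    wb = mkworkbook('demo_scoresbyutt.xlsx', [header], rows, freeze_panes=(1, 0))
    add_worksheet(wb, [header], rows, sheetname='SAS_impact', freeze_panes=(1, 0))
    wb.close()   # the workbook is written out when it is closed
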
diff --git a/src/sastadev/alpinocompound.py b/src/sastadev/alpinocompound.py
new file mode 100644
index 0000000..2b989fd
--- /dev/null
+++ b/src/sastadev/alpinocompound.py
@@ -0,0 +1,39 @@
+from sastadev.treebankfunctions import find1, getattval
+from sastadev.conf import settings
+from sastadev.lexicon import known_word
+
+
+# move this to the lexicon module
+comma = ','
+compoundsep = '_'
+
+alpinoparse = settings.PARSE_FUNC
+
+
+def isalpinocompound(wrd: str) -> bool:
+    fullstr = f'geen {wrd}'  # geen makes it a noun and can combine with uter and neuter, count and mass, sg and plural
+    tree = alpinoparse(fullstr)
+    # find the noun
+    if tree is None:
+        settings.LOGGER.error(f'Parsing {fullstr} failed')
+        return False
+    nounnode = find1(tree, './/node[@pt="n"]')
+    if nounnode is None:
+        settings.LOGGER.error(f'No noun found in {fullstr} parse')
+        return False
+    nounwrd = getattval(nounnode, 'word')
+    if nounwrd != wrd:
+        settings.LOGGER.error(f'Wrong noun ({nounwrd}) found in {fullstr} parse')
+        return False
+    nounlemma = getattval(nounnode, 'lemma')
+    if compoundsep in nounlemma:
+        parts = nounlemma.split(compoundsep)
+        unknownparts = [part for part in parts if not known_word(part)]
+        result = unknownparts == []
+        if not result:
+            settings.LOGGER.error(f'Unknown words ({comma.join(unknownparts)}) found in {fullstr} parse')
+            return False
+        return True
+    else:
+        return False
+
diff --git a/src/sastadev/basicreplacements.py b/src/sastadev/basicreplacements.py
index afdb220..c4057c0 100644
--- a/src/sastadev/basicreplacements.py
+++ b/src/sastadev/basicreplacements.py
@@ -138,7 +138,7 @@ def combine(strlist: List[str]) -> str:
 #: .. autodata:: sastadev.basicreplacements::innureplacements
 #:    :no-value:
 #:
-basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, dp),
+basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, -2 * dp),
                                                 ('isse', 'is', pron, infpron, addschwa, mp(10)),
                                                 ('ooke', 'ook', pron, infpron, addschwa, dp),
                                                 ('t', "'t", orth, spellerr, apomiss, dp),
@@ -234,7 +234,7 @@ def combine(strlist: List[str]) -> str:
                                                 ('əs', 'eens', pron, infpron, reduction, dp),
                                                 ('moetə', 'moeten', pron, infpron, infpron, dp),
                                                 ('moetə', 'moet', pron, infpron, infpron, dp),
-                                                ('pot', 'kapot', pron, infpron, sylldrop, dp),
+                                                ('pot', 'kapot', pron, infpron, sylldrop, -2 * dp),
                                                 ('almaal', 'allemaal', pron, infpron, sylldrop, dp),
                                                 ('knorrens', 'varkens', lexical, substitution, onom, dp),
                                                 ('potte', 'kapot', pron, infpron, combine([sylldrop, emphasis]), dp),
@@ -325,8 +325,8 @@ def combine(strlist: List[str]) -> str:
                                                 ('as-t-ie', ['als', 'ie'], pron, infpron, t_ie, dp),
                                                 ("dit's", ["dit", "is"], pron, infpron, contract, dp),
                                                 ("dat's", ["dat", "is"], pron, infpron, contract, dp),
-                                                ("datte", ['dat', 'ie'], pron, infpron, contract, mp(120)),
-                                                ("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(120)),
+                                                ("datte", ['dat', 'ie'], pron, infpron, contract, mp(220)),
+                                                ("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(220)),
                                                 ("isda", ['is', 'dat'], pron, infpron, contract, dp + 2),
                                                 ("tisda", ['het', 'is', 'dat'], pron, infpron, contract, mp(120)),
                                                 ("'savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
@@ -446,7 +446,8 @@ def getdisambiguationdict() -> Dict[str, Tuple[TokenTreePredicate, str]]:
             disambiguationdict[w] = cond, repl
     return disambiguationdict
 
-parsereplacementslist = [('smarties', 'toffees', alpino_unknown_word, -2*dp),
-                         ('Smarties', 'toffees', alpino_unknown_word, -2*dp)]
+# next replaced by wordsunknowntoalpino lexicon
+#parsereplacementslist = [('smarties', 'toffees', alpino_unknown_word, -2*dp),
+#                         ('Smarties', 'toffees', alpino_unknown_word, -2*dp)]
 
-parsereplacements = {el[0]:el for el in parsereplacementslist}
\ No newline at end of file
+# parsereplacements = {el[0]:el for el in parsereplacementslist}
\ No newline at end of file
diff --git a/src/sastadev/childesspellingcorrector.py b/src/sastadev/childesspellingcorrector.py
index 9888ba6..2ee9bc8 100644
--- a/src/sastadev/childesspellingcorrector.py
+++ b/src/sastadev/childesspellingcorrector.py
@@ -138,6 +138,7 @@ def adult_correctspelling(word: str, correctionsdict,max = None, threshold=okthr
 def tryme():
     words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal', ' beete', 'kamm', 'daaistoel', 'oelen', 'tein']
+    words = ['poppe']
     for word in words:
         result = children_correctspelling(word, children_correctionsdict, max=5)
         print(f'{word}: {result}')
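The isalpinocompound helper added in alpinocompound.py above decides compound-hood by parsing "geen <word>" with Alpino and checking that every part of the resulting noun lemma is a known word. A sketch of the intended call pattern, assuming settings.PARSE_FUNC points to a working Alpino parser; the example word is illustrative:

    from sastadev.alpinocompound import isalpinocompound

    # 'voetbalschoen' should receive the lemma 'voetbal_schoen'; both parts are
    # known words, so True is the expected outcome (the actual result depends
    # on the parser)
    print(isalpinocompound('voetbalschoen'))
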
diff --git a/src/sastadev/constants.py b/src/sastadev/constants.py
index e06dc31..f6cf770 100644
--- a/src/sastadev/constants.py
+++ b/src/sastadev/constants.py
@@ -29,6 +29,7 @@
 analysissuffix = '_analysis'
 analysistsvsuffix = '_analysis.tsv'
 bronzesuffix = '_bronze'
+byuttscoressuffix = '_scoresbyutt'
 silversuffix = '_silver'
 correctionrefsuffix = '_correctionref'
 permprefix = 'perm_'
diff --git a/src/sastadev/context.py b/src/sastadev/context.py
new file mode 100644
index 0000000..31279c3
--- /dev/null
+++ b/src/sastadev/context.py
@@ -0,0 +1,199 @@
+from collections import defaultdict
+from editdistance import distance
+from lxml import etree
+import os
+from sastadev.conf import settings
+from sastadev.constants import intreebanksfolder, outtreebanksfolder
+from sastadev.filefunctions import getbasename
+from sastadev.lexicon import known_word
+from sastadev.sastatypes import TreeBank, SynTree
+from sastadev.treebankfunctions import getattval, getorigutt, getmeta, getxsid
+from sastadev.xlsx import mkworkbook
+from typing import Callable, List, Tuple
+
+comma = ','
+
+realwordnodesxpath = """.//node[@word and @pt!='tsw' and @pt!='let']"""
+
+redthreshold = 0.5
+
+
+def relativedistance(wrd1: str, wrd2: str) -> float:
+    dist = distance(wrd1, wrd2)
+    maxl = max(len(wrd1), len(wrd2))
+    result = dist / maxl
+    return result
+
+def mustbedone(stree: SynTree) -> bool:
+    origutt = getorigutt(stree)
+    xsid = getmeta(stree, 'xsid')
+    result = (origutt is not None and '[+ G]' in origutt) or xsid is not None
+    return result
+
+def getcontextdict(treebank: TreeBank, cond: Callable) -> dict:
+    resultdict = defaultdict(lambda: defaultdict(tuple))
+    for tree in treebank:
+        if cond(tree):
+            xsid = getxsid(tree)
+            if xsid == '0':
+                continue
+            realwordnodes = tree.xpath(realwordnodesxpath)
+            words = [getattval(wordnode, 'word') for wordnode in realwordnodes]
+            wrongwords = [word for word in words if len(word) > 4 and not known_word(word)]
+            origutt = getorigutt(tree)
+            if wrongwords == []:
+                continue
+            for wrongword in wrongwords:
+                prevcontext = getcontext(tree, treebank, -5, nottargetchild)
+                postcontext = getcontext(tree, treebank, +5, nottargetchild)
+                prevbestwords = findbestwords(wrongword, prevcontext, lambda x: True)
+                postbestwords = findbestwords(wrongword, postcontext, lambda x: True)
+                resultdict[xsid][wrongword] = (prevbestwords, postbestwords)
+    return resultdict
+
+def getcontext(stree: SynTree, tb: TreeBank, size: int, cond: Callable) -> List[SynTree]:
+    """
+    get the context of *stree* in treebank *tb* with size *size*. Each stree in the context must satisfy condition *cond*.
+    Negative size means preceding context, positive size means following context. Preceding context is delivered reversed
+    :param stree:
+    :param tb:
+    :param size:
+    :param cond: SynTree -> bool
+    :return:
+    """
+    if size == 0:
+        return []
+    streeindex = tb.index(stree)
+    ltb = len(tb)
+    context = []
+    incr = +1 if size > 0 else -1
+    curindex = streeindex + incr
+    # walk outward from *stree*; stop at either end of the treebank so that a
+    # negative index can never wrap around to the other side of the list
+    while 0 <= curindex < ltb and len(context) < abs(size):
+        curtree = tb[curindex]
+        if cond(curtree):
+            context.append(curtree)
+        curindex = curindex + incr
+    # if size < 0:
+    #     context = reversed(context)
+    return context
+
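+# Illustration (hypothetical six-utterance treebank tb = [u0, ..., u5], stree = u3):
+#   getcontext(u3, tb, -5, cond) inspects u2, u1, u0, nearest utterance first
+#   getcontext(u3, tb, +5, cond) inspects u4, u5
+# Only trees for which cond(tree) holds are collected, at most abs(size) of them.
+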
+def getwordnodetuplesfromcontext(context: List[SynTree], cond: Callable) -> List[Tuple[SynTree, int]]:
+    """
+    get the real word nodes from the context with their distance if the word satisfies the condition *cond*.
+    Real words are words that are not interjections or punctuation (pt != 'tsw' and pt != 'let').
+    :param context: List[SynTree]. Preceding context must be in reverse order
+    :param cond: str -> bool
+    :return:
+    """
+    results = []
+    for i, stree in enumerate(context):
+        rawwordnodes = stree.xpath(realwordnodesxpath)
+        wordnodes = [wordnode for wordnode in rawwordnodes if cond(getattval(wordnode, 'word'))]
+        streeresults = [(wordnode, i) for wordnode in wordnodes]
+        results += streeresults
+    return results
+
+def getwordtuplesfromcontext(context: List[SynTree], cond: Callable) -> List[Tuple[str, int]]:
+    wntuples = getwordnodetuplesfromcontext(context, cond)
+    wtuples = [(getattval(wn, 'word'), i) for wn, i in wntuples]
+    return wtuples
+
+def getlemmatuplesfromcontext(context: List[SynTree], cond: Callable) -> List[Tuple[str, int]]:
+    wntuples = getwordnodetuplesfromcontext(context, cond)
+    wtuples = [(getattval(wn, 'lemma'), i) for wn, i in wntuples]
+    return wtuples
+
+
+def islemmaincontext(wordnode: SynTree, context: List[SynTree]) -> bool:
+    lemmatuples = getlemmatuplesfromcontext(context, lambda x: True)
+    lemmas = [lemma for lemma, i in lemmatuples]
+    thislemma = getattval(wordnode, 'lemma')
+    result = thislemma in lemmas
+    return result
+
+def findbestwords(wrongword: str, context: List[SynTree], cond: Callable) -> List[str]:
+    """
+    find the word *w* in the context that satisfies *cond*, and scores highest on the properties
+    (-relative_edit_distance(wrongword, w), -distance)
+    :param wrongword:
+    :param context:
+    :param cond:
+    :return:
+    """
+    rawresults = []
+    wntuples = getwordnodetuplesfromcontext(context, cond)
+    for wordnode, dist in wntuples:
+        word = getattval(wordnode, 'word')
+        red = relativedistance(wrongword, word)
+        if red < redthreshold:
+            rawresults.append((word, red, dist))
+    sortedresults = sorted(rawresults, key=lambda x: (x[1], x[2]))
+    if sortedresults != []:
+        first = sortedresults[0]
+        filteredresults = [rawresult for rawresult in sortedresults if (rawresult[1], rawresult[2]) == (first[1], first[2])]
+    else:
+        filteredresults = []
+    finalresults = [w for (w, red, dst) in filteredresults if len(w) > 4]
+    return finalresults
+
+def nottargetchild(stree: SynTree) -> bool:
+    spk = getmeta(stree, 'speaker')
+    result = spk != 'CHI'
+    return result
+
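+# Worked example for findbestwords (hand-computed, with invented words):
+#   relativedistance('tein', 'trein')  = 1/5 = 0.2   -> below redthreshold (0.5), kept
+#   relativedistance('tein', 'konijn') = 4/6 ≈ 0.67  -> rejected
+# If 'trein' is the closest surviving candidate, findbestwords('tein', context,
+# lambda x: True) is expected to return ['trein'], since 'trein' also passes
+# the len(w) > 4 filter.
+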
+def main():
+    # read auristrain DLD03 in as test treebank
+    # filename = 'DLD03.xml'
+    # filename = 'DLD11.xml'
+    dataset = 'auristrain'
+    table = []
+    datasets = ['auristrain', 'vkltarsp', 'vklstap', 'vklasta', 'vklstapfase2', 'vklastafase2', 'auristest']
+    for dataset in datasets:
+        fullpath = os.path.join(settings.DATAROOT, dataset, outtreebanksfolder)
+        filenames = os.listdir(fullpath)
+        # filenames= ['TD21.xml']
+        for filename in filenames:
+            sample = getbasename(filename)
+            # print(f'Sample: {sample}')
+            infullname = os.path.join(fullpath, filename)
+            fulltreebank = etree.parse(infullname)
+            if fulltreebank is None:
+                print(f'No treebank {infullname} found, aborting')
+                exit(-1)
+            treebank = fulltreebank.getroot()
+            for tree in treebank:
+                if mustbedone(tree):
+                    realwordnodes = tree.xpath(realwordnodesxpath)
+                    words = [getattval(wordnode, 'word') for wordnode in realwordnodes]
+                    wrongwords = [word for word in words if len(word) > 4 and not known_word(word)]
+                    origutt = getorigutt(tree)
+                    if wrongwords == []:
+                        continue
+                    # print(f'UTT: {origutt}. Best corrections:')
+                    for wrongword in wrongwords:
+                        # print(f'wrong word: {wrongword}')
+                        prevcontext = getcontext(tree, treebank, -5, nottargetchild)
+                        postcontext = getcontext(tree, treebank, +5, nottargetchild)
+                        prevbestwords = findbestwords(wrongword, prevcontext, lambda x: True)
+                        # print(f'Preceding context: {comma.join(prevbestwords)}')
+                        postbestwords = findbestwords(wrongword, postcontext, lambda x: True)
+                        # print(f'Post context: {comma.join(postbestwords)}')
+                        row = [dataset, sample, wrongword, comma.join(prevbestwords), comma.join(postbestwords), origutt]
+                        table.append(row)
+
+    header = ['dataset', 'sample', 'wrongword', 'prev', 'post', 'origutt']
+    outfilename = 'contextcorrections.xlsx'
+    outpath = os.path.join(settings.SD_DIR, 'data', 'contextcorrections')
+    if not os.path.exists(outpath):
+        os.makedirs(outpath)
+    outfullname = os.path.join(outpath, outfilename)
+    wb = mkworkbook(outfullname, [header], table, freeze_panes=(1, 0))
+    wb.close()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/sastadev/correctionparameters.py b/src/sastadev/correctionparameters.py
index 280ebab..eed9631 100644
--- a/src/sastadev/correctionparameters.py
+++ b/src/sastadev/correctionparameters.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from sastadev.sastatypes import MethodName
+from sastadev.sastatypes import MethodName, TreeBank
 
 @dataclass
 class CorrectionParameters:
@@ -7,3 +7,5 @@ class CorrectionParameters:
     options: dict
     allsamplecorrections : dict
     thissamplecorrections: dict
+    treebank: TreeBank
+    contextdict: dict
diff --git a/src/sastadev/corrector.py b/src/sastadev/corrector.py
index 2b7b78f..06abd86 100644
--- a/src/sastadev/corrector.py
+++ b/src/sastadev/corrector.py
@@ -8,9 +8,11 @@
 from sastadev.alpino import getdehetwordinfo
 from sastadev.basicreplacements import (basicexpansions,
                                         basicreplacementpairs,
                                         basicreplacements,
-                                        getdisambiguationdict, parsereplacements)
+                                        getdisambiguationdict)
+from sastadev.CHAT_Annotation import CHAT_retracing
 from sastadev.childesspellingcorrector import (adult_correctionsdict, adult_correctspelling,
                                                children_correctionsdict, children_correctspelling,
                                                allfrqdict)
+from sastadev.context import findbestwords, getcontext
 from sastadev.correctionparameters import CorrectionParameters
 from sastadev.cleanCHILDEStokens import cleantokens
 from sastadev.conf import settings
@@ -27,33 +29,39 @@
                                        adult_samplecorrections, adult_samplecorrectionsfullname)
 from sastadev.iedims import getjeforms
 from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het,
-                              informlexicon, isa_namepart, isa_inf, isa_vd, known_word,
-                              tswnouns, vuwordslexicon, wordsunknowntoalpinolexicondict)
+                              informlexicon, isa_namepart, isa_inf, isa_vd, known_word, nochildword,
+                              tswnouns, validword, vuwordslexicon, wordsunknowntoalpinolexicondict)
 from sastadev.macros import expandmacros
 from sastadev.metadata import (Meta, bpl_word_delprec, bpl_indeze, bpl_node,
                                bpl_none, bpl_word, bpl_wordlemma,
                                defaultbackplacement, defaultpenalty,
                                filled_pause, fstoken, intj, janeenou, longrep,
                                mkSASTAMeta, modifypenalty as mp, repeated,
                                repeatedjaneenou, repeatedseqtoken, shortrep,
-                               substringrep, unknownsymbol)
+                               substringrep, unknownsymbol,
+                               SASTA, ADULTSPELLINGCORRECTION, ALLSAMPLECORRECTIONS, BASICREPLACEMENTS, CONTEXT, HISTORY, THISSAMPLECORRECTIONS,
+                               CHILDRENSPELLINGCORRECTION,
+                               EXTRAGRAMMATICAL
+                               )
 from sastadev.sasta_explanation import explanationasreplacement
 from sastadev.sastatoken import Token, tokenlist2stringlist
 from sastadev.sastatypes import (BackPlacement, MethodName, Nort, Penalty,
                                  Position, SynTree, UttId)
 from sastadev.smallclauses import smallclauses
 from sastadev.stringfunctions import (chatxxxcodes, consonants,
                                       dutchdeduplicate,
-                                      endsinschwa, fullworddehyphenate,
+                                      endsinschwa, fullworddehyphenate, ispunctuation,
                                       monosyllabic, vowels)
 from sastadev.sva import getsvacorrections
 from sastadev.toe import lonelytoe
 from sastadev.tokenmd import TokenListMD, TokenMD, mdlist2listmd
-from sastadev.treebankfunctions import (fatparse, getattval, getnodeyield, keycheck,
+from sastadev.treebankfunctions import (fatparse, getattval, getmeta, getnodeyield, getxsid, keycheck,
                                         showtree)
 
 Correction = Tuple[List[Token], List[Meta]]
 MetaCondition = Callable[[Meta], bool]
 
-SASTA = 'SASTA'
+basepenalties = {ADULTSPELLINGCORRECTION: 600, ALLSAMPLECORRECTIONS: 400, BASICREPLACEMENTS: 100, CHILDRENSPELLINGCORRECTION: 600,
+                 CONTEXT: 200, HISTORY: 500, THISSAMPLECORRECTIONS: 300
+                 }
 
 tarsp = 'tarsp'
 stap = 'stap'
@@ -172,6 +180,15 @@ def skiptokens(tokenlist: List[Token], skiptokenlist: List[Token]) -> List[Token
     return resultlist
 
 
+def speakeristargetchild(stree: SynTree) -> bool:
+    role = getmeta(stree, 'role')
+    result = role is not None and role.lower() == 'target_child'
+    return result
+
+def nottargetchild(stree: SynTree) -> bool:
+    result = not speakeristargetchild(stree)
+    return result
+
 def ngramreduction(reducedtokens: List[Token], token2nodemap: Dict[Token, SynTree],
                    allremovetokens: List[Token], allremovepositions: List[Position],
                    allmetadata: List[Meta], ngramcor: Ngramcorrection) \
         -> Tuple[List[Token], List[Token], List[Meta]]:
@@ -261,7 +278,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
         allremovetokens += unwantedtokens
         allremovepositions += unwantedpositions
         reducedtokens = [n for n in reducedtokens if n not in unwantedtokens]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL,
                                 unknownsymbol, 'Syntax') for token in unwantedtokens]
         allmetadata += metadata
 
@@ -275,7 +292,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
             tok for tok in reducedtokens if tok not in filledpausetokens]
         reducednodes = [token2nodemap[tok.pos]
                         for tok in reducedtokens if keycheck(tok.pos, token2nodemap)]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL,
                                 filled_pause, 'Syntax') for token in filledpausetokens]
         allmetadata += metadata
 
@@ -285,7 +302,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
         ]
         allremovetokens += vutokens
         reducedtokens = [n for n in reducedtokens if n not in vutokens]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL,
                                 intj, 'Syntax') for token in vutokens]
         allmetadata += metadata
 
@@ -302,7 +319,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
         allremovetokens += tswtokens
         allremovepositions == tswpositions
         reducedtokens = [n for n in reducedtokens if n not in tswtokens]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL,
                                 intj, 'Syntax') for token in tswtokens]
         allmetadata += metadata
 
@@ -315,7 +332,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
             n for n in reducedtokens if n not in janeenouduplicatenodes]
         reducednodes = [token2nodemap[tok.pos]
                         for tok in reducedtokens if keycheck(tok.pos, token2nodemap)]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical', repeatedjaneenou, 'Syntax', subcat=repetition)
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL, repeatedjaneenou, 'Syntax', subcat=repetition)
                     for token in janeenouduplicatenodes]
         allmetadata += metadata
 
@@ -329,7 +346,7 @@ def reduce(tokens: List[Token], tree: Optional[SynTree]) -> Tuple[List[Token], L
         allremovetokens += janeenoutokens
         allremovepositions += janeenoupositions
         reducedtokens = [tok for tok in reducedtokens if tok not in janeenoutokens]
-        metadata = [mkSASTAMeta(token, token, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, token, EXTRAGRAMMATICAL,
                                 janeenou, 'Syntax') for token in janeenoutokens]
         allmetadata += metadata
 
@@ -347,7 +364,7 @@ def cond(x: Nort, y: Nort) -> bool:
         allremovetokens += shortprefixtokens
         allremovepositions += shortprefixpositions
         metadata = [
-            mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', shortrep, 'Tokenisation', subcat=repetition) for
+            mkSASTAMeta(token, repeatedtokens[token], EXTRAGRAMMATICAL, shortrep, 'Tokenisation', subcat=repetition) for
             token in reducedtokens if token in repeatedtokens]
         allmetadata += metadata
         reducedtokens = [
@@ -363,7 +380,7 @@ def longcond(x: Nort, y: Nort) -> bool:
         allremovetokens += longprefixtokens
         allremovepositions += longprefixpositions
         metadata = [
-            mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', longrep, 'Tokenisation', subcat=repetition) for
+            mkSASTAMeta(token, repeatedtokens[token], EXTRAGRAMMATICAL, longrep, 'Tokenisation', subcat=repetition) for
            token in reducedtokens if token in repeatedtokens]
         allmetadata += metadata
         reducedtokens = [
@@ -375,7 +392,7 @@ def longcond(x: Nort, y: Nort) -> bool:
         repeatedtokens = getrepeatedtokens(reducedtokens, substringtokens)
         allremovetokens += substringtokens
         allremovepositions += substringpositions
-        metadata = [mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', substringrep, 'Tokenisation',
+        metadata = [mkSASTAMeta(token, repeatedtokens[token], EXTRAGRAMMATICAL, substringrep, 'Tokenisation',
                                 subcat=repetition) for token in reducedtokens if token in repeatedtokens]
         allmetadata += metadata
         reducedtokens = [
@@ -387,7 +404,7 @@ def longcond(x: Nort, y: Nort) -> bool:
         repeatedtokens = getrepeatedtokens(reducedtokens, dupnodetokens)
         allremovetokens += dupnodetokens
         allremovepositions += dupnodepositions
-        metadata = [mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, repeatedtokens[token], EXTRAGRAMMATICAL,
                                 repeated, 'Tokenisation', subcat=repetition)
                     for token in reducedtokens if token in repeatedtokens]
         allmetadata += metadata
@@ -405,7 +422,7 @@ def longcond(x: Nort, y: Nort) -> bool:
                 break
         allremovetokens += dupnodetokens
         allremovepositions += dupnodepositions
-        metadata = [mkSASTAMeta(token, nwt, 'ExtraGrammatical',
+        metadata = [mkSASTAMeta(token, nwt, EXTRAGRAMMATICAL,
                                 repeatedseqtoken, 'Tokenisation', subcat=repetition)
                     for token, nwt in duppairs]
         allmetadata += metadata
@@ -430,12 +447,12 @@ def longcond(x: Nort, y: Nort) -> bool:
     def metaf(falsestarttokens: List[Token], falsestartpositions: List[Position], correcttokens: List[Token]) \
             -> List[Meta]:
         return \
-            [Meta('Retracing', 'Retracing with Correction', annotatedposlist=falsestartpositions,
+            [Meta(CHAT_retracing, 'Retracing with Correction', annotatedposlist=falsestartpositions,
                   annotatedwordlist=[c.word for c in falsestarttokens],
                   annotationposlist=[c.pos for c in correcttokens],
                  annotationwordlist=[c.word for c in correcttokens], cat='Retracing', subcat=None, source=SASTA,
                  penalty=defaultpenalty, backplacement=bpl_none)] + \
-            [mkSASTAMeta(ftoken, ctoken, 'Retracing with Correction', fstoken, 'Retracing')
+            [mkSASTAMeta(ftoken, ctoken, 'Retracing with Correction', fstoken, CHAT_retracing)
              for ftoken, ctoken in zip(falsestarttokens, correcttokens)]
 
     vnwpvvnwpvcor = Ngramcorrection(ngram1, (0, 2), (2, 4), metaf)
@@ -583,6 +600,7 @@ def getcorrections(rawtokens: List[Token], correctionparameters: CorrectionParam
 # def getalternatives(origtokensmd, method, llremovedtokens, tree, uttid):
 def getalternatives(origtokensmd: TokenListMD, tree: SynTree, uttid: UttId,
                     correctionparameters: CorrectionParameters):
+    methodname = correctionparameters.method
     newtokensmd = explanationasreplacement(origtokensmd, tree)
     if newtokensmd is not None:
         tokensmd = newtokensmd
@@ -601,7 +619,7 @@ def getalternatives(origtokensmd: TokenListMD, tree: SynTree, uttid: UttId,
         alternativetokenmds[tokenctr] = getalternativetokenmds(
             tokenmd, tokens, tokenctr, tree, uttid, correctionparameters)
         validalternativetokenmds[tokenctr] = getvalidalternativetokenmds(
-            tokenmd, alternativetokenmds[tokenctr])
+            tokenmd, alternativetokenmds[tokenctr], methodname)
         tokenctr += 1
 
     # get all the new token sequences
@@ -685,7 +703,7 @@ def getalternatives(origtokensmd: TokenListMD, tree: SynTree, uttid: UttId,
         allalternativemds += newresults
 
     # final check whether the alternatives are improvements. It is not assumed that the original tokens is included in the alternatives
-    finalalternativemds = lexcheck(tokensmd, allalternativemds)
+    finalalternativemds = lexcheck(tokensmd, allalternativemds, methodname)
 
     return finalalternativemds
 
@@ -770,7 +788,7 @@ def OLDgetexpansions(uttmd: TokenListMD) -> List[TokenListMD]:
     if expansionfound:
         meta2 = Meta('OrigCleanTokenPosList', tokenposlist, annotatedposlist=[],
                      annotatedwordlist=[], annotationposlist=tokenposlist,
-                     annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=defaultpenalty,
+                     annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=0,
                      backplacement=bpl_none)
         newmd.append(meta2)
         result = [TokenListMD(newtokens, newmd)]
@@ -798,8 +816,9 @@ def getsingleitemexpansions(token: Token, intokenposlist) -> List[Tuple[TokenLis
                 newtokens.append(newtoken)
                 outtokenposlist.append(token.pos)
             nwt = Token(space.join(rlist), token.pos)
-            meta1 = mkSASTAMeta(token, nwt, n, v, c, subcat=None, penalty=p,
-                                backplacement=bpl_none)
+            fullpenalty = basepenalties[BASICREPLACEMENTS] + p
+            meta1 = mkSASTAMeta(token, nwt, n, v, c, subcat=None, penalty=fullpenalty,
+                                backplacement=bpl_none, source=f'{SASTA}/{BASICREPLACEMENTS}')
             newmd.append(meta1)
             result = (TokenListMD(newtokens, newmd), outtokenposlist)
             results.append(result)
@@ -903,7 +922,8 @@ def getexpansions(uttmd: TokenListMD) -> List[TokenListMD]:
                          annotatedwordlist=[], annotationposlist=tokenposlist,
                          annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=0,
                          backplacement=bpl_none)
-            newmd = result[0].metadata
+            newmd = uttmd.metadata.copy()
+            newmd += result[0].metadata
             newmd.append(meta2)
             newtokenmd = TokenListMD(result[0].tokens, newmd)
             newtokenmds.append(newtokenmd)
@@ -923,7 +943,7 @@ def getexpansions(uttmd: TokenListMD) -> List[TokenListMD]:
 #         finalresults.append(finalresult)
 
 
-def lexcheck(intokensmd: TokenListMD, allalternativemds: List[TokenListMD]) -> List[TokenListMD]:
+def lexcheck(intokensmd: TokenListMD, allalternativemds: List[TokenListMD], methodname: MethodName) -> List[TokenListMD]:
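+    # For alternatives of the same length as the original, only those survive
+    # in which every token that differs from the original is a valid word for
+    # the current method. Invented illustration: for original 'ik sie dat',
+    # the alternative 'ik zie dat' is kept ('zie' is valid), whereas
+    # 'ik xqz dat' is dropped ('xqz' is not).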
     finalalternativemds = [intokensmd]
     for alternativemd in allalternativemds:
         diff_found = False
@@ -936,7 +956,7 @@ def lexcheck(intokensmd: TokenListMD, allalternativemds: List[TokenListMD]) -> L
             for (intoken, outtoken) in zip(intokens, outtokens):
                 if intoken != outtoken:
                     diff_found = True
-                    if not known_word(outtoken.word):
+                    if not validword(outtoken.word, methodname):
                         include = False
                         break
             if diff_found and include:
@@ -954,12 +974,12 @@ def lexcheck(intokensmd: TokenListMD, allalternativemds: List[TokenListMD]) -> L
 
 def updatenewtokenmds(newtokenmds: List[TokenMD], token: Token, newwords: List[str],
                       beginmetadata: List[Meta],
-                      name: str, value: str, cat: str, subcat: Optional[str] = None,
+                      name: str, value: str, cat: str, subcat: Optional[str] = None, source=SASTA,
                       penalty: Penalty = defaultpenalty,
                       backplacement: BackPlacement = defaultbackplacement) \
         -> List[TokenMD]:
     for nw in newwords:
         nwt = Token(nw, token.pos)
-        meta = mkSASTAMeta(token, nwt, name=name, value=value, cat=cat, subcat=subcat, penalty=penalty,
+        meta = mkSASTAMeta(token, nwt, name=name, value=value, cat=cat, subcat=subcat, source=source, penalty=penalty,
                            backplacement=backplacement)
         metadata = [meta] + beginmetadata
         newwordtokenmd = TokenMD(nwt, metadata)
@@ -1007,7 +1027,7 @@ def isnounsg(token: Token) -> bool:
             return True
     return False
 
-def initdevoicing(token: Token, voiceless: str, voiced: str, newtokenmds: List[TokenMD], beginmetadata: List[Meta]) \
+def initdevoicing(token: Token, voiceless: str, voiced: str, methodname: MethodName, newtokenmds: List[TokenMD], beginmetadata: List[Meta]) \
         -> List[TokenMD]:
     '''
     The function *initdevoicing* takes as input *token*, checks whether it is an
@@ -1022,10 +1042,10 @@ def initdevoicing(token: Token, voiceless: str, voiced: str, newtokenmds: List[T
     '''
     # initial s -> z, f -> v
-    if not known_word(token.word) or token.word in specialdevoicingwords:
+    if not validword(token.word, methodname) or token.word in specialdevoicingwords:
         if token.word[0] == voiceless:
             newword = voiced + token.word[1:]
-            if known_word(newword):
+            if validword(newword, methodname):
                 newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
                                                 name='Pronunciation Variant',
                                                 value='Initial {} devoicing'.format(
@@ -1048,7 +1068,7 @@ def adaptpenalty(wrong: str, correct: str, p: Penalty) -> Penalty:
             if hc.correction == correct:
                 sumfrq = sum([hc.frequency for hc in cc])
                 relfrq = hc.frequency / sumfrq
-                penalty = max(1, int(defaultpenalty * (1 - relfrq)))
+                penalty = max(1, int(defaultpenalty * (1 - relfrq))) + p
                 return penalty
     return p
 
@@ -1064,6 +1084,7 @@ def nocorrectparse(tree: SynTree) -> bool:
 
 def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int, tree: SynTree, uttid: UttId,
                            correctionparameters: CorrectionParameters) -> List[TokenMD]:
+    methodname = correctionparameters.method
     token = tokenmd.token
     beginmetadata = tokenmd.metadata
     newtokenmds: List[TokenMD] = []
@@ -1075,24 +1096,28 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
     if token.skip:
         return newtokenmds
 
+    # ignore punctuation
+    if ispunctuation(token.word):
+        return newtokenmds
+
     # decapitalize initial token except when it is a known name
-    # No do not do this
-    # if tokenctr == 0 and token.word.istitle() and not isa_namepart(token.word):
-    #     newword = token.word.lower()
-    #
-    #     newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
-    #                                     name='Character Case', value='Lower case', cat='Orthography')
+    # do this only for ASTA
+    if correctionparameters.method in {asta} and tokenctr == 0 and token.word.istitle() and not isa_namepart(token.word):
+        newword = token.word.lower()
+
+        newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
+                                        name='Character Case', value='Lower case', cat='Orthography')
 
     # dehyphenate
-    if not known_word(token.word) and hyphen in token.word:
-        newwords = fullworddehyphenate(token.word, known_word)
+    if not validword(token.word, methodname) and hyphen in token.word:
+        newwords = fullworddehyphenate(token.word, lambda x: validword(x, methodname))
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Dehyphenation', value='Dehyphenation', cat='Pronunciation',
                                         backplacement=bpl_word)
 
     # deduplicate jaaaaa -> ja; heeeeeel -> heel
-    if not known_word(token.word):
-        newwords = dutchdeduplicate(token.word, known_word, exceptions=chatxxxcodes)
+    if not validword(token.word, methodname):
+        newwords = dutchdeduplicate(token.word, lambda x: validword(x, methodname), exceptions=chatxxxcodes)
         deduplicated = newwords != []
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Emphasis', value='Phoneme lengthening', cat='Pronunciation',
@@ -1101,7 +1126,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
     # aha oho uhu ehe
     ahapattern = r'([aeouy])h\1'
     ahare = re.compile(ahapattern)
-    if not known_word(token.word) and ahare.search(token.word):
+    if not validword(token.word, methodname) and ahare.search(token.word):
         newwords = [ahare.sub(r'\1', token.word)]
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
@@ -1109,7 +1134,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
     # iehie ijhij
     iehiepattern = r'(ie|ij)h\1'
     iehiere = re.compile(iehiepattern)
-    if not known_word(token.word) and iehiere.search(token.word):
+    if not validword(token.word, methodname) and iehiere.search(token.word):
         newwords = [iehiere.sub(r'\1', token.word)]
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
@@ -1119,23 +1144,25 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # here come the replacements
     if token.word in basicreplacements:
         for (r, c, n, v, p) in basicreplacements[token.word]:
-            newpenalty = adaptpenalty(token.word, r, p)
+            newpenalty = basepenalties[BASICREPLACEMENTS] + adaptpenalty(token.word, r, p - defaultpenalty)
             newwords = [r]
             newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
-                                            name=n, value=v, cat=c, backplacement=bpl_word, penalty=newpenalty)
+                                            name=n, value=v, cat=c, source=f'{SASTA}/{BASICREPLACEMENTS}',
+                                            backplacement=bpl_word, penalty=newpenalty)
 
     # final r realized as w: weew, ew
-    if not known_word(token.word) and token.word.endswith('w') and known_word(f'{token.word[:-1]}r'):
+    if not validword(token.word, methodname) and token.word.endswith('w') and \
+            validword(f'{token.word[:-1]}r', methodname):
         newwords = [f'{token.word[:-1]}r']
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Informal pronunciation', value='Final r -> w', cat='Pronunciation',
                                         backplacement=bpl_word)
 
     # aller- misspelled as alle
-    if (not known_word(token.word) and
+    if (not validword(token.word, methodname) and
             token.word.startswith('alle') and not token.word.startswith('aller') and
             (token.word.endswith('st') or token.word.endswith('ste')) and
-            known_word(f'{token.word[4:]}')):
+            validword(f'{token.word[4:]}', methodname)):
         newwords = [f'aller{token.word[4:]}']
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Informal pronunciation', value='r-drop',
@@ -1155,14 +1182,15 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # wrong past participle emaakt -> gemaakt
-    if not known_word(token.word) and token.word.startswith('e') and known_word(f'g{token.word}'):
+    if not validword(token.word, methodname) and token.word.startswith('e') and validword(f'g{token.word}', methodname):
         newwords = [f'g{token.word}']
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Informal pronunciation', value='Initial g dropped', cat='Pronunciation',
                                         backplacement=bpl_word)
 
     # wrong transcription of 's + e-participle past participle semaakt -> 's emaakt -> is gemaakt
-    if not known_word(token.word) and token.word.startswith('se') and known_word(f'g{token.word[1:]}'):
+    if not validword(token.word, methodname) and token.word.startswith('se') and \
+            validword(f'g{token.word[1:]}', methodname):
         newwords = [f"is g{token.word[1:]}"]
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Informal pronunciation', value='Initial g dropped', cat='Pronunciation',
@@ -1170,7 +1198,8 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # wrong past participle semaakt -> gemaakt
-    if not known_word(token.word) and token.word.startswith('se') and known_word(f'g{token.word[1:]}'):
+    if not validword(token.word, methodname) and token.word.startswith('se') and \
+            validword(f'g{token.word[1:]}', methodname):
         newwords = [f'g{token.word[1:]}']
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Informal pronunciation', value='Initial g replaced by s', cat='Pronunciation',
@@ -1248,59 +1277,82 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # words unknown to Alpino, e.g. *gymmen* is replaced by *trainen*
     if token.word in wordsunknowntoalpinolexicondict:
-        newwords = [wordsunknowntoalpinolexicondict[token.word]]
-        newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
+        for newword in wordsunknowntoalpinolexicondict[token.word]:
+            newwords = [newword]
+            newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                             name='Word unknown to Alpino', value='Unknown word', cat='lexicon',
                                             backplacement=bpl_wordlemma)
 
+    # replace unknown words by similar words from the context -- tarsp and stap only; for asta more needs to be done
+    if not validword(token.word, methodname) and correctionparameters.method in {tarsp, stap}:
+        xsid = getxsid(tree)
+        thecontextdict = correctionparameters.contextdict
+        if xsid in thecontextdict and token.word in thecontextdict[xsid]:
+            (prevwords, postwords) = thecontextdict[xsid][token.word]
+            newcandidates = postwords if postwords != [] else prevwords
+            for newcandidate in newcandidates:
+                if newcandidate == token.word:  # otherwise we would end up in an infinite loop
+                    continue
+                penalty = basepenalties[CONTEXT]
+                newwords = [newcandidate]
+                newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
+                                                name='Context Correction', value='Unknown word', cat='lexicon',
+                                                source=f'{SASTA}/{CONTEXT}', backplacement=bpl_word, penalty=penalty)
 
     # find document specific replacements
-    if not known_word(token.word) and \
+    if not validword(token.word, methodname) and \
             token.word in correctionparameters.thissamplecorrections and \
             token.word not in childescorrectionsexceptions:
         cc = correctionparameters.thissamplecorrections[token.word]
         sumfrq = sum([hc.frequency for hc in cc])
         for hc in cc:
             relfrq = hc.frequency / sumfrq
-            penalty = max(1, int(defaultpenalty * (1 - relfrq)))
+            penalty = basepenalties[THISSAMPLECORRECTIONS] + max(1, int(defaultpenalty * (1 - relfrq)))
             newwords = [hc.correction]
             if (token.word, hc.correction) not in basicreplacementpairs:
                 if hc.correctiontype == 'noncompletion':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Noncompletion', value='', cat='Pronunciation',
+                                                    source=f'{SASTA}/{THISSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'replacement':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Replacement', value='', cat='TBD',
+                                                    source=f'{SASTA}/{THISSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'explanation':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Explanation', value='', cat='TBD',
+                                                    source=f'{SASTA}/{THISSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
 
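+    # The layered penalty above combines a per-source base (basepenalties) with
+    # a frequency-dependent part. Hand-worked illustration, assuming a
+    # hypothetical defaultpenalty of 10 and a correction covering 3 of 4
+    # observed corrections (relfrq = 0.75):
+    #   this sample:  300 + max(1, int(10 * (1 - 0.75))) = 300 + 2 = 302
+    #   all samples:  400 + 2 = 402, so same-sample evidence is preferred
+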
     # find correction from all samples processed so far
-    if correctionparameters.method in [tarsp, stap] and \
-            not known_word(token.word) and \
+    if methodname in [tarsp, stap] and \
+            not validword(token.word, methodname) and \
             token.word in correctionparameters.allsamplecorrections and \
             token.word not in childescorrectionsexceptions:
         cc = correctionparameters.allsamplecorrections[token.word]
         sumfrq = sum([hc.frequency for hc in cc])
         for hc in cc:
             relfrq = hc.frequency / sumfrq
-            penalty = max(1, int(defaultpenalty * (1 - relfrq)))
+            penalty = basepenalties[ALLSAMPLECORRECTIONS] + max(1, int(defaultpenalty * (1 - relfrq)))
             newwords = [hc.correction]
             if (token.word, hc.correction) not in basicreplacementpairs:
                 if hc.correctiontype == 'noncompletion':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Noncompletion', value='', cat='Pronunciation',
+                                                    source=f'{SASTA}/{ALLSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'replacement':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Replacement', value='', cat='TBD',
+                                                    source=f'{SASTA}/{ALLSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'explanation':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Explanation', value='', cat='TBD',
+                                                    source=f'{SASTA}/{ALLSAMPLECORRECTIONS}',
                                                     backplacement=bpl_word, penalty=penalty)
 
@@ -1312,30 +1364,34 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # find childes replacements, preferably with vocabulary from the same age
     if correctionparameters.options.dohistory and \
-            correctionparameters.method in [tarsp, stap] and not known_word(token.word) and token.word in childescorrections and \
+            methodname in [tarsp, stap] and not validword(token.word, methodname) and \
+            token.word in childescorrections and \
             token.word not in childescorrectionsexceptions:
         cc = childescorrections[token.word]
         sumfrq = sum([hc.frequency for hc in cc])
         for hc in cc:
             relfrq = hc.frequency / sumfrq
-            penalty = max(1, int(defaultpenalty * (1 - relfrq)))
+            penalty = basepenalties[HISTORY] + max(1, int(defaultpenalty * (1 - relfrq)))
             newwords = [hc.correction]
             if (token.word, hc.correction) not in basicreplacementpairs:
                 if hc.correctiontype == 'noncompletion':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Noncompletion', value='', cat='Pronunciation',
+                                                    source=f'{SASTA}/{HISTORY}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'replacement':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Replacement', value='', cat='TBD',
+                                                    source=f'{SASTA}/{HISTORY}',
                                                     backplacement=bpl_word, penalty=penalty)
                 elif hc.correctiontype == 'explanation':
                     newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                                     name='Explanation', value='', cat='TBD',
+                                                    source=f'{SASTA}/{HISTORY}',
                                                     backplacement=bpl_word, penalty=penalty)
 
     # gaatie
-    if not known_word(token.word):
+    if not validword(token.word, methodname):
         newwords = gaatie(token.word)
         if newwords != []:
             postviefound = True
@@ -1358,17 +1414,19 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
                        'moppie', 'punkie', 'saffie', 'stekkie', 'wijfie']
 
-    if (not known_word(token.word) or token.word in knowniedimwords) and \
+    if (not validword(token.word, methodname) or token.word in knowniedimwords) and \
             (token.word.endswith('ie') or token.word.endswith('ies')):
         newwords = getjeforms(token.word)
-        newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
-                                        name='RegionalForm', value='ieDim', cat='Morphology', backplacement=bpl_word)
+        for newword in newwords:
+            if validword(newword, methodname):
+                newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
+                                                name='RegionalForm', value='ieDim', cat='Morphology', backplacement=bpl_word)
 
     # overregularised verb forms: gevalt -> gevallen, incl. wrong verb forms: gekeekt -> gekeken
-    if not known_word(token.word):
+    if not validword(token.word, methodname):
         nwms = correctinflection(token.word)
         for nw, metavalue in nwms:
-            if known_word(nw):
+            if validword(nw, methodname):
                 newtokenmds += updatenewtokenmds(newtokenmds, token, [nw], beginmetadata,
                                                  name='InflectionError', value=metavalue,
                                                  cat='Morphology', backplacement=bpl_word)
@@ -1400,20 +1458,19 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
                                         cat='Pronunciation', backplacement=bpl_word)
 
     # e -> e(n)
-    if not known_word(
-            token.word) and token.word not in basicreplacements and token.word not in enexceptions:
+    if not validword(token.word, methodname) and token.word not in basicreplacements and token.word not in enexceptions:
         if endsinschwa(token.word) and not monosyllabic(token.word):
             newword = token.word + 'n'
-            if known_word(newword):
+            if validword(newword, methodname):
                 schwandropfound = True
                 newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
                                                 name='Pronunciation Variant', value='N-drop after schwa',
                                                 cat='Pronunciation', backplacement=bpl_word)
 
     # initial s -> z
-    newtokenmds = initdevoicing(token, 's', 'z', newtokenmds, beginmetadata)
+    newtokenmds = initdevoicing(token, 's', 'z', methodname, newtokenmds, beginmetadata)
     # initial f -> v
-    newtokenmds = initdevoicing(token, 'f', 'v', newtokenmds, beginmetadata)
+    newtokenmds = initdevoicing(token, 'f', 'v', methodname, newtokenmds, beginmetadata)
 
     # replace ambiguous words with one reading not known by the child by a nonambiguous word with the same properties
     if correctionparameters.method in {'tarsp', 'stap'}:
@@ -1428,7 +1485,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
     aasre = rf'{dupvowel}\1s$'
     vvs = {'aas', 'oos', 'ees', 'uus'}
     # Lauraas -> Laura's; autoos -> auto's
-    if not known_word(token.word) and token.word[-3:] in vvs and known_word(token.word[:-2]):
+    if not validword(token.word, methodname) and token.word[-3:] in vvs and validword(token.word[:-2], methodname):
         newword = f"{token.word[:-2]}'s"
         newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
                                         name='Spelling Correction', value='Missing Apostrophe',
@@ -1444,7 +1501,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
 
     # replace unknown part1+V verb by the most frequent part2+verb in CHILDES
     # e.g. opbijten -> afbijten
-    if not known_word(token.word):
+    if not validword(token.word, methodname):
         issvp, thesvp = startswithsvp(token.word)
         part2 = token.word[len(thesvp):]
         if issvp and isaverb(part2):
             newcandidates = [(f'{svp}{part2}', allfrqdict[f'{svp}{part2}']) for svp in svps
                              if svp != thesvp and f'{svp}{part2}' in allfrqdict]
             sortednewcandidates = sorted(newcandidates, key=lambda x: x[1], reverse=True)
             if sortednewcandidates != []:
-                newword = sortednewcandidates[0][0]
+                newwords = [sortednewcandidates[0][0]]
             else:
-                newword = part2
+                newwords = []
 
-            newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
+            newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                             name='Unknown Word Substitution', value=token.word,
-                                            cat='Lexicon', backplacement=bpl_word)
-
-    # replace words that are unknown to Alpino
-    if token.word in parsereplacements:
-        item = parsereplacements[token.word]
-        newword = item[1]
-        descr = item[2]
-        penalty = item[3]
-        newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata,
-                                        name='Parse Substitution', value=descr,
-                                        cat='Parser', backplacement=bpl_wordlemma, penalty=penalty)
+                                            cat='Lexicon', backplacement=bpl_word, source=f'{SASTA}/{BASICREPLACEMENTS}')
+
     if correctionparameters.options.dospellingcorrection and \
-            not known_word(token.word) and applyspellingcorrectionisok(token.word) and \
+            not validword(token.word, methodname) and applyspellingcorrectionisok(token.word) and \
             not schwandropfound and not postviefound and not token.word[0].isupper() and not deduplicated and \
             not (token.word.endswith('ie') or token.word.endswith('ies')) and token.word[-3:] not in vvs:
         if correctionparameters.method in {'tarsp', 'stap'}:
             corrtuples = children_correctspelling(token.word, children_correctionsdict, max=5)
+            subsource = CHILDRENSPELLINGCORRECTION
         elif correctionparameters.method in {'asta'}:
             corrtuples = []
+            subsource = ADULTSPELLINGCORRECTION
             # put off because it causes a lot of errors: the X-words should all have been removed
             # corrtuples = adult_correctspelling(token.word, adult_correctionsdict, max=5)
         else:
             corrtuples = []
         for corr, penalty in corrtuples:
-            if corr != token.word and known_word(corr):
+            newpenalty = basepenalties[subsource] + penalty
+            if corr != token.word and validword(corr, methodname):
                 newtokenmds = updatenewtokenmds(newtokenmds, token, [corr], beginmetadata,
                                                 name='Spelling Correction', value=corr,
-                                                cat='Spelling', backplacement=bpl_word, penalty=penalty)
+                                                cat='Spelling',
+                                                source=f'{SASTA}/{subsource}',
+                                                backplacement=bpl_word, penalty=newpenalty)
 
     for newtokenmd in newtokenmds:
         morenewtokenmds = getalternativetokenmds(
@@ -1501,9 +1554,9 @@ def applyspellingcorrectionisok(word):
         len(word) > 4 and word not in enexceptions
     return result
 
-def getvalidalternativetokenmds(tokenmd: TokenMD, newtokenmds: List[TokenMD]) -> List[TokenMD]:
+def getvalidalternativetokenmds(tokenmd: TokenMD, newtokenmds: List[TokenMD], methodname: MethodName) -> List[TokenMD]:
     validnewtokenmds = [
-        tokenmd for tokenmd in newtokenmds if known_word(tokenmd.token.word)]
+        tokenmd for tokenmd in newtokenmds if validword(tokenmd.token.word, methodname)]
     # and now we add the original tokenmd
     validnewtokenmds += [tokenmd]
     return validnewtokenmds
diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py
index 1bc143b..4c65a68 100644
--- a/src/sastadev/correcttreebank.py
+++ b/src/sastadev/correcttreebank.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 from copy import copy, deepcopy
 from dataclasses import dataclass
+from editdistance import distance
 import os
 from typing import Callable, Dict, List, Optional, Set, Tuple
@@ -12,16 +13,21 @@
 from sastadev.correctionparameters import CorrectionParameters
 from sastadev.corrector import (Correction, disambiguationdict,
                                 getcorrections, mkuttwithskips)
-from sastadev.lexicon import de, dets, known_word, nochildwords, wrongposwordslexicon
+from sastadev.lexicon import de, dets, known_word, nochildword, nochildwords, validnouns, validword, \
+    wordsunknowntoalpinolexicondict, wrongposwordslexicon
 from sastadev.macros import expandmacros
-from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node,
-                               bpl_none, bpl_replacement, bpl_word, bpl_wordlemma, bpl_word_delprec, insertion)
+from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node, defaultpenalty,
+                               bpl_none, bpl_replacement, bpl_word, bpl_wordlemma, bpl_word_delprec, insertion,
+                               SASTA, ADULTSPELLINGCORRECTION, ALLSAMPLECORRECTIONS, BASICREPLACEMENTS, CONTEXT,
+                               HISTORY, CHILDRENSPELLINGCORRECTION, THISSAMPLECORRECTIONS, replacementsubsources
+                               )
 from sastadev.sastatok import sasta_tokenize
 from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string
 from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict,
                                  MetaElement, MethodName, Penalty,
                                  Position, PositionStr, SynTree,
                                  Targets, Treebank, UttId)
 from sastadev.semantic_compatibility import semincompatiblecount
+from sastadev.subcatprefs import getsubcatprefscore
 from sastadev.sva import phicompatible
 from sastadev.syllablecount import countsyllables
 from sastadev.targets import get_mustbedone
@@ -42,6 +48,8 @@
 positive = +1
 negative = -1
 
+subsourcesep = '/'
+
 corr0, corr1, corrn = '0', '1', 'n'
 validcorroptions = [corr0, corr1, corrn]
@@ -78,12 +86,11 @@ def __init__(self, name, getfunction, polarity, description):
 
 class Alternative():
-    def __init__(self, stree, altid, altsent, penalty, criteria):
+    def __init__(self, stree, altid, altsent, criteria):
         self.stree: SynTree = stree
         self.altid: AltId = altid
         self.altsent: str = altsent
-        self.penalty: Penalty = int(penalty)
-        self.criteria = criteria + [penalty]
+        self.criteria = criteria
 
     def alt2row(self, uttid: UttId, base: str, user1: str = '', user2: str = '', user3: str = '',
                 bestaltids: List[AltId] = [],
@@ -212,7 +219,7 @@ def adaptpv(node):
             node.attrib['postag'] = 'WW(pv,tgw,mv)'
 
 
-def smartreplace(node: SynTree, word: str) -> SynTree:
+def smartreplace(node: SynTree, word: str, mn: MethodName) -> SynTree:
     '''
     replaces *node* by a different node if the parse of *word* yields a node with
    a valid word and the same word class and if it does not occur in nochildwords; otherwise by
@@ -227,7 +234,7 @@ def smartreplace(node: SynTree, word: str) -> SynTree:
     newnodept = getattval(newnode, 'pt')
     nodept = getattval(node, 'pt')
     newnodelemma = getattval(newnode, 'lemma')
-    if isvalidword(word) and \
+    if isvalidword(word, mn) and \
             issamewordclass(node, newnode) and \
             not isrobustnoun(newnode) and \
             newnodelemma not in nochildwords:
@@ -284,7 +291,8 @@ def correcttreebank(treebank: Treebank, targets: Targets, correctionparameters:
     * targets: a specification of the utterances that have to be analysed
     * treebankfullname: name of the file that contains the treebank
     * method: the method to be used. Some corrections are method-specific
-    * corr: to indicate how the corrections should be done: no corrections at all, all corrections but the last one (usually the one with most adaptations) is selected; all corrections but the best one according to the evaluation criterion is selected.
+    * corr: to indicate how the corrections should be done: no corrections at all; all corrections, of which the last one
+      (usually the one with most adaptations) is selected; or all corrections, of which the best one according to the evaluation criterion is selected.
     * options: the input parameters for sastadev
 
     It returns a triple consisting of
@@ -309,7 +317,7 @@ def correcttreebank(treebank: Treebank, targets: Targets, correctionparameters:
         # print(uttid)
         mustbedone = get_mustbedone(stree, targets)
         if mustbedone:
             # to implement
             sentence = getsentence(stree)
             newstree, orandalts = correct_stree(stree, corr, correctionparameters)
             if newstree is not None:
@@ -616,7 +624,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
     rawctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree)
-    ctmds = reducecorrections(rawctmds)
+    ctmds = reducecorrections(rawctmds, correctionparameters.method)
     # ctmds = rawctmds
 
     debug = False
@@ -679,7 +687,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
     if ptmds == []:
         thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None
     elif corr in [corr1, corrn]:
-        thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr)
+        thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr, correctionparameters.method)
     else:
         settings.LOGGER.error(
             'Illegal correction value: {}. No corrections applied'.format(corr))
No corrections applied'.format(corr)) @@ -750,7 +758,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C # newnode.attrib['lemma'] = oldword # else: # newnode.attrib['lemma'] = wprop[3] - substnode = smartreplace(newnode, oldword) + substnode = smartreplace(newnode, oldword, correctionparameters.method) newnodeparent = newnode.getparent() newnodeparent.remove(newnode) @@ -912,6 +920,77 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C return fulltree, orandalts +def splitsource(fullsource: str) -> Tuple[str, str]: + parts = fullsource.split(subsourcesep, maxsplit=1) + if len(parts) == 2: + return (parts[0], parts[1]) + elif len(parts) == 1: + return (parts[0], '') + else: + # should never occur: split always returns at least one part + return ('', '') + + +postcomplsuxpath = """.//node[@rel="su" and + parent::node[@cat="ssub" and not(parent::node[@cat="whsub" or @cat="whrel" or @cat="rel"])] and + @begin >= ../node[(@word or @cat) and + (not(@pdtype) or @pdtype!="adv-pron") and + (@rel="ld" or @rel="obj1" or @rel="pc" or @rel="obj2")]/@end]""" +def getpostcomplsucount(nt: SynTree, md: List[Meta], mn: MethodName) -> int: + postcomplsus = nt.xpath(postcomplsuxpath) + result = len(postcomplsus) + return result +def getstreereplacementpenalty(stree: SynTree) -> int: + contextpositions = set() + spellingcorrectionpositions = set() + fullpenalty = 0 + metadatas = stree.xpath('.//metadata') + if metadatas != []: + metadata = metadatas[0] + for meta in metadata: + if meta.tag != 'xmeta': + continue + fullsource = meta.attrib['source'] if 'source' in meta.attrib else '' + mainsource, subsource = splitsource(fullsource) + if subsource in {BASICREPLACEMENTS, ALLSAMPLECORRECTIONS, HISTORY, CONTEXT, CHILDRENSPELLINGCORRECTION, + ADULTSPELLINGCORRECTION, THISSAMPLECORRECTIONS}: + penalty = int(meta.attrib['penalty']) if 'penalty' in meta.attrib else 0 + fullpenalty += penalty + annotatedposlist = meta.attrib['annotatedposlist'] if 'annotatedposlist' in meta.attrib else '[]' + position = annotatedposlist[1:-1] + if subsource in [ADULTSPELLINGCORRECTION, CHILDRENSPELLINGCORRECTION] and position != '': + spellingcorrectionpositions.add(position) + if subsource == CONTEXT and position != '': + contextpositions.add(position) + spellcontextintersection = contextpositions.intersection(spellingcorrectionpositions) + reduction = len(spellcontextintersection) * defaultpenalty + fullpenalty = fullpenalty - reduction + + return fullpenalty + + +def getreplacementpenalty(nt: SynTree, mds: List[Meta], mn:MethodName) -> int: + contextpositions = set() + spellingcorrectionpositions = set() + fullpenalty = 0 + for meta in mds: + fullsource = meta.source + mainsource, subsource = splitsource(fullsource) + if subsource in {BASICREPLACEMENTS, ALLSAMPLECORRECTIONS, HISTORY, CONTEXT, CHILDRENSPELLINGCORRECTION, + ADULTSPELLINGCORRECTION, THISSAMPLECORRECTIONS}: + penalty = meta.penalty + fullpenalty += penalty + position = meta.annotatedposlist[0] if meta.annotatedposlist != [] else '' + if subsource in [CHILDRENSPELLINGCORRECTION, ADULTSPELLINGCORRECTION] and position != '': + spellingcorrectionpositions.add(position) + if subsource == CONTEXT and position != '': + contextpositions.add(position) + spellcontextintersection = contextpositions.intersection(spellingcorrectionpositions) + reduction = len(spellcontextintersection) * defaultpenalty + fullpenalty = fullpenalty - reduction + + return fullpenalty + def getsentencenode(stree: SynTree) -> SynTree: sentnodes = 
stree.xpath('.//sentence') if sentnodes == []: @@ -945,11 +1024,11 @@ def oldgetuttid(stree: SynTree) -> UttId: uttid = uttidlist[0] return uttid -def reducecorrections(ctmds: List[Correction]) -> List[Correction]: +def reducecorrections(ctmds: List[Correction], mn:MethodName) -> List[Correction]: tempdict = {} for tokenlist, metadata in ctmds: tokenstr = tokenlist2string(tokenlist) - newpenalty = compute_penalty(metadata) + newpenalty = compute_penalty(None, metadata, mn) if tokenstr in tempdict: oldpenalty = tempdict[tokenstr][0] if newpenalty < oldpenalty: @@ -982,7 +1061,7 @@ def getbestaltids(alts: Dict[AltId, Alternative]) -> List[AltId]: return results -def getsvaokcount(nt: SynTree) -> int: +def getsvaokcount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: subjects = nt.xpath('.//node[@rel="su"]') counter = 0 for subject in subjects: @@ -992,7 +1071,7 @@ def getsvaokcount(nt: SynTree) -> int: return counter -def getdeplusneutcount(nt: SynTree) -> int: +def getdeplusneutcount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: theyield = getnodeyield(nt) ltheyield = len(theyield) counter = 0 @@ -1014,8 +1093,10 @@ def getdeplusneutcount(nt: SynTree) -> int: punctuationsymbols = """.,?!:;"'""" -def isvalidword(w: str) -> bool: - if known_word(w): +def isvalidword(w: str, mn: MethodName) -> bool: + if nochildword(w): + return False + elif validword(w, mn): return True elif w in punctuationsymbols: return True @@ -1025,7 +1106,7 @@ def isvalidword(w: str) -> bool: return False -def countambigwords(stree: SynTree) -> int: +def countambigwords(stree: SynTree, md:List[Meta], mn:MethodName) -> int: leaves = getnodeyield(stree) ambignodes = [leave for leave in leaves if getattval( leave, 'word').lower() in disambiguationdict] @@ -1033,15 +1114,14 @@ def countambigwords(stree: SynTree) -> int: return result -def getunknownwordcount(nt: SynTree) -> int: +def getunknownwordcount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: words = [w for w in nt.xpath('.//node[@pt!="tsw"]/@word')] - unknownwords = [w for w in words if not ( - isvalidword(w.lower()) or isvalidword(w.title()))] + unknownwords = [w for w in words if not isvalidword(w.lower(), mn) ] result = len(unknownwords) return result wrongposwordxpathtemplate = './/node[@lemma="{word}" and @pt="{pos}"]' -def getwrongposwordcount(nt: SynTree) -> int: +def getwrongposwordcount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: result = 0 for word, pos in wrongposwordslexicon: wrongposwordxpath = wrongposwordxpathtemplate.format(word=word, pos=pos) @@ -1050,38 +1130,43 @@ def getwrongposwordcount(nt: SynTree) -> int: return result sucountxpath = './/node[@rel="su" and not(@pt="ww" and @wvorm="inf") and not(node[@rel="hd" and @pt="ww" and @wvorm="inf"])] ' -def getsucount(nt: SynTree) -> int: +def getsucount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: matches = nt.xpath(sucountxpath) result = len(matches) return result -getdpcount = lambda nt: countav(nt, 'rel', 'dp') -getdhyphencount = lambda nt: countav(nt, 'rel', '--') -getdimcount = lambda nt: countav(nt, 'graad', 'dim') -getcompcount = lambda nt: countav(nt, 'graad', 'comp') -getsupcount = lambda nt: countav(nt, 'graad', 'sup') -# getsucount = lambda nt: countav(nt, 'rel', 'su') -getbadcatcount = lambda nt: len( +localgetcompoundcount = lambda nt, md, mn: getcompoundcount(nt) +getdpcount = lambda nt, md, mn: countav(nt, 'rel', 'dp') +getdhyphencount = lambda nt, md, mn: countav(nt, 'rel', '--') +getdimcount = lambda nt, md, mn: countav(nt, 'graad', 'dim') +getcompcount = 
lambda nt, md, mn: countav(nt, 'graad', 'comp') +getsupcount = lambda nt, md, mn: countav(nt, 'graad', 'sup') +# getsucount = lambda nt, md, mn: countav(nt, 'rel', 'su') +getbadcatcount = lambda nt, md, mn: len( [node for node in nt.xpath('.//node[@cat and (@cat="du") and node[@rel="dp"]]')]) -gethyphencount = lambda nt: len( [node for node in nt.xpath('.//node[contains(@word, "-")]')]) -getbasicreplaceecount = lambda nt: len([node for node in nt.xpath('.//node[@word]') +gethyphencount = lambda nt, md, mn: len( [node for node in nt.xpath('.//node[contains(@word, "-")]')]) +getbasicreplaceecount = lambda nt, md, mn: len([node for node in nt.xpath('.//node[@word]') if getattval(node, 'word').lower() in basicreplacements]) -getsubjunctivecount = lambda nt: len( [node for node in nt.xpath('.//node[@pvtijd="conj"]')]) +getsubjunctivecount = lambda nt, md, mn: len( [node for node in nt.xpath('.//node[@pvtijd="conj"]')]) -getunknownnouncount = lambda nt: len([node for node in nt.xpath( - './/node[@pt="n" and @frame="noun(both,both,both)"]')]) -getunknownnamecount = lambda nt: len([node for node in nt.xpath( +def getunknownnouncount(nt: SynTree, md:List[Meta], mn: MethodName) -> int: + candidates = nt.xpath('.//node[@pt="n" and @frame="noun(both,both,both)"]') + realones = [cand for cand in candidates if getattval(cand, 'lemma') not in validnouns] + result = len(realones) + return result + +getunknownnamecount = lambda nt, md, mn: len([node for node in nt.xpath( './/node[@pt="n" and @frame="proper_name(both)"]')]) complsuxpath = expandmacros(""".//node[node[(@rel="ld" or @rel="pc") and @end<=../node[@rel="su"]/@begin and @begin >= ../node[@rel="hd"]/@end] and not(node[%Rpronoun%])]""") -getcomplsucount = lambda nt: len([node for node in nt.xpath(complsuxpath)]) -getdezebwcount = lambda nt: len([node for node in nt.xpath(dezebwxpath)]) -getnoun1c_count = lambda nt: len([node for node in nt.xpath(noun1cxpath)]) +getcomplsucount = lambda nt, md, mn: len([node for node in nt.xpath(complsuxpath)]) +getdezebwcount = lambda nt, md, mn: len([node for node in nt.xpath(dezebwxpath)]) +getnoun1c_count = lambda nt, md, mn: len([node for node in nt.xpath(noun1cxpath)]) -def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: CorrectionMode) -> Tuple[ +def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: CorrectionMode, mn:MethodName) -> Tuple[ ParsedCorrection, OrigandAlts]: # to be implemented@@ # it is presupposed that ptmds is not [] @@ -1094,11 +1179,11 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc for ct, nt, md in ptmds: cw = tokenlist2stringlist(ct) altsent = space.join(cw) - penalty = compute_penalty(md) + # penalty = compute_penalty(md) - criteriavalues = [criterion.getfunction(nt) * criterion.polarity for criterion in criteria] + criteriavalues = [criterion.getfunction(nt, md, mn) * criterion.polarity for criterion in criteria] - alt = Alternative(stree, altid, altsent, penalty * negative, criteriavalues) + alt = Alternative(stree, altid, altsent, criteriavalues) alts[altid] = alt altid += 1 orandalts = OrigandAlts(orig, alts) @@ -1127,13 +1212,13 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc # smainsuxpath = '//node[@cat="smain" and @begin = node[@rel="su"]/@begin]' # yields unexpected errors (TD12:31; TARSP_08:31) smainsuxpath = './/node[@cat="smain" and node[@rel="su"]]' -def countsmainsu(nt: SynTree) -> int: +def countsmainsu(nt: SynTree, md:List[Meta], mn:MethodName) -> int: matches = nt.xpath(smainsuxpath) result = len(matches) return result mainclausexpath = './/node[@cat="smain" or @cat="whq" or 
(@cat="sv1" and @rel!="body" and @rel!="cnj")]' -def getmainclausecount(nt: SynTree) -> int: +def getmainclausecount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: """ parses with more than 1 main clause node are bad :param nt: @@ -1147,8 +1232,15 @@ def getmainclausecount(nt: SynTree) -> int: result = lmatches return result +mainrelxpath = './/node[@rel="--" and (@cat="rel" or @cat="whrel")]' +def mainrelcount(stree: SynTree, md:List[Meta], mn:MethodName) -> int: + mainrels = stree.xpath(mainrelxpath) + result = len(mainrels) + return result + + topxpath = './/node[@cat="top"]' -def gettopclause(nt: SynTree) -> int: +def gettopclause(nt: SynTree, md:List[Meta], mn:MethodName) -> int: tops = nt.xpath(topxpath) if tops == []: return 0 @@ -1164,7 +1256,7 @@ def gettopclause(nt: SynTree) -> int: toexpath = './/node[@lemma="toe" or (@lemma="tot" and @vztype="fin")]' naarxpath = './/node[@lemma="naar"]' -def getlonelytoecount(nt: SynTree) -> int: +def getlonelytoecount(nt: SynTree, md:List[Meta], mn:MethodName) -> int: toematches = nt.xpath(toexpath) naarmatches = nt.xpath(naarxpath) if toematches == []: @@ -1177,15 +1269,56 @@ def getlonelytoecount(nt: SynTree) -> int: if all([int(getattval(naarmatch, 'begin')) > int(getattval(toematch, 'begin')) for naarmatch in naarmatches]): result += 1 return result -def compute_penalty(md: List[Meta]) -> Penalty: + +relasmainsuborderxpath = """.//node[(@cat="rel" or @cat="whrel") and @rel="--" and + parent::node[@cat="top"] and + node[@rel="body" and @cat="ssub" and .//node[@word]/@end<=node[@rel="hd" and @pt="ww"]/@begin] ] +""" +def getrelasmainsubordercount(nt: SynTree, md: List[Meta], mn: MethodName) -> int: + matches = nt.xpath(relasmainsuborderxpath) + result = len(matches) + return result + +def compute_penalty(nt: SynTree, md: List[Meta], mn:MethodName) -> Penalty: totalpenalty = 0 for meta in md: totalpenalty += meta.penalty return totalpenalty +wordsxpath = """.//node[@word]""" +def getnotknownbyalpinocount(nt: SynTree, md: List[Meta], mn: MethodName) -> int: + wordnodes = nt.xpath(wordsxpath) + words = [getattval(wordnode, 'word') for wordnode in wordnodes] + unknownwords = [word for word in words if word in wordsunknowntoalpinolexicondict] + result = len(unknownwords) + return result + +def gettotaleditdistance(nt: SynTree, md: List[Meta], mn:MethodName) -> int: + wordlist = getyield(nt) + totaldistance = 0 + for meta in md: + fullsource = meta.source + mainsource, subsource = splitsource(fullsource) + if subsource in replacementsubsources and \ + len(meta.annotationwordlist) == 1 and \ + len(meta.annotatedwordlist) == 1: + correctword = meta.annotationwordlist[0] + wrongword = meta.annotatedwordlist[0] + dst = distance(wrongword, correctword) + totaldistance += dst + return totaldistance + +ppinnpxpath = """//node[@cat='pp' and node[@rel='hd' and @lemma!='van'] and parent::node[@cat='np']]""" +def getpostnominalppmodcount(nt: SynTree, md: List[Meta], mn: MethodName) -> int: + ppinnpmods = nt.xpath(ppinnpxpath) + result = len(ppinnpmods) + return result + + # The constant *criteria* is a list of objects of class *Criterion* that are used, in the order given, to evaluate parses criteria = [ Criterion("unknownwordcount", getunknownwordcount, negative, "Number of unknown words"), + Criterion('Alpinounknownword', getnotknownbyalpinocount, negative, "Number of words unknown to Alpino"), Criterion("wrongposwordcount", getwrongposwordcount, negative, "Number of words with the wrong part of speech"), Criterion("unknownnouncount", getunknownnouncount, negative, 
"Count of unknown nouns according to Alpino"), Criterion("unknownnamecount", getunknownnamecount, negative, "Count of unknown names"), @@ -1193,28 +1326,37 @@ def compute_penalty(md: List[Meta]) -> Penalty: Criterion("ambigcount", countambigwords, negative, "Number of ambiguous words"), Criterion("dpcount", getdpcount, negative, "Number of nodes with relation dp"), Criterion("dhyphencount", getdhyphencount, negative, "Number of nodes with relation --"), + Criterion("postcomplsucount", getpostcomplsucount, negative, + "Number of subjects to the right of a complement in a subordinate clause"), + Criterion('Nominal PP modifier count', getpostnominalppmodcount, negative, "Number of postnominal PP modifiers"), + Criterion('RelativeMainSuborder', getrelasmainsubordercount, negative, 'Number of Main Relative Clauses with subordinate order'), + Criterion("lonelytoecount", getlonelytoecount, negative, "Number of occurrences of lonely 'toe'"), + Criterion("noun1c_count", getnoun1c_count, negative, "Number of nouns that consist of a single character"), + Criterion('ReplacementPenalty', getreplacementpenalty, negative, 'Plausibility of the replacement'), + Criterion('Total Edit Distance', gettotaleditdistance, negative, "Total of the edit distances for all replaced words"), + # Criterion('Subcatscore', getsubcatprefscore, positive, + # 'Score based on the frequency of the subcategorisation pattern'), # put off needs revision + # Criterion('mainrelcount', mainrelcount, negative, 'Dislike main relative clauses'), # removed, leads to worse results Criterion("mainclausecount", getmainclausecount, negative, "Number of main clauses"), Criterion("topclause", gettopclause, positive, "Single clause under top"), Criterion("complsucount", getcomplsucount, negative, ""), Criterion("badcatcount", getbadcatcount, negative, "Count of bad categories: du that contains a node with relation dp"), Criterion("basicreplaceecount", getbasicreplaceecount, negative, "Number of words from the basic replacements"), Criterion("hyphencount", gethyphencount, negative, "Number rof words that contain hyphens"), - Criterion("lonelytoecount", getlonelytoecount, negative, "Number of occurrences of lonely 'toe'"), Criterion("subjunctivecount", getsubjunctivecount, negative, "Number of subjunctive verb forms"), Criterion("smainsucount", countsmainsu, positive, "Count of smain nodes that contain a subject"), Criterion("dimcount", getdimcount, positive, "Number of words that are diminutives"), Criterion("compcount", getcompcount, positive, "Number of words that are comparatives"), Criterion("supcount", getsupcount, positive, "Number of words that are superlatives"), - Criterion("compoundcount", getcompoundcount, positive, "Number of nouns that are compounds"), + Criterion("compoundcount", localgetcompoundcount, positive, "Number of nouns that are compounds"), Criterion("sucount", getsucount, positive, "Number of subjects"), Criterion("svaok", getsvaokcount, positive, "Numbe rof time subject verb agreement is OK"), Criterion("deplusneutcount", getdeplusneutcount, negative, "Number of deviant configuratios with de-determeine + neuiter noun"), Criterion("dezebwcount", getdezebwcount, negative, "Count of 'deze' as adverb"), - Criterion("noun1c_count", getnoun1c_count, negative, "Number of nouns that consist of a single character"), - # Criterion("penalty", compute_penalty, negative, "Penalty for the changes made") # not in here, added later in Alternative + Criterion("penalty", compute_penalty, negative, "Penalty for the changes made") ] 
-altpropertiesheader = [criterion.name for criterion in criteria] + ['penalty'] +altpropertiesheader = [criterion.name for criterion in criteria] errorwbheader = ['Sample', 'User1', 'User2', 'User3'] + \ ['Status', 'Uttid', 'Origutt', 'Origsent'] + \ diff --git a/src/sastadev/data/alpinoframes/alpino_pureframes.xlsx b/src/sastadev/data/alpinoframes/alpino_pureframes.xlsx new file mode 100644 index 0000000..d90bcb8 Binary files /dev/null and b/src/sastadev/data/alpinoframes/alpino_pureframes.xlsx differ diff --git a/src/sastadev/data/alpinoframes/wwframes.xlsx b/src/sastadev/data/alpinoframes/wwframes.xlsx new file mode 100644 index 0000000..205adbe Binary files /dev/null and b/src/sastadev/data/alpinoframes/wwframes.xlsx differ diff --git a/src/sastadev/data/childescorrections/adult_samplecorrections.txt b/src/sastadev/data/childescorrections/adult_samplecorrections.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/sastadev/data/childescorrections/samplecorrections.txt b/src/sastadev/data/childescorrections/children_samplecorrections.txt similarity index 76% rename from src/sastadev/data/childescorrections/samplecorrections.txt rename to src/sastadev/data/childescorrections/children_samplecorrections.txt index 7999434..72bf2ec 100644 --- a/src/sastadev/data/childescorrections/samplecorrections.txt +++ b/src/sastadev/data/childescorrections/children_samplecorrections.txt @@ -1,6 +1,5 @@ -i is replacement 1 -i ik noncompletion 14 -i is noncompletion 3 +i ik noncompletion 17 +i is noncompletion 4 toe toen noncompletion 1 toe sg explanation 1 toe doe replacement 1 @@ -26,48 +25,45 @@ taat staat replacement 1 taat gehaald replacement 1 taat gaat replacement 1 hewu hebben replacement 1 +hee heel noncompletion 2 hee weer replacement 2 -hee heel noncompletion 1 hee heef noncompletion 1 hee een replacement 1 -heew heel replacement 5 +heew heel replacement 6 houtu grote replacement 1 waatu wagen replacement 1 pakken gepakt replacement 1 jeesoo lego replacement 1 kee keer noncompletion 2 kee geen replacement 1 -sa anders replacement 1 +sa anders replacement 2 sa van replacement 1 ein erin noncompletion 1 dument cement replacement 1 takdoeku zakdoek replacement 1 -oot ook explanation 1 -oot ook replacement 1 +oot ook replacement 2 maa maar noncompletion 6 maa maak noncompletion 1 -nie niet noncompletion 11 +nie niet noncompletion 13 nie Leonie noncompletion 1 heet weet replacement 2 +u het replacement 4 +u een replacement 4 u hij replacement 1 -u het replacement 3 u op replacement 1 u ik replacement 3 -u een replacement 3 ti zit replacement 1 tiej vier replacement 1 -da dat noncompletion 33 +da dat noncompletion 37 da dan noncompletion 6 da te replacement 1 da daar noncompletion 1 -oo ook replacement 1 -oo ook noncompletion 2 +oo ook noncompletion 3 tijken kijken replacement 1 -hie hier explanation 1 -hie hier noncompletion 4 +hie hier noncompletion 5 hiein hierin noncompletion 1 -tan kan replacement 3 -tan dan replacement 2 +tan dan replacement 3 +tan kan replacement 6 it ik replacement 2 nowug nodig replacement 1 nowas nodig replacement 1 @@ -80,7 +76,7 @@ kas kast noncompletion 1 dich dicht noncompletion 2 diehoek driehoek noncompletion 1 choenen schoenen noncompletion 1 -daa daar noncompletion 15 +daa daar noncompletion 17 stawbewie strawberry replacement 1 tuife druiven replacement 1 boejij boerderij replacement 1 @@ -92,9 +88,9 @@ wi kwil noncompletion 1 wi wil noncompletion 1 ees eerst replacement 2 ees eens noncompletion 1 -s is noncompletion 37 +s is noncompletion 39 boe boer 
noncompletion 2 -t het noncompletion 100 +t het noncompletion 102 t met noncompletion 1 taa daar replacement 1 ope open noncompletion 1 @@ -109,14 +105,14 @@ bofu boven replacement 1 teddy teddybeer replacement 1 wa had replacement 1 wa wat noncompletion 2 -moete moet replacement 3 +moete moet replacement 4 +e even explanation 3 +e 'n replacement 3 +e een noncompletion 2 e de noncompletion 6 e het noncompletion 3 e er noncompletion 8 e en noncompletion 4 -e een noncompletion 1 -e 'n replacement 2 -e even explanation 1 e ze noncompletion 2 pele spelen noncompletion 3 kinneboejij kinderboerderij replacement 1 @@ -125,26 +121,24 @@ siko cirkel replacement 1 bjokje blokje replacement 1 kikke kikker noncompletion 1 springe springen noncompletion 1 +in erin noncompletion 3 +in deze replacement 2 in op replacement 2 -in deze replacement 1 -in erin noncompletion 1 in uit replacement 1 -in erin replacement 1 in bij replacement 2 tactor tractor noncompletion 2 +op erop replacement 5 op in replacement 1 -op erop explanation 1 -op erop replacement 2 +die ze replacement 2 +die dat replacement 3 die het replacement 1 -die dat replacement 2 -die ze replacement 1 die hij replacement 1 die daar replacement 2 z'n zijn replacement 4 n een noncompletion 99 no nog noncompletion 2 draan deraan noncompletion 1 -kom komt noncompletion 3 +kom komt noncompletion 4 fiet fietst noncompletion 1 chuiven schuiven noncompletion 1 ander anders noncompletion 1 @@ -172,12 +166,11 @@ wewk werk replacement 1 Jatteliene Jacqueline replacement 1 daaw daar replacement 2 haat gaat replacement 2 -ie hij replacement 16 +ie hij replacement 20 +ie de replacement 3 ie die noncompletion 4 ie in replacement 1 ie gaat ie explanation 1 -ie de replacement 1 -ie de explanation 1 jo zo replacement 1 weew weer replacement 5 dankie dank je replacement 1 @@ -195,15 +188,14 @@ oppelost opgelost replacement 1 alweew alweer replacement 1 naawtoe naartoe replacement 1 hebbe hebben noncompletion 1 +dees hierin replacement 2 dees deze replacement 12 -dees hierin replacement 1 -em hem noncompletion 2 -em hem replacement 1 +em hem noncompletion 3 tuurlijk natuurlijk noncompletion 1 +heb hebben replacement 3 heb heeft replacement 2 heb heet replacement 1 heb hebt noncompletion 4 -heb hebben replacement 2 heb is replacement 1 hebt heeft replacement 3 hebt heb replacement 1 @@ -219,14 +211,13 @@ ke die replacement 1 dese deze replacement 4 hijs hij is replacement 1 eigk eigenlijk replacement 1 -de het replacement 19 +de het replacement 21 de der noncompletion 11 de dan replacement 2 de deze replacement 2 moek moet replacement 2 rijken rijden replacement 1 -turen sturen replacement 1 -turen sturen noncompletion 1 +turen sturen noncompletion 2 gwoon gewoon noncompletion 2 deke deze replacement 1 keraf eraf replacement 1 @@ -239,7 +230,7 @@ wak wat replacement 1 tuur stuur noncompletion 1 kok ook replacement 1 pobreren proberen replacement 1 -ze zit explanation 2 +ze zit explanation 3 ze hem replacement 1 ze zijn replacement 2 derin erin replacement 1 @@ -257,8 +248,8 @@ mewk melk replacement 1 worst borst replacement 1 ennu en replacement 1 koeie koeien noncompletion 1 -o op noncompletion 2 -mij mijn noncompletion 8 +o op noncompletion 3 +mij mijn noncompletion 9 bet bent noncompletion 1 bent ben replacement 1 tinnen tillen replacement 1 @@ -278,96 +269,92 @@ edaa gedaan noncompletion 1 tu toch replacement 1 tu dan replacement 1 wiw wil replacement 1 -naan banaan explanation 1 -es eens noncompletion 47 -m hem noncompletion 12 -wilt wil replacement 1 -taan 
staan explanation 1 -jijjen rijden replacement 1 +naan banaan explanation 3 +es eens noncompletion 57 +m hem noncompletion 16 +wilt wil replacement 4 +taan staan explanation 3 +jijjen rijden replacement 2 zn zijn noncompletion 4 -maken aan elkaar explanation 1 +maken aan elkaar explanation 2 hu pronoun explanation 1 man de kassier explanation 1 leeuw leeuwe noncompletion 1 -ditte dit replacement 2 -d'aan eraan replacement 2 +ditte dit replacement 3 +d'aan eraan replacement 3 brbrbrbr motorgeluiden explanation 2 mn mijn noncompletion 7 omt komt explanation 4 an kan explanation 2 -kleppie klepje replacement 2 -pusses puzzelstukjes explanation 1 -gane gaan replacement 1 -puss puzzelen replacement 1 -dietuin dierentuin explanation 1 -tein trein noncompletion 1 -tein trein explanation 2 -kimmen klimmen explanation 1 -ma mag explanation 2 -klei kleine explanation 1 -karrebeertjes koalabeertjes explanation 1 -dit deze explanation 1 -pisa pistool explanation 1 -maat tomaat explanation 1 -we bewegen explanation 2 -plassijde bladzijde explanation 1 +kleppie klepje replacement 3 +pusses puzzelstukjes explanation 3 +gane gaan replacement 3 +puss puzzelen replacement 3 +dietuin dierentuin explanation 3 +tein trein noncompletion 6 +kimmen klimmen explanation 3 +ma mag explanation 6 +klei kleine explanation 3 +karrebeertjes koalabeertjes explanation 3 +dit deze explanation 3 +pisa pistool explanation 3 +maat tomaat explanation 3 +we bewegen explanation 6 +plassijde bladzijde explanation 3 konij konijn explanation 1 haatjes haartjes explanation 1 lape slapen explanation 4 -laapkamer slaapkamer noncompletion 1 +laapkamer slaapkamer noncompletion 2 saat gaat explanation 4 kool goal explanation 1 kersie kersje explanation 1 -anne andere explanation 5 +anne andere explanation 7 ee een noncompletion 9 -step schep replacement 3 +step schep replacement 4 +isse is replacement 2 isse is 'n explanation 1 -isse is replacement 1 stepje schepje explanation 1 wos worst explanation 1 kokool stoffer explanation 1 boot vlag explanation 1 -rado radio explanation 1 -poese poesje noncompletion 1 -kijt krijt noncompletion 1 -mette met replacement 2 -gooie gooien noncompletion 1 -cende centjes replacement 1 -oppe in replacement 1 +rado radio explanation 2 +poese poesje noncompletion 3 +kijt krijt noncompletion 3 +mette met replacement 5 +gooie gooien noncompletion 3 +cende centjes replacement 3 +oppe in replacement 3 zotes grote explanation 1 burg berg explanation 1 saapt slaapt explanation 2 -inne de replacement 1 -inne naar replacement 1 -jafel tafel replacement 1 +inne de replacement 2 +inne naar replacement 2 +jafel tafel replacement 2 vief lief explanation 1 gote grote explanation 1 deutje deurtje explanation 2 -jerke werk replacement 1 -ite niet replacement 1 -ansie vakantie replacement 1 -af eraf noncompletion 1 -af is eraf explanation 1 -af eraf explanation 1 +jerke werk replacement 2 +ite niet replacement 2 +ansie vakantie replacement 2 +af is eraf explanation 2 +af eraf noncompletion 2 af kan eraf explanation 1 afgewoke afgebroken explanation 1 -teintje treintje noncompletion 1 -di's dit replacement 1 -oote grote replacement 1 +teintje treintje noncompletion 2 +di's dit replacement 2 +oote grote replacement 2 jantauto brandweerauto replacement 1 annetje mannetje explanation 1 toot boot explanation 2 toot grote boot explanation 1 soeie roeien explanation 1 -tuk stuk replacement 1 -tuk stuk noncompletion 1 -tuk stuk explanation 1 +tuk stuk noncompletion 4 eze deze noncompletion 1 hatel gaat replacement 1 el gaat wel 
explanation 1 e'af moet eraf explanation 1 chep schelp explanation 2 -borsel borstel noncompletion 1 +borsel borstel noncompletion 2 slarties Smarties explanation 2 varkje varkentje explanation 2 wor word explanation 2 @@ -384,17 +371,17 @@ kein klein explanation 1 ooid gegooid explanation 1 lullie jullie explanation 1 ooien gooien explanation 1 -luiers Arjen replacement 1 -Arjen luiers replacement 1 +luiers Arjen replacement 2 +Arjen luiers replacement 2 tree twee explanation 1 -deze dit replacement 2 +deze dit replacement 4 deze waar hoort deze explanation 1 evallen gevallen noncompletion 1 nog ook explanation 1 voor van explanation 1 -wat waar replacement 1 +wat waar replacement 2 +wordt hoort replacement 2 wordt worden replacement 1 -wordt hoort replacement 1 di dit noncompletion 4 meeenoomt meegenoomt noncompletion 1 kregen gekregen noncompletion 2 @@ -405,8 +392,7 @@ maakt deed replacement 1 maakt gemaakt noncompletion 3 tien Christien noncompletion 3 kiepen omkiepen noncompletion 1 -speeld gespeeld replacement 1 -speeld gespeeld noncompletion 1 +speeld gespeeld noncompletion 2 tekker trekker explanation 2 nijnties konijnen explanation 1 daaro daarop noncompletion 1 @@ -415,13 +401,12 @@ wintik winkel replacement 1 me met noncompletion 1 me maar replacement 1 me mijn explanation 1 -me we explanation 3 -me we replacement 4 +me we replacement 7 melluk melk replacement 1 et het noncompletion 5 +mee gaat hij mee explanation 2 +mee meer noncompletion 3 mee met replacement 1 -mee meer noncompletion 2 -mee gaat hij mee explanation 1 eers eerst noncompletion 4 voorleze voorlezen noncompletion 1 knippe knippen noncompletion 1 @@ -445,44 +430,43 @@ losmak losmaken noncompletion 1 loshouen loshouden noncompletion 1 geworde geworden noncompletion 1 reuse reuzen replacement 1 -wippen wip replacement 1 -heen overheen noncompletion 1 -waagtjes wagentjes replacement 1 -stitten zitten replacement 1 -loop lopen replacement 1 -stapel stapelbed noncompletion 2 -staap stapelbed replacement 1 -tomme komt replacement 1 +wippen wip replacement 2 +heen overheen noncompletion 2 +waagtjes wagentjes replacement 2 +stitten zitten replacement 2 +loop lopen replacement 2 +stapel stapelbed noncompletion 3 +staap stapelbed replacement 2 +tomme komt replacement 2 pasten pesten explanation 1 -pantoet pannekoeken replacement 1 -eet eten replacement 1 +pantoet pannekoeken replacement 2 +eet eten replacement 2 bejo hallo explanation 1 -teje gaan replacement 1 -maar en replacement 5 +teje gaan replacement 2 +maar en replacement 7 maar want replacement 1 maar ga daar maar explanation 1 -tappe stap replacement 1 +tappe stap replacement 2 +tom komt replacement 2 tom soms replacement 1 -tom komt replacement 1 -totedil krokodil replacement 1 -he muts explanation 2 +totedil krokodil replacement 2 +he muts explanation 3 he het noncompletion 2 he heb noncompletion 1 -ga gaat noncompletion 2 +ga gaat noncompletion 5 ga gaan replacement 4 -ga gaat replacement 1 -veven geven explanation 2 -moven weg replacement 4 -choot schoot explanation 6 -poppe muts replacement 2 -mus van replacement 2 -mus muts explanation 2 -anner andere explanation 2 -ja op straat explanation 2 -see sneeuw explanation 2 -vallen gevallen explanation 2 -tasje bedje explanation 2 -begbengen wegbrengen explanation 4 +veven geven explanation 3 +moven weg replacement 7 +choot schoot explanation 9 +poppe muts replacement 3 +mus van replacement 3 +mus muts explanation 3 +anner andere explanation 3 +ja op straat explanation 3 +see sneeuw explanation 3 +vallen gevallen 
explanation 3 +tasje bedje explanation 3 +begbengen wegbrengen explanation 6 dzo zo explanation 1 bochje bochtje explanation 1 anders andere explanation 1 @@ -521,77 +505,77 @@ wasde was explanation 1 rijs Parijs noncompletion 1 waarhee waarheen noncompletion 1 dez deze noncompletion 1 -ieduleen iedereen replacement 1 -lettu redden replacement 1 -uitkijktoolu uitkijktoren replacement 1 -saafu graven replacement 1 -wate water noncompletion 1 -siesu vliegen replacement 1 +ieduleen iedereen replacement 2 +lettu redden replacement 2 +uitkijktoolu uitkijktoren replacement 2 +saafu graven replacement 2 +wate water noncompletion 2 +siesu vliegen replacement 2 +had hard noncompletion 4 had gehad replacement 1 had kon replacement 1 -had hard noncompletion 2 -leen alleen replacement 2 -leen aleen noncompletion 1 -gun ging replacement 1 -gin ging noncompletion 2 -som soms noncompletion 3 -witte wit replacement 1 -dee was replacement 1 -dan toen replacement 4 -ging gingen replacement 1 -Cas Cars noncompletion 4 -speeloet speelgoed replacement 1 -Cwas Cwars noncompletion 1 -siend vriend replacement 1 -hoo hoor noncompletion 1 -debij derbij noncompletion 3 -wuw wil replacement 1 -Eipet iPad replacement 1 -om op replacement 1 -mat mag het replacement 1 -Ensu Enzo replacement 1 -Slow Knol replacement 1 -pulletjes spulletjes noncompletion 1 -hellu willen replacement 1 -teesjut tieshirt replacement 1 -dikku sticker replacement 1 -soe zo replacement 1 -Misjie Marshall replacement 1 -Pot Porter replacement 1 -ky Sky noncompletion 1 -sewf zelf replacement 1 -sas zag replacement 2 -lew wel replacement 1 -slin ging replacement 1 -stond stonden replacement 3 -stond was replacement 1 +leen aleen noncompletion 2 +leen alleen replacement 3 +gun ging replacement 2 +gin ging noncompletion 4 +som soms noncompletion 6 +witte wit replacement 2 +dee was replacement 2 +dan toen replacement 6 +ging gingen replacement 2 +Cas Cars noncompletion 8 +speeloet speelgoed replacement 2 +Cwas Cwars noncompletion 2 +siend vriend replacement 2 +hoo hoor noncompletion 2 +debij derbij noncompletion 4 +wuw wil replacement 2 +Eipet iPad replacement 2 +om op replacement 2 +mat mag het replacement 2 +Ensu Enzo replacement 2 +Slow Knol replacement 2 +pulletjes spulletjes noncompletion 2 +hellu willen replacement 2 +teesjut tieshirt replacement 2 +dikku sticker replacement 2 +soe zo replacement 2 +Misjie Marshall replacement 2 +Pot Porter replacement 2 +ky Sky noncompletion 2 +sewf zelf replacement 2 +sas zag replacement 4 +lew wel replacement 2 +slin ging replacement 2 +stond stonden replacement 4 +stond was replacement 2 +van om replacement 2 van door replacement 1 van met replacement 1 van uit replacement 1 van veel replacement 1 van bij replacement 1 -van om replacement 1 -arreen alleen replacement 1 +arreen alleen replacement 2 +het er replacement 2 het hij replacement 2 -het er replacement 1 -jiggen liggen replacement 2 -kruisd gekruisd replacement 1 -jit ligt replacement 1 -riggen liggen replacement 1 -hel wel replacement 1 -rigt ligt replacement 1 -mog nog replacement 1 -harders harder replacement 1 -is gaat replacement 1 -allerlangzaam allerlangzaamste replacement 1 -his wist replacement 1 +jiggen liggen replacement 4 +kruisd gekruisd replacement 2 +jit ligt replacement 2 +riggen liggen replacement 2 +hel wel replacement 2 +rigt ligt replacement 2 +mog nog replacement 2 +harders harder replacement 2 +is gaat replacement 2 +allerlangzaam allerlangzaamste replacement 2 +his wist replacement 2 +bij op replacement 2 bij voor replacement 
1 -bij op replacement 1 -sumdat voordat replacement 1 -nieuw nieuwe noncompletion 1 -gezon gezonde replacement 1 -moes moest noncompletion 8 -rege leeg replacement 1 +sumdat voordat replacement 2 +nieuw nieuwe noncompletion 2 +gezon gezonde replacement 2 +moes moest noncompletion 9 +rege leeg replacement 2 denk denkt noncompletion 1 opgeten opgegeten replacement 1 kunnen kan replacement 1 @@ -827,3 +811,4 @@ sjale speciale replacement 1 zorgen verzorgen replacement 1 kantie vakantie replacement 1 zomerkantie vakantie replacement 1 +boekje 'n boekje voor David explanation 1 diff --git a/src/sastadev/data/childescorrections/donefiles.txt b/src/sastadev/data/childescorrections/donefiles.txt index 6e29391..05e3c76 100644 --- a/src/sastadev/data/childescorrections/donefiles.txt +++ b/src/sastadev/data/childescorrections/donefiles.txt @@ -1,93 +1,108 @@ -vklstap\intreebanks\STAP_09.xml -auristrain\intreebanks\TD16.xml -test_tarsp\intreebanks\test_tarsp.xml -auristrain\intreebanks\TD14.xml -auristest\intreebanks\TD10.xml -vkltarsp\intreebanks\TARSP_06.xml +vklstap\intreebanks\STAP_10.xml +vklstapfase2\intreebanks\kind1.xml +auristrain\intreebanks\TD05.xml +auristest\intreebanks\TD15.xml auristrain\intreebanks\TD09.xml +vkltarsp\intreebanks\Tarsp_02.xml +vkltarsp\intreebanks\TARSP_09.xml +vklstapfase2\intreebanks\STAP_DP.xml Auris\intreebanks\TD16.xml -vklstap\intreebanks\STAP_04.xml -vklstapfase2\intreebanks\STP_3.xml -vklasta\intreebanks\ASTA_03.xml -auristrain\intreebanks\TD19.xml -vkltarsp\intreebanks\TARSP_13.xml -vklstapfase2\intreebanks\K2.xml -vkltarsp\intreebanks\Tarsp_01.xml -auristrain\intreebanks\TD18.xml -auristrain\intreebanks\TD13.xml -vkltarsp\intreebanks\TARSP_08.xml -auristrain\intreebanks\DLD11.xml -vklastafase2\intreebanks\ASTA_15.xml -auristrain\intreebanks\TD06.xml -auristrain\intreebanks\TD05.xml -vklastafase2\intreebanks\ASTA_13.xml -vklasta\intreebanks\ASTA_09.xml -vklstap\intreebanks\STAP_08.xml -vklasta\intreebanks\ASTA_06.xml -auristrain\intreebanks\TD04.xml -vkltarsp\intreebanks\Tarsp_03.xml -vklstapfase2\intreebanks\STP_Ko.xml vklastafase2\intreebanks\ASTA_16.xml -vklasta\intreebanks\ASTA_08.xml -VKLStapFase2\intreebanks\K2.xml -auristrain\intreebanks\DLD14.xml -auristest\intreebanks\DLD20.xml -auristest\intreebanks\TD30.xml +vklasta\intreebanks\ASTA_07.xml +vklstapfase2\intreebanks\STP_Du.xml +vkltarsp\intreebanks\TARSP_07.xml +Auris\intreebanks\DLD03.xml +vklasta\intreebanks\ASTA_10.xml +vklastafase2\intreebanks\ASTA_11.xml +vklastafase2\intreebanks\ASTA_15.xml +vkltarsp\intreebanks\tarsp_01.xml vklstap\intreebanks\STAP_03.xml -auristrain\intreebanks\DLD16.xml -auristrain\intreebanks\TD23.xml +vklasta\intreebanks\asta_10.xml +vklstap\intreebanks\STAP_04.xml +vklstap\intreebanks\STAP_05.xml +vklstapfase2\intreebanks\SASTA_STAP_023.xml Auristrain\intreebanks\TD16.xml -auristrain\intreebanks\TD22.xml -vklstapfase2\intreebanks\STAP_024.xml -vklstapfase2\intreebanks\STP_Du.xml +vklasta\intreebanks\asta_04.xml +auristrain\intreebanks\TD14.xml +auristest\intreebanks\DLD20.xml +auristest\intreebanks\TD25.xml +auristrain\intreebanks\TD08.xml vklstapfase2\intreebanks\STAP025.xml +vklasta\intreebanks\ASTA_09.xml +vklstapfase2\intreebanks\SASTA_STAP_022.xml +auristrain\intreebanks\TD24.xml +AurisTrain\intreebanks\TD03.xml +Auris\intreebanks\TD03.xml vklstap\intreebanks\STAP_06.xml -vkltarsp\intreebanks\Tarsp_02.xml -vklstapfase2\intreebanks\kind1.xml -auristrain\intreebanks\TD26.xml -vklasta\intreebanks\ASTA_07.xml -vkltarsp\intreebanks\TARSP_10.xml 
-auristrain\intreebanks\TD07.xml -vkltarsp\intreebanks\TARSP_09.xml -vkltarsp\intreebanks\tarsp_01.xml +vklstapfase2\intreebanks\STP_Ko.xml +vklstapfase2\intreebanks\STP_MP_MZ.xml +vklstapfase2\intreebanks\STP_Da.xml vkltarsp\intreebanks\Tarsp_04.xml -auristest\intreebanks\TD25.xml -vklstapfase2\intreebanks\STAP_DP.xml -vklstap\intreebanks\STAP_10.xml vkltarsp\intreebanks\Tarsp_05.xml -vklastafase2\intreebanks\ASTA_14.xml -auristrain\intreebanks\TD21.xml +auristrain\intreebanks\TD23.xml +auristest\intreebanks\TD10.xml +auristrain\intreebanks\TD16.xml +auristrain\intreebanks\DLD11.xml Auris\intreebanks\TD01.xml -vklstapfase2\intreebanks\SASTA_STAP_023.xml +auristrain\intreebanks\TD06.xml +auristrain\intreebanks\TD19.xml +vklstapfase2\intreebanks\STP_3.xml +auristrain\intreebanks\TD18.xml +vkltarsp\intreebanks\Tarsp_03.xml +vklasta\intreebanks\ASTA_04.xml +Auris\intreebanks\TD13.xml +AurisTest\intreebanks\TD01.xml +Auristrain\intreebanks\TD07.xml +auristrain\intreebanks\TD21.xml test_stap\intreebanks\test_stap.xml -auristest\intreebanks\DLD07.xml -vklasta\intreebanks\ASTA_01.xml -vklstapfase2\intreebanks\STP_MP_MZ.xml -auristest\intreebanks\TD15.xml +auristrain\intreebanks\TD02.xml +vkltarsp\intreebanks\TARSP_08.xml +auristrain\intreebanks\TD26.xml +auristrain\intreebanks\DLD03.xml +vklstap\intreebanks\STAP_09.xml +VKLStapFase2\intreebanks\K2.xml +vkltarsp\intreebanks\Tarsp_01.xml +vklasta\intreebanks\ASTA_03.xml +auristest\intreebanks\TD01.xml +Auris\intreebanks\TD18.xml auristrain\intreebanks\TD29.xml -Auris\intreebanks\DLD16.xml -vklstap\intreebanks\STAP_07.xml -vklstap\intreebanks\STAP_02.xml +vklstapfase2\intreebanks\STAP_024.xml auristrain\intreebanks\TD03.xml -auristrain\intreebanks\TD02.xml -test_asta\intreebanks\test_asta.xml +vklasta\intreebanks\asta_01.xml vklasta\intreebanks\ASTA_05.xml +vkltarsp\intreebanks\TARSP_13.xml +vklastafase2\intreebanks\ASTA_13.xml +vklastafase2\intreebanks\ASTA_14.xml +Auris\intreebanks\DLD16.xml +auristest\intreebanks\TD20.xml +auristrain\intreebanks\DLD14.xml +auristrain\intreebanks\TD07.xml +vklstapfase2\intreebanks\K2.xml +AurisTrain\intreebanks\TD02.xml +auristrain\intreebanks\TD12.xml +vkltarsp\intreebanks\TARSP_10.xml +auristrain\intreebanks\DLD16.xml +vklasta\intreebanks\ASTA_08.xml vklasta\intreebanks\ASTA_02.xml -vklstapfase2\intreebanks\STP_Da.xml -Auris\intreebanks\TD13.xml -auristrain\intreebanks\DLD03.xml -vklasta\intreebanks\ASTA_04.xml -vklasta\intreebanks\ASTA_10.xml -vklastafase2\intreebanks\ASTA_11.xml +Auris\intreebanks\TD02.xml +auristest\intreebanks\TD30.xml +test_asta\intreebanks\test_asta.xml +auristrain\intreebanks\TD22.xml +vklstap\intreebanks\STAP_08.xml +auristrain\intreebanks\TD04.xml +Auris\intreebanks\TD11.xml +Auristrain\intreebanks\DLD03.xml +auchanntest\intreebanks\auchanNtest01.xml +vkltarsp\intreebanks\TARSP_06.xml +auristrain\intreebanks\TD13.xml +vklstap\intreebanks\STAP_07.xml auristrain\intreebanks\TD11.xml -auristrain\intreebanks\TD08.xml -vklstap\intreebanks\STAP_05.xml +vklasta\intreebanks\ASTA_01.xml +vklasta\intreebanks\ASTA_06.xml +auristest\intreebanks\DLD07.xml vklstapfase2\intreebanks\STP_KC.xml -vklstapfase2\intreebanks\SASTA_STAP_022.xml -vkltarsp\intreebanks\TARSP_07.xml -auristrain\intreebanks\TD12.xml +vklstap\intreebanks\stap_03.xml +test_tarsp\intreebanks\test_tarsp.xml +vklstap\intreebanks\stap_02.xml +vklstap\intreebanks\STAP_02.xml auristrain\intreebanks\TD28.xml -auristrain\intreebanks\TD24.xml -Auris\intreebanks\TD18.xml -auristest\intreebanks\TD01.xml -auristest\intreebanks\TD20.xml diff 
--git a/src/sastadev/data/contextcorrections/contextcorrections.xlsx b/src/sastadev/data/contextcorrections/contextcorrections.xlsx new file mode 100644 index 0000000..0cae438 Binary files /dev/null and b/src/sastadev/data/contextcorrections/contextcorrections.xlsx differ diff --git a/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt index 2e7fb1c..7e1e5aa 100644 --- a/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt +++ b/src/sastadev/data/filledpauseslexicon/additionalwordslexicon.txt @@ -36,4 +36,8 @@ koffie Smarties omie m'n -z'n \ No newline at end of file +z'n +speellokaal +kinderfeest +kinderfeestje +damespaardje \ No newline at end of file diff --git a/src/sastadev/data/filledpauseslexicon/wrongposwordslexicon.txt b/src/sastadev/data/filledpauseslexicon/wrongposwordslexicon.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/sastadev/data/methods/TARSP_Index_Current.xlsx b/src/sastadev/data/methods/TARSP_Index_Current.xlsx index b9b78fe..524e996 100644 Binary files a/src/sastadev/data/methods/TARSP_Index_Current.xlsx and b/src/sastadev/data/methods/TARSP_Index_Current.xlsx differ diff --git a/src/sastadev/data/nochildwords/nochildwords.txt b/src/sastadev/data/nochildwords/nochildwords.txt index 5c20ccd..1901996 100644 --- a/src/sastadev/data/nochildwords/nochildwords.txt +++ b/src/sastadev/data/nochildwords/nochildwords.txt @@ -1 +1,6 @@ -tanen \ No newline at end of file +tanen +pee +'s +pele +pelen +toeten \ No newline at end of file diff --git a/src/sastadev/data/subcatlexicon/sasta_subcatlexicon.xlsx b/src/sastadev/data/subcatlexicon/sasta_subcatlexicon.xlsx new file mode 100644 index 0000000..208bd0c Binary files /dev/null and b/src/sastadev/data/subcatlexicon/sasta_subcatlexicon.xlsx differ diff --git a/src/sastadev/data/subcatlexicon/vkvkschl_lemma_sc-frq.xlsx b/src/sastadev/data/subcatlexicon/vkvkschl_lemma_sc-frq.xlsx new file mode 100644 index 0000000..26ca956 Binary files /dev/null and b/src/sastadev/data/subcatlexicon/vkvkschl_lemma_sc-frq.xlsx differ diff --git a/src/sastadev/data/subcatlexicon/vkvkscl_ww_sc_role_frq.xlsx b/src/sastadev/data/subcatlexicon/vkvkscl_ww_sc_role_frq.xlsx new file mode 100644 index 0000000..56a4372 Binary files /dev/null and b/src/sastadev/data/subcatlexicon/vkvkscl_ww_sc_role_frq.xlsx differ diff --git a/src/sastadev/data/wordsunknowntoalpino/wordsunknowntoalpino.txt b/src/sastadev/data/wordsunknowntoalpino/wordsunknowntoalpino.txt index 03936fe..6fc5cdd 100644 --- a/src/sastadev/data/wordsunknowntoalpino/wordsunknowntoalpino.txt +++ b/src/sastadev/data/wordsunknowntoalpino/wordsunknowntoalpino.txt @@ -1,9 +1,14 @@ gymmen trainen gymt traint gym train +gym gymnastiek gegymd getraind gymmend trainend gymmende trainende gymmenden trainenden gymde trainde gymden trainden +smarties toffees +Smarties toffees +smartie toffee +Smartie toffee diff --git a/src/sastadev/lexicon.py b/src/sastadev/lexicon.py index 36d896a..006ed17 100644 --- a/src/sastadev/lexicon.py +++ b/src/sastadev/lexicon.py @@ -8,16 +8,19 @@ ''' +from collections import defaultdict import os from typing import Any, Dict, List, Optional from sastadev import celexlexicon, treebankfunctions from sastadev.conf import settings +from sastadev.methods import asta, stap, tarsp, MethodName from sastadev.namepartlexicon import (namepart_isa_namepart, namepart_isa_namepart_uc) from sastadev.readcsv import readcsv from sastadev.sastatypes import CELEX_INFL, DCOITuple, Lemma, 
SynTree, WordInfo +alpinoparse = settings.PARSE_FUNC space = ' ' celex = 'celex' @@ -63,6 +66,14 @@ def initializelexicondict(lexiconfilename) -> Dict[str,str]: lexicon[strippedword] = strippedreplacement return lexicon +def initializelexicondefdict(lexiconfilename) -> Dict[str,List[str]]: + lexicon = defaultdict(list) + fptuples = readcsv(lexiconfilename, header=False) + for _, fp in fptuples: + strippedword = fp[0].strip() + strippedreplacement = fp[1].strip() + lexicon[strippedword].append(strippedreplacement) + return lexicon def isa_namepart(word: str) -> bool: ''' @@ -198,18 +209,60 @@ def chatspecial(word: str) -> bool: def known_word(word: str) -> bool: ''' a word is considered to be a known_word if it occurs in the word form lexicon, - if it is a name part, or if it is a chatspecial item, or in a lexicon with additional words + if it is a name part, or if it is a chatspecial item, or in a lexicon with additional words, + or a compound noun recognised as such by Alpino but not in the nonwordslexicon :param word: :return: ''' result = informlexicon(word) or isa_namepart(word) or \ chatspecial(word) or word in additionalwordslexicon or \ - isallersuperlative(word) + isallersuperlative(word) or isalpinonouncompound(word) result = result and word not in nonwordslexicon return result +comma = ',' +compoundsep = '_' + +def validword(wrd: str, methodname: MethodName) -> bool: + result = known_word(wrd) + if methodname in {tarsp, stap}: + result = result and not nochildword(wrd) + return result + +def nochildword(wrd: str) -> bool: + result = wrd in nochildwords + return result + +def isalpinonouncompound(wrd: str) -> bool: + fullstr = f'geen {wrd}' # geen makes it a noun and can combine with uter and neuter, count and mass, sg and plural + tree = alpinoparse(fullstr) + # find the noun + if tree is None: + settings.LOGGER.error(f'Parsing {fullstr} failed') + return False + nounnode = treebankfunctions.find1(tree, './/node[@pt="n"]') + if nounnode is None: + settings.LOGGER.error(f'No noun found in {fullstr} parse') + return False + nounwrd = treebankfunctions.getattval(nounnode, 'word') + if nounwrd != wrd: + settings.LOGGER.error(f'Wrong noun ({nounwrd}) found in {fullstr} parse') + return False + nounlemma = treebankfunctions.getattval(nounnode, 'lemma') + if compoundsep in nounlemma: + parts = nounlemma.split(compoundsep) + unknownparts = [part for part in parts if not known_word(part)] + result = unknownparts == [] + if not result: + settings.LOGGER.error(f'Unknown words ({comma.join(unknownparts)}) found in {fullstr} parse') + return False + return True + else: + return False + + def isallersuperlative(wrd:str) -> bool: result = wrd.startswith('aller') and (wrd.endswith('st') or wrd.endswith('ste')) and informlexicon(wrd[5:]) return result @@ -243,7 +296,7 @@ def getinflforms(thesubj: SynTree, thepv: SynTree, inversion: bool) -> List[str] lexiconfoldername = 'data/wordsunknowntoalpino' wordsunknowntoalpinofilename = 'wordsunknowntoalpino.txt' wordsunknowntoalpinofullname = os.path.join(settings.SD_DIR, lexiconfoldername, wordsunknowntoalpinofilename) -wordsunknowntoalpinolexicondict = initializelexicondict(wordsunknowntoalpinofullname) +wordsunknowntoalpinolexicondict = initializelexicondefdict(wordsunknowntoalpinofullname) lexiconfoldername = 'data/filledpauseslexicon' @@ -273,4 +326,7 @@ def getinflforms(thesubj: SynTree, thepv: SynTree, inversion: bool) -> List[str] wrongposwordslexiconfilename = 'wrongposwordslexicon.txt' wrongposwordslexiconfullname = os.path.join(settings.SD_DIR, 
lexiconfoldername, wrongposwordslexiconfilename) -wrongposwordslexicon = initializelexicon(wrongposwordslexiconfullname) \ No newline at end of file +wrongposwordslexicon = initializelexicon(wrongposwordslexiconfullname) + +# validnouns is intended for nouns that Alpino assigns the frame noun(both,both,both) but that are valid Dutch words +validnouns = {'knijper'} \ No newline at end of file diff --git a/src/sastadev/mainrel.py b/src/sastadev/mainrel.py deleted file mode 100644 index 3840b14..0000000 --- a/src/sastadev/mainrel.py +++ /dev/null @@ -1,10 +0,0 @@ - -mainrelxpath = './/node[@rel="--" and @cat="rel"]' - - -def mainrelcount(stree: SynTree) -> int: - mainrels = stree.xpath(mainrelxpath) - result = len(mainrels) - return result - -Criterion('mainrelcount', mainrelcount, negative, 'Dislike main relative clauses') \ No newline at end of file diff --git a/src/sastadev/metadata.py b/src/sastadev/metadata.py index 3d0a176..3248456 100644 --- a/src/sastadev/metadata.py +++ b/src/sastadev/metadata.py @@ -10,6 +10,21 @@ defaultbackplacement = bpl_none SASTA = 'SASTA' +ADULTSPELLINGCORRECTION = 'AdultSpellingCorrection' +ALLSAMPLECORRECTIONS = 'AllSampleCorrections' +BASICREPLACEMENTS = 'BasicReplacements' +CHILDRENSPELLINGCORRECTION = 'ChildrenSpellingCorrection' +CONTEXT = 'Context' +HISTORY = 'History' +THISSAMPLECORRECTIONS = 'ThisSampleCorrections' + + +EXTRAGRAMMATICAL = 'ExtraGrammatical' + +replacementsubsources = [ADULTSPELLINGCORRECTION, ALLSAMPLECORRECTIONS, BASICREPLACEMENTS, + CHILDRENSPELLINGCORRECTION, CONTEXT, HISTORY, THISSAMPLECORRECTIONS] + space = ' ' metakw = '##META' @@ -125,11 +140,11 @@ def selectmeta(name, metadatalist): return None -def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalty, backplacement=defaultbackplacement): +def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, source=SASTA, penalty=defaultpenalty, backplacement=defaultbackplacement): result = Meta(name, value, annotatedposlist=[token.pos], annotatedwordlist=[token.word], annotationposlist=[nwt.pos], annotationwordlist=[ nwt.word], cat=cat, subcat=subcat, source=source, penalty=penalty, backplacement=backplacement) return result diff --git a/src/sastadev/resultsbyutterance.py b/src/sastadev/resultsbyutterance.py index 841b6d1..d2c6d03 100644 --- a/src/sastadev/resultsbyutterance.py +++ b/src/sastadev/resultsbyutterance.py @@ -5,21 +5,33 @@ from collections import Counter, defaultdict from typing import Dict, List, Tuple +from sastadev.allresults import AllResults from sastadev.conf import settings +from sastadev.methods import Method +from sastadev.query import query_inform from sastadev.rpf1 import getscores -from sastadev.sastatypes import GoldResults, QId, ResultsDict, UttId +from sastadev.sastatypes import GoldResults, QId, ResultsDict, Table, UttId + +comma = ',' +space = ' ' notapplicable = (0.0, 0.0, 0.0) +byuttheader = ['uttid', 'results', 'bronzeref', 'silverref'] + ['br', 'bp', 'bf1'] + ['sr', 'sp', 'sf1'] + ['utterance'] + ResultsByUttDict = Dict[UttId, List[QId]] ScoresByUttDict = Dict[UttId, List[Tuple[float, float, float]]] -def getresultsbyutt(results: ResultsDict) -> ResultsByUttDict: +def getresultsbyutt(results: ResultsDict, method: Method) -> ResultsByUttDict: resultsbyuttdict: ResultsByUttDict = defaultdict(Counter) - for qid in results: - for uttid in results[qid]: - resultsbyuttdict[uttid].update([qid]) + for reskey in results: + qid = reskey[0] + if qid in method.queries: + 
thequery = method.queries[qid] + if query_inform(thequery): + for uttid in results[reskey]: + resultsbyuttdict[uttid].update([reskey]) return resultsbyuttdict @@ -47,13 +59,75 @@ def getreference(goldscores: GoldResults) -> ResultsDict: return reference -def getscoresbyutt(results: ResultsDict, goldscores: GoldResults) -> ScoresByUttDict: - debug = True +def getscoresbyutt(results: ResultsDict, refscores: ResultsDict, method: Method) -> ScoresByUttDict: + debug = False - resultsbyutt = getresultsbyutt(results) + resultsbyutt = getresultsbyutt(results, method) - reference = getreference(goldscores) - referencebyutt = getresultsbyutt(reference) + # reference = getreference(goldscores) + referencebyutt = getresultsbyutt(refscores, method) scoresbyutt = getscoresbyutt2(resultsbyutt, referencebyutt) if debug: for uttid, triple in scoresbyutt.items(): print(uttid, triple) return scoresbyutt + + +def mkscoresbyuttrows(allresults: AllResults, bronzerefscores: ResultsDict, silverrefscores: ResultsDict, + method: Method) -> Table: + results = allresults.coreresults + resultsbyutt = getresultsbyutt(results, method) + bronzebyutt = getresultsbyutt(bronzerefscores, method) + silverbyutt = getresultsbyutt(silverrefscores, method) + bronzescoresbyutt = getscoresbyutt2(resultsbyutt, bronzebyutt) + silverscoresbyutt = getscoresbyutt2(resultsbyutt, silverbyutt) + resultsuttids = {uttid for uttid in resultsbyutt} + bronzeuttids = {uttid for uttid in bronzebyutt} + silveruttids = {uttid for uttid in silverbyutt} + alluttids = resultsuttids.union(bronzeuttids.union(silveruttids)) + alluttidlist = list(alluttids) + sortedalluttidlist = sorted(alluttidlist, key=lambda x: int(x)) + rows = [] + for uttid in sortedalluttidlist: + if uttid in resultsuttids: + resultsstr = counter2str(resultsbyutt[uttid], method) + else: + resultsstr = '' + if uttid in bronzebyutt: + bronzeref = counter2str(bronzebyutt[uttid], method) + else: + bronzeref = '' + if uttid in silverbyutt: + silverref = counter2str(silverbyutt[uttid], method) + else: + silverref = '' + if uttid in bronzescoresbyutt: + r, p, f1 = bronzescoresbyutt[uttid] + bronzescores = [r, p, f1] + else: + r, p, f1 = notapplicable + bronzescores = [r, p, f1] + if uttid in silverscoresbyutt: + r, p, f1 = silverscoresbyutt[uttid] + silverscores = [r, p, f1] + else: + r, p, f1 = notapplicable + silverscores = [r, p, f1] + utt = space.join(allresults.allutts[uttid]) if uttid in allresults.allutts else '@@' + fullrow = [uttid, resultsstr, bronzeref, silverref] + bronzescores + silverscores + [utt] + rows.append(fullrow) + return rows + +def counter2itemlist(scores: Counter, method: Method) -> List[str]: + resultlist = [] + for reskey in scores: + qid = reskey[0] + thequery = method.queries[qid] + theitem = thequery.item if reskey[0] == reskey[1] else f'{thequery.item}={reskey[1]}' + sublist = scores[reskey] * [theitem] + resultlist += sublist + sortedresultlist = sorted(resultlist) + return sortedresultlist + +def counter2str(scores: Counter, method: Method) -> str: + resultlist = counter2itemlist(scores, method) + result = comma.join(resultlist) + return result \ No newline at end of file diff --git a/src/sastadev/sas_impact.py b/src/sastadev/sas_impact.py new file mode 100644 index 0000000..e4b45f4 --- /dev/null +++ b/src/sastadev/sas_impact.py @@ -0,0 +1,67 @@ +from collections import Counter +import copy +from sastadev.allresults import AllResults +from sastadev.methods import Method +from sastadev.rpf1 import getscores, getevalscores +from sastadev.resultsbyutterance import getresultsbyutt, getscoresbyutt2 +from typing import Dict, List, Tuple + +def 
sas_impact(allresults: AllResults, silverrefscores, method: Method): + # maximum number of utterances to be reviewed + n = 10 + f1target = 95 + + results = allresults.coreresults + resultsbyutt = getresultsbyutt(results, method) + silverbyutt = getresultsbyutt(silverrefscores, method) + silverscoresbyutt = getscoresbyutt2(resultsbyutt, silverbyutt) + + # sort them by silver F1, ascending, so the worst-scoring utterances come first + silverscorebyuttlist = [(uttid, score) for uttid, score in silverscoresbyutt.items()] + sortedsilverscorebyutt = sorted(silverscorebyuttlist, key=lambda x: x[1][2]) + + resultscount, refcount, intersectioncount = getcomparisoncounts(resultsbyutt, silverbyutt) + originalscores = getevalscores(resultscount, refcount, intersectioncount) + + sasresultsbyutt = copy.deepcopy(resultsbyutt) + allscores = [originalscores] + for i in range(n): + + # change the results to the silver reference + curruttid = sortedsilverscorebyutt[i][0] + sasresultsbyutt[curruttid] = silverbyutt[curruttid] + + # compute the overall score + resultscount, refcount, intersectioncount = getcomparisoncounts(sasresultsbyutt, silverbyutt) + newscores = getevalscores(resultscount, refcount, intersectioncount) + allscores.append(newscores) + if newscores[2] >= f1target: + break + return allscores + + +def mksas_impactrows(allscores: List[Tuple[float, float, float]], not100count: int) -> Tuple[List[str], List[List[float]]]: + # a list of the F1 scores, plus a header + row = [score[2] for score in allscores] + lrow = len(row) - 1 + header = ['not100count', 'original'] + [f'{str(i+1)} utts reviewed' for i in range(lrow)] + rows = [[not100count] + row] + return header, rows + + +def getcomparisoncounts(results: Dict[str, Counter], reference: Dict[str, Counter]) -> Tuple[int, int, int]: + resultscount = 0 + referencecount = 0 + intersectioncount = 0 + + for key in results: + resultscount += sum(results[key].values()) + + for key in reference: + referencecount += sum(reference[key].values()) + + for key in results: + if key in reference: + intersection = results[key] & reference[key] + intersectioncount += sum(intersection.values()) + return resultscount, referencecount, intersectioncount diff --git a/src/sastadev/semantic_compatibility.py b/src/sastadev/semantic_compatibility.py index bcb66ab..ba217fe 100644 --- a/src/sastadev/semantic_compatibility.py +++ b/src/sastadev/semantic_compatibility.py @@ -1,6 +1,8 @@ from lxml import etree import re from sastadev.conf import settings +from sastadev.metadata import Meta +from sastadev.methods import MethodName from sastadev.NLtypes import Animate, AnyType, Event, Human, Object, SemType, UnKnown, Alt, And from sastadev.sastatypes import List, SynTree from sastadev.semtypelexicon import sh, vnwsemdict, wwsemdict, wwreqsemdict, defaultreqsemdict @@ -171,7 +173,7 @@ def barebarecompatible(sem1: SemType, sem2: SemType) -> bool: wordnodexpath = './/node[@word]' -def semincompatiblecount(stree: SynTree) -> int: +def semincompatiblecount(stree: SynTree, md:List[Meta], mn:MethodName) -> int: sentence = getsentence(stree) # mainly for debugging ease result = 0 # gather the words diff --git a/src/sastadev/semtypelexicon.py b/src/sastadev/semtypelexicon.py index ad8af2f..a7d0862 100644 --- a/src/sastadev/semtypelexicon.py +++ b/src/sastadev/semtypelexicon.py @@ -103,7 +103,9 @@ def aa(semtypelist: List[SemType]) -> Alt: # lemma frame[2] semreq=List[Dict[rel: semtype]] semtype verbs = [ ('liggen', 'intransitive', [{su: sh(Object)}], sh(State)), ('maken', 'pred_np', [{su: sh(Animate), obj1: sh(Object), predc:Alt([And([State]), And([Property])])}], 
sh(Activity)), - ('kapot_maken', 'part_transitive(kapot)', [{su: sh(Animate), obj1: sh(Object)}], sh(Activity)) + ('kapot_maken', 'part_transitive(kapot)', [{su: sh(Animate), obj1: sh(Object)}], sh(Activity)), + ('maaien', 'intransitive', [{su: sh(Animate)}], sh(Activity)), + ('maaien', 'transitive', [{su: sh(Animate), obj1: sh(NonAnimate)}], sh(Activity)) ] wwsemdict = {(lemma, frame): semtype for (lemma, frame, _, semtype) in verbs } diff --git a/src/sastadev/stringfunctions.py b/src/sastadev/stringfunctions.py index d4e683a..020ea67 100644 --- a/src/sastadev/stringfunctions.py +++ b/src/sastadev/stringfunctions.py @@ -11,6 +11,8 @@ comma = ',' underscore = '_' +punctuationchars = """`!()-{}[]:;"'<>,.?""" # should actually use unicode categories + # for selecting nonempty tokens from a csvstring ; comma between single quotes is allowed csvre = "'[^']+'|[^,' ]+" csvpat = re.compile(csvre) @@ -75,6 +77,9 @@ def star(str: str) -> str: ''' return '({})*'.format(str) +def ispunctuation(wrd: str) -> bool: + result = wrd in punctuationchars + return result def alt(strlist: Sequence[str], grouped: bool = True) -> str: ''' diff --git a/src/sastadev/subcatprefs.py b/src/sastadev/subcatprefs.py index c5b7a6a..1ea499b 100644 --- a/src/sastadev/subcatprefs.py +++ b/src/sastadev/subcatprefs.py @@ -43,10 +43,11 @@ def getsubcatprefscore(stree: SynTree) -> int: verbnodes = stree.xpath('.//node[@pt="ww"]') for verbnode in verbnodes: sc = getattval(verbnode, 'sc') + lemma = getattval(verbnode, 'lemma') if (lemma, sc) in oth_subcatlexicon: resultscore += oth_subcatlexicon[(lemma, sc)] elif (lemma, sc) in trg_subcatlexicon: - resultscore += trg_subcatlexicon + resultscore += trg_subcatlexicon[(lemma, sc)] return resultscore @@ -102,3 +103,5 @@ def getsubcatprefscore(stree: SynTree) -> int: del oth_temp2subcatlexicon del trg_temp2subcatlexicon del all_temp2subcatlexicon + +junk = 0 diff --git a/src/sastadev/testing.py b/src/sastadev/testing.py new file mode 100644 index 0000000..59fac01 --- /dev/null +++ b/src/sastadev/testing.py @@ -0,0 +1,6 @@ +from sastadev.lexicon import validword, nochildword + +ok = validword('pele', 'tarsp') + +ok = nochildword('pele') +junk = 0 \ No newline at end of file diff --git a/src/sastadev/toe.py b/src/sastadev/toe.py index 4014647..43913ab 100644 --- a/src/sastadev/toe.py +++ b/src/sastadev/toe.py @@ -1,6 +1,7 @@ from sastadev.treebankfunctions import getattval, getnodeyield, mktoken2nodemap from sastadev.conf import settings from sastadev.alpino import getdehetwordinfo +from sastadev.lexicon import isalpinonouncompound from sastadev.sastatypes import SynTree from sastadev.sastatoken import Token from typing import List @@ -76,7 +77,13 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]: nominalpts = ['n', 'vnw'] def isnominal(node: SynTree) -> bool: pt = getattval(node, 'pt' ) - return pt in nominalpts + wrd = getattval(node, 'word') + if pt in nominalpts: + return True + elif isalpinonouncompound(wrd): + return True + else: + return False
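One detail of the sas_impact module worth spelling out: getcomparisoncounts does micro-averaged counting over per-utterance Counters, and Counter's & operator takes per-item minima, i.e. the multiset intersection of found and reference hits. A small self-contained check with made-up utterance ids and item counts (getevalscores is assumed to turn these three totals into recall/precision/F1, as its origin in rpf1 suggests):

    from collections import Counter

    results = {'1': Counter({'T001': 2, 'T076': 1}), '2': Counter({'T001': 1})}
    reference = {'1': Counter({'T001': 1, 'T076': 1}), '3': Counter({'T003': 1})}

    resultscount = sum(sum(c.values()) for c in results.values())      # 4 hits produced
    referencecount = sum(sum(c.values()) for c in reference.values())  # 3 hits in the reference
    intersectioncount = sum(sum((results[k] & reference[k]).values())  # min(2,1) + min(1,1) = 2
                            for k in results if k in reference)
    print(resultscount, referencecount, intersectioncount)  # 4 3 2, i.e. precision 2/4, recall 2/3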