Skip to content

Commit

Permalink
spellingcorrection updates, evaluation updates
Browse files Browse the repository at this point in the history
  • Loading branch information
JanOdijk committed Oct 9, 2024
1 parent 08743c8 commit 8b954c7
Show file tree
Hide file tree
Showing 35 changed files with 1,177 additions and 476 deletions.
16 changes: 9 additions & 7 deletions src/sastadev/CHAT_Annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
CHAT = 'CHAT'

CHAT_explanation = 'Explanation'
CHAT_wordnoncompletion = 'Noncompletion of a Word'
CHAT_reformulation = 'Reformulation'
CHAT_repetition = 'Repetition'
CHAT_replacement = 'Replacement'

CHAT_retracing = 'Retracing'
CHAT_wordnoncompletion = 'Noncompletion of a Word'

monadic = 1
dyadic = 2
Expand Down Expand Up @@ -78,7 +80,7 @@ def refunction(x):
specialformpat = wordpat + r'(?:@z:\w\w\w|@\w\w?\w?)'
fullspecialformpat = fullre(specialformpat)
specialformre = re.compile(fullspecialformpat)
repkeepannotations = ['Repetition', 'Retracing', 'Reformulation']
repkeepannotations = [CHAT_repetition, CHAT_retracing, CHAT_reformulation]


def getreplacement(repkeep, annotation):
Expand Down Expand Up @@ -817,7 +819,7 @@ def result(x, y):
CHAT_ComplexRegex(
(r'\[=', anybutrb, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Replacement', '8.3:69', '10.3:73',
CHAT_Annotation(CHAT_replacement, '8.3:69', '10.3:73',
CHAT_ComplexRegex(
(r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True),
complexmetafunction_replbpl),
Expand All @@ -832,13 +834,13 @@ def result(x, y):
CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic),
simplescopedmetafunction),
CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
CHAT_Annotation(CHAT_repetition, '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
simplescopedmetafunction),
CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76',
CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'), (keep, eps), True), complexmetafunction),
CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
CHAT_Annotation(CHAT_retracing, '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
simplescopedmetafunction),
CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
CHAT_Annotation(CHAT_reformulation, '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
simplescopedmetafunction),
CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77',
CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
Expand Down
28 changes: 24 additions & 4 deletions src/sastadev/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,12 @@
ResultsKey, mkresultskey, scores2counts,
showreskey)
from sastadev.conf import settings
from sastadev.constants import (bronzefolder, bronzesuffix, checksuffix, checkeditedsuffix,
from sastadev.constants import (analysissuffix, bronzefolder, bronzesuffix, byuttscoressuffix, checksuffix, checkeditedsuffix,
formsfolder, intreebanksfolder,
loggingfolder, outtreebanksfolder, permprefix, platinumsuffix,
platinumeditedsuffix,
resultsfolder, silverfolder, silverpermfolder, silversuffix)
from sastadev.context import getcontextdict
from sastadev.correctionparameters import CorrectionParameters
from sastadev.correcttreebank import (correcttreebank, corr0, corrn, errorwbheader, validcorroptions)
from sastadev.counterfunctions import counter2liststr
Expand All @@ -178,6 +179,8 @@
post_process, query_exists, query_inform)
from sastadev.readcsv import writecsv
from sastadev.readmethod import itemseppattern, read_method
from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader
from sastadev.sas_impact import getcomparisoncounts, mksas_impactrows, sas_impact
from sastadev.sastatypes import (AltCodeDict, ExactResultsDict, FileName,
GoldTuple, MatchesDict, MethodName, QId,
QIdCount, QueryDict, ResultsCounter,
Expand All @@ -197,7 +200,7 @@
from sastadev.treebankfunctions import (find1, getattval, getnodeendmap, getuttid,
getxmetatreepositions, getxsid,
getyield, showtree)
from sastadev.xlsx import mkworkbook
from sastadev.xlsx import mkworkbook, add_worksheet


start_time = time.time()
Expand Down Expand Up @@ -1204,8 +1207,10 @@ def main():
else:
mergedsamplecorrections = {}

contextdict = getcontextdict(treebank2, lambda x: True)

correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections, thissamplecorrections)
correctionparameters = CorrectionParameters(methodname, options, mergedsamplecorrections,
thissamplecorrections, treebank2, contextdict)

treebank, errordict, allorandalts = correcttreebank(treebank2, targets, correctionparameters, corr=corr)

Expand Down Expand Up @@ -1274,12 +1279,27 @@ def main():
silverscores = exact2results(exactsilverscores) # ongoing
silvercounts = scores2counts(silverscores)

# scores by utterance
# bronzescoresbyutt = getscoresbyutt(allresults.coreresults, goldscores)
# silverscoresbyutt = getscoresbyutt(allresults.coreresults, silverscores)

byuttrows = mkscoresbyuttrows(allresults, goldscores, silverscores, themethod)
not100count = len([row for row in byuttrows if row[9] != 100])
scoresbyuttoutfullname = os.path.join(resultspath, corefilename + byuttscoressuffix + '.xlsx')
wb = mkworkbook(scoresbyuttoutfullname, [byuttheader], byuttrows, freeze_panes=(1,0) )
allbyuttscores = sas_impact(allresults, silverscores, themethod)
sasheader, sasimpactrows = mksas_impactrows(allbyuttscores, not100count)
add_worksheet(wb,[sasheader], sasimpactrows, sheetname='SAS_impact', freeze_panes=(1,0))
wb.close()



# next is now obsolete
# platinumresults: Dict[ResultsKey, Counter] = reduceresults(platinumresults, samplesizetuple, options.methodname)

(base, ext) = os.path.splitext(options.infilename)
outputfullname = os.path.join(
resultspath, corefilename + "_analysis" + tsvext + txtext)
resultspath, corefilename + analysissuffix + tsvext + txtext)
outfile = open(outputfullname, 'w', encoding='utf8')

outxlsx = os.path.join(resultspath, corefilename + "_analysis" + xlsxext)
Expand Down
39 changes: 39 additions & 0 deletions src/sastadev/alpinocompound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from sastadev.treebankfunctions import find1, getattval
from sastadev.conf import settings
from sastadev.lexicon import known_word


# TODO: to be moved to the lexicon module
# separator used to join the unknown compound parts in the log message below
comma = ','
# character on which a noun lemma is split into its compound parts
compoundsep = '_'

# parse function taken from the settings; called below with a string,
# and its result is checked against None before use
alpinoparse = settings.PARSE_FUNC


def isalpinocompound(wrd: str) -> bool:
    """Return True if the parser analyses *wrd* as a compound of known words.

    The word is parsed in the frame ``geen <wrd>``. The first noun node of
    the parse is looked up; if its ``word`` attribute equals *wrd* and its
    ``lemma`` attribute contains ``compoundsep``, the lemma is split on
    ``compoundsep`` and every part is checked with ``known_word``.

    :param wrd: the word to be tested
    :return: True only if the noun lemma contains ``compoundsep`` and all of
        its parts are known words; False otherwise. An error is logged when
        parsing fails, when no noun is found, when the noun found is not
        *wrd*, or when at least one compound part is unknown.
    """
    # 'geen' makes it a noun and can combine with uter and neuter,
    # count and mass, sg and plural
    fullstr = f'geen {wrd}'
    tree = alpinoparse(fullstr)
    if tree is None:
        settings.LOGGER.error(f'Parsing {fullstr} failed')
        return False

    # find the noun
    nounnode = find1(tree, './/node[@pt="n"]')
    if nounnode is None:
        settings.LOGGER.error(f'No noun found in {fullstr} parse')
        return False

    nounwrd = getattval(nounnode, 'word')
    if nounwrd != wrd:
        settings.LOGGER.error(f'Wrong noun ({nounwrd}) found in {fullstr} parse')
        return False

    nounlemma = getattval(nounnode, 'lemma')
    if compoundsep not in nounlemma:
        # the lemma has no compound separator, so it is not a compound
        return False

    # The original code read `result = unknownparts = []`, a chained
    # assignment that cleared the list and made the test always fail;
    # the intended check is whether the list of unknown parts is empty.
    unknownparts = [part for part in nounlemma.split(compoundsep) if not known_word(part)]
    if unknownparts:
        settings.LOGGER.error(f'Unknown words ({comma.join(unknownparts)}) found in {fullstr} parse')
        return False
    return True

15 changes: 8 additions & 7 deletions src/sastadev/basicreplacements.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def combine(strlist: List[str]) -> str:
#: .. autodata:: sastadev.basicreplacements::innureplacements
#: :no-value:
#:
basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, dp),
basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, -2 * dp),
('isse', 'is', pron, infpron, addschwa, mp(10)),
('ooke', 'ook', pron, infpron, addschwa, dp),
('t', "'t", orth, spellerr, apomiss, dp),
Expand Down Expand Up @@ -234,7 +234,7 @@ def combine(strlist: List[str]) -> str:
('əs', 'eens', pron, infpron, reduction, dp),
('moetə', 'moeten', pron, infpron, infpron, dp),
('moetə' , 'moet', pron, infpron, infpron, dp),
('pot', 'kapot', pron, infpron, sylldrop, dp),
('pot', 'kapot', pron, infpron, sylldrop, -2 * dp),
('almaal', 'allemaal', pron, infpron, sylldrop, dp),
('knorrens', 'varkens', lexical, substitution, onom,dp),
('potte', 'kapot', pron, infpron, combine([sylldrop, emphasis]), dp),
Expand Down Expand Up @@ -325,8 +325,8 @@ def combine(strlist: List[str]) -> str:
('as-t-ie', ['als', 'ie'], pron, infpron, t_ie, dp),
("dit's", ["dit", "is"], pron, infpron, contract, dp),
("dat's", ["dat", "is"], pron, infpron, contract, dp),
("datte", ['dat', 'ie'], pron, infpron, contract, mp(120)),
("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(120)),
("datte", ['dat', 'ie'], pron, infpron, contract, mp(220)),
("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(220)),
("isda", ['is', 'dat'], pron, infpron, contract, dp + 2),
("tisda", ['het', 'is', 'dat'], pron, infpron, contract, mp(120)),
("'savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
Expand Down Expand Up @@ -446,7 +446,8 @@ def getdisambiguationdict() -> Dict[str, Tuple[TokenTreePredicate, str]]:
disambiguationdict[w] = cond, repl
return disambiguationdict

parsereplacementslist = [('smarties', 'toffees', alpino_unknown_word, -2*dp),
('Smarties', 'toffees', alpino_unknown_word, -2*dp)]
# next replaced by wordsunknowntoalpino lexicon
#parsereplacementslist = [('smarties', 'toffees', alpino_unknown_word, -2*dp),
# ('Smarties', 'toffees', alpino_unknown_word, -2*dp)]

parsereplacements = {el[0]:el for el in parsereplacementslist}
# parsereplacements = {el[0]:el for el in parsereplacementslist}
1 change: 1 addition & 0 deletions src/sastadev/childesspellingcorrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def adult_correctspelling(word: str, correctionsdict,max = None, threshold=okthr

def tryme():
words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal' , ' beete' , 'kamm', 'daaistoel', 'oelen', 'tein']
words = ['poppe']
for word in words:
result = children_correctspelling(word, children_correctionsdict, max=5)
print(f'{word}: {result}' )
Expand Down
1 change: 1 addition & 0 deletions src/sastadev/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
analysissuffix = '_analysis'
analysistsvsuffix = '_analysis.tsv'
bronzesuffix = '_bronze'
byuttscoressuffix = '_scoresbyutt'
silversuffix = '_silver'
correctionrefsuffix = '_correctionref'
permprefix = 'perm_'
Expand Down
Loading

0 comments on commit 8b954c7

Please sign in to comment.