Skip to content

Commit

Permalink
adult v. children spelling correction
Browse files Browse the repository at this point in the history
  • Loading branch information
JanOdijk committed Sep 20, 2024
1 parent f25faa1 commit 50a615a
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 19 deletions.
12 changes: 11 additions & 1 deletion src/sastadev/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@
from sastadev.external_functions import str2functionmap
from sastadev.goldcountreader import get_goldcounts
from sastadev.history import (donefiles, donefilesfullname, gathercorrections, mergecorrections, putcorrections,
putdonefilenames, samplecorrections, samplecorrectionsfullname)
putdonefilenames, children_samplecorrections, children_samplecorrectionsfullname,
adult_samplecorrections, adult_samplecorrectionsfullname)
from sastadev.macros import expandmacros
from sastadev.methods import Method, supported_methods, treatmethod
from sastadev.mismatches import exactmismatches, literalmissedmatches
Expand Down Expand Up @@ -1175,6 +1176,15 @@ def main():

thissamplecorrections = {}
if options.dohistory:
if options.methodname.lower() in {'tarsp', 'stap'}:
samplecorrectionsfullname = children_samplecorrectionsfullname
samplecorrections = children_samplecorrections
elif options.methodname.lower() in {'asta'}:
samplecorrectionsfullname = adult_samplecorrectionsfullname
samplecorrections = adult_samplecorrections
else: # should not occur
settings.LOGGER.error(f'Illegal method name used: {options.method}')
exit(-1)
if corr != corr0:
reducedtreebankfullname = os.path.relpath(options.infilename, start=settings.DATAROOT)
if reducedtreebankfullname not in donefiles:
Expand Down
2 changes: 1 addition & 1 deletion src/sastadev/astaforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,6 @@ def getlemmafreqs(allresults, lexicalreskey) -> Dict[str, int]:
if qid == lemmaqid:
lemma = reskey[1]
for position in allresults.exactresults[reskey]:
if position in allresults.exactresults[lexicalreskey]:
if lexicalreskey in allresults.exactresults and position in allresults.exactresults[lexicalreskey]:
dict[lemma] += 1
return dict
52 changes: 44 additions & 8 deletions src/sastadev/childesspellingcorrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,8 @@ def getchildesfrq() -> Tuple[FrqDict, FrqDict, FrqDict]:

# function to read the stored corrections into a dictionary

def getstoredcorrections() -> Dict[str, List[Tuple[str, int]]]:
def getstoredcorrections(correctionsfullname) -> Dict[str, List[Tuple[str, int]]]:
correctionsdict = {}
correctionsfilename = 'storedcorrections.txt'
correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections')
correctionsfullname = os.path.join(correctionspath, correctionsfilename)

idata = readcsv(correctionsfullname)
for i, row in idata:
Expand All @@ -70,7 +67,7 @@ def getpenalty(score, total):

# a function for spelling correction

def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[str, int]]:
def children_correctspelling(word: str, correctionsdict, max = None, threshold=okthreshold) -> List[Tuple[str, int]]:
if word in correctionsdict:
return correctionsdict[word]
else:
Expand Down Expand Up @@ -115,12 +112,40 @@ def correctspelling(word: str, max = None, threshold=okthreshold) -> List[Tuple[

return result


def adult_correctspelling(word: str, correctionsdict, max=None, threshold=okthreshold) -> List[Tuple[str, int]]:
    """Return candidate corrections for *word*, for adult samples.

    :param word: the word to find corrections for
    :param correctionsdict: stored corrections; when *word* is a key, its
        stored value is returned unchanged (and *max* is not applied)
    :param max: when not None, keep at most this many candidates
    :param threshold: accepted for signature parity with
        children_correctspelling; not used in this function
    :return: list of (correction, penalty) tuples, ordered by descending
        frequency as reported by spell.word_usage_frequency
    """
    # Stored corrections take precedence over anything the spell checker offers.
    if word in correctionsdict:
        return correctionsdict[word]

    # NOTE(review): `spell` is presumably a pyspellchecker SpellChecker instance
    # defined at module level; candidates() may return None -- confirm.
    candidates = spell.candidates(word)
    if candidates is None:
        scored = []
    else:
        scored = [(cand, spell.word_usage_frequency(cand)) for cand in candidates]

    # Most frequent candidates first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    totalfrq = sum(frq for _, frq in scored)

    # Each candidate's frequency is turned into a penalty relative to the total.
    ranked = [(cand, getpenalty(frq, totalfrq)) for cand, frq in scored]

    # NOTE: the computed result is not stored back into correctionsdict or
    # written to file here.
    return ranked if max is None else ranked[:max]


def tryme():
    """Print up to five spelling corrections for a few sample misspellings.

    The first word list is run through children_correctspelling with the
    children's stored corrections; the second list is run through
    adult_correctspelling with the adult stored corrections.
    """
    children_words = ['kantie', 'opbijten', 'oprijten', 'opgereten', 'peelkaal', ' beete', 'kamm',
                      'daaistoel', 'oelen', 'tein']
    for word in children_words:
        result = children_correctspelling(word, children_correctionsdict, max=5)
        print(f'{word}: {result}')

    adult_words = ['motariek', 'silase']
    for word in adult_words:
        result = adult_correctspelling(word, adult_correctionsdict, max=5)
        print(f'{word}: {result}')



Expand All @@ -129,8 +154,19 @@ def tryme():
# Read the CHILDES frequency dictionaries: for targets, for others, and for both combined.
trgfrqdict, othfrqdict, allfrqdict = getchildesfrq()

# Both stored-corrections files live in the same directory, so compute it once.
correctionspath = os.path.join(settings.SD_DIR, 'data/storedcorrections')

# Read the stored corrections for children into a dictionary.
children_correctionsfilename = 'children_storedcorrections.txt'
children_correctionsfullname = os.path.join(correctionspath, children_correctionsfilename)
children_correctionsdict = getstoredcorrections(children_correctionsfullname)

# Read the stored corrections for adults into a dictionary.
adult_correctionsfilename = 'adult_storedcorrections.txt'
adult_correctionsfullname = os.path.join(correctionspath, adult_correctionsfilename)
adult_correctionsdict = getstoredcorrections(adult_correctionsfullname)



if __name__ == '__main__':
tryme()
19 changes: 14 additions & 5 deletions src/sastadev/corrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from sastadev.alpino import getdehetwordinfo
from sastadev.basicreplacements import (basicexpansions, basicreplacementpairs, basicreplacements,
getdisambiguationdict, parsereplacements)
from sastadev.childesspellingcorrector import correctspelling, allfrqdict
from sastadev.childesspellingcorrector import (adult_correctionsdict, adult_correctspelling,
children_correctionsdict, children_correctspelling, allfrqdict)
from sastadev.correctionparameters import CorrectionParameters
from sastadev.cleanCHILDEStokens import cleantokens
from sastadev.conf import settings
Expand All @@ -22,7 +23,8 @@
from sastadev.find_ngram import (Ngram, findmatches, ngram1, ngram2, ngram7,
ngram10, ngram11, ngram16, ngram17)
from sastadev.history import (childescorrections, childescorrectionsexceptions, mergecorrections, putcorrections,
samplecorrections, samplecorrectionsfullname)
children_samplecorrections, children_samplecorrectionsfullname,
adult_samplecorrections, adult_samplecorrectionsfullname)
from sastadev.iedims import getjeforms
from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het,
informlexicon, isa_namepart, isa_inf, isa_vd, known_word,
Expand Down Expand Up @@ -1264,7 +1266,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
not known_word(token.word) and \
token.word in correctionparameters.allsamplecorrections and \
token.word not in childescorrectionsexceptions:
cc = samplecorrections[token.word]
cc = correctionparameters.allsamplecorrections[token.word]
sumfrq = sum([hc.frequency for hc in cc])
for hc in cc:
relfrq = hc.frequency / sumfrq
Expand Down Expand Up @@ -1453,10 +1455,17 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int


if correctionparameters.options.dospellingcorrection and \
correctionparameters.method in {'tarsp', 'stap'} and not known_word(token.word) and applyspellingcorrectionisok(token.word) and \
not known_word(token.word) and applyspellingcorrectionisok(token.word) and \
not schwandropfound and not postviefound and not token.word[0].isupper() and not deduplicated and \
not(token.word.endswith('ie') or token.word.endswith('ies')) and token.word[-3:] not in vvs:
corrtuples = correctspelling(token.word, max=5)
if correctionparameters.method in {'tarsp', 'stap'}:
corrtuples = children_correctspelling(token.word, children_correctionsdict, max=5)
elif correctionparameters.method in {'asta'}:
corrtuples = []
# put off because it causes a lot of errors: the X-words should all have been removed
# corrtuples = adult_correctspelling(token.word, adult_correctionsdict, max=5)
else:
corrtuples = []
for corr, penalty in corrtuples:
if corr != token.word and known_word(corr):
newtokenmds = updatenewtokenmds(newtokenmds, token, [corr], beginmetadata,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ Sesamstraat
koffie
Smarties
omie
m'n
z'n
1 change: 1 addition & 0 deletions src/sastadev/data/filledpauseslexicon/vuwordslexicon.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
goed 100,
ha 100,
he 120
hè 120
hoor 003
Expand Down
6 changes: 4 additions & 2 deletions src/sastadev/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
childescorrectionspath = os.path.join(settings.SD_DIR, 'data', 'childescorrections')

childescorrectionsfullname = os.path.join(childescorrectionspath, 'childescorrections.txt')
samplecorrectionsfullname = os.path.join(childescorrectionspath, 'samplecorrections.txt')
children_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'children_samplecorrections.txt')
adult_samplecorrectionsfullname = os.path.join(childescorrectionspath, 'adult_samplecorrections.txt')
donefilesfullname = os.path.join(childescorrectionspath, 'donefiles.txt')

@dataclass
Expand Down Expand Up @@ -142,7 +143,8 @@ def mergecorrections(corrections1: HistoryCorrectionDict, corrections2: HistoryC
[tpl[0] for tpl in innereplacements] + \
[tpl[0] for tpl in innureplacements]

samplecorrections = getcorrections(samplecorrectionsfullname)
children_samplecorrections = getcorrections(children_samplecorrectionsfullname)
adult_samplecorrections = getcorrections(adult_samplecorrectionsfullname)
donefiles = getdonefilenames(donefilesfullname)

junk = 0
2 changes: 1 addition & 1 deletion src/sastadev/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def astalemmafilter(query: Query, xrs: ExactResultsDict, xr: ExactResult) -> boo
for (qid, val) in xrs:
if qid == lemmaqid:
if xr in xrs[(qid, val)]:
result1 = xr in xrs[lexreskey] or xr in xrs[nreskey]
result1 = (lexreskey in xrs and xr in xrs[lexreskey]) or (nreskey in xrs and xr in xrs[nreskey])
result = query.process == pre_process or result1
return result

Expand Down
3 changes: 2 additions & 1 deletion src/sastadev/smartcompoundcomparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def main():
'sinterklaas_paardje', True),
('kippes', 'kippies', 'kip_pies', True),
('diehoek', 'driehoek', 'drie_hoek', True),
('jantauto', 'brandweerauto', 'brandweer_auto', True)
('jantauto', 'brandweerauto', 'brandweer_auto', True),
('koekklok', 'koekoeksklok', 'koekoek_klok', True)

]
# testlist = [('risstengeltjes', 'rietstengeltjes', 'riet_stengel', True)]
Expand Down

0 comments on commit 50a615a

Please sign in to comment.