Skip to content

Commit

Permalink
update van History, kleine update STAP vnw
Browse files Browse the repository at this point in the history
  • Loading branch information
JanOdijk committed Jul 25, 2024
1 parent 1ed94ea commit e6ed139
Show file tree
Hide file tree
Showing 11 changed files with 893 additions and 871 deletions.
15 changes: 8 additions & 7 deletions src/sastadev/CHAT_Annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,43 +586,44 @@ def simplemetafunction(f):
return lambda ann, pos, w: Meta(ann.name, [f(w)],
annotatedposlist=[pos],
annotatedwordlist=[w],
source=CHAT)
source=CHAT,
penalty= 0)


def noncompletionmetafunction(ann, annotationword, annotationpos, annotatedword):
return Meta(ann.name, annotationwordlist=[annotationword], annotationposlist=[annotationpos],
annotatedwordlist=[annotatedword], annotatedposlist=[
annotationpos], value=annotationword,
source=CHAT, backplacement=bpl_replacement)
source=CHAT, backplacement=bpl_replacement, penalty=0)


def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
annotatedwordlist=[
w], source=CHAT,
backplacement=bpl_delete)
backplacement=bpl_delete, penalty=0)


def simplescopedmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist):
return Meta(ann.name, annotationwordlist,
annotationposlist=annotationposlist, annotatedposlist=annotatedposlist,
annotatedwordlist=annotatedwordlist, source=CHAT)
annotatedwordlist=annotatedwordlist, source=CHAT, penalty=0)


def complexmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist):
return Meta(ann.name, annotationwordlist,
annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
annotatedposlist=annotatedposlist, source=CHAT)
annotatedposlist=annotatedposlist, source=CHAT, penalty=0)


def complexmetafunction_replbpl(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
annotatedposlist=annotatedposlist, source=CHAT, backplacement=bpl_replacement)
annotatedposlist=annotatedposlist, source=CHAT, backplacement=bpl_replacement, penalty=0)


def charmetafunction(ann, annotationcharlist, annotatedcharlist, annotationcharposlist, annotatedcharposlist):
return Meta(ann.name, annotationcharlist, annotationcharlist=annotationcharlist,
annotatedcharlist=annotatedcharlist,
annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist)
annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist, penalty=0)


def epsf(w):
Expand Down
2 changes: 1 addition & 1 deletion src/sastadev/SAFreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

# @@next must be made dependent on the method
literallevels = ['literal', 'lemma']
commentslevels = ['comments', 'commentaar', 'opmerkingen']
commentslevels = ['comments', 'commentaar', 'opmerkingen', 'qa']

semicolon = ';'
labelsep = semicolon
Expand Down
2 changes: 1 addition & 1 deletion src/sastadev/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,7 +1133,7 @@ def main():
# add xsid to trees that should have one but do not
treebank2 = tb_addxsid(treebank1, targets)

treebank, errordict, allorandalts = correcttreebank(treebank2, targets, methodname, corr)
treebank, errordict, allorandalts = correcttreebank(treebank2, targets, methodname, options.infilename, corr)

allresults, samplesizetuple = sastacore(
origtreebank, treebank, annotatedfileresults, scp)
Expand Down
38 changes: 19 additions & 19 deletions src/sastadev/basicreplacements.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, List, Tuple

from sastadev.deregularise import correctinflection
from sastadev.metadata import bpl_word, defaultpenalty
from sastadev.metadata import bpl_word, defaultpenalty, modifypenalty as mp
from sastadev.sastatoken import Token
from sastadev.sastatypes import ReplacementMode, SynTree, TokenTreePredicate
from sastadev.treebankfunctions import find1
Expand All @@ -12,9 +12,9 @@
KnownReplacement = Tuple[str, str, str, str, str, ReplacementMode]

dp = defaultpenalty
dp6 = dp + 6
dp3 = dp + 3
dp2 = dp + 2
dp6 = mp(160) # dp + 6
dp3 = mp(130) # dp + 3
dp2 = mp(120) # dp + 2
pron = 'Pronunciation'
orth = 'Orthography'
infpron = 'Informal Pronunciation'
Expand Down Expand Up @@ -128,7 +128,7 @@
#: :no-value:
#:
basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, dp),
('isse', 'is', pron, infpron, addschwa, 1),
('isse', 'is', pron, infpron, addschwa, mp(10)),
('ooke', 'ook', pron, infpron, addschwa, dp),
('t', "'t", orth, spellerr, apomiss, dp),
('effjes', 'eventjes', pron, infpron, varpron, dp),
Expand All @@ -155,13 +155,13 @@
('wiw', 'wil', pron, wrongpron, phonrepl.format(wrong='w', correct='l'), dp),
('annug', 'ander', pron, wrongpron, phonrepl.format(wrong='nug', correct='der'), dp),
('nohug', 'nodig', pron, wrongpron, phonrepl.format(wrong='hu', correct='di'), dp),
('magge', 'mogen', morph, wronginfl, '{} & {}'.format(overgen, infpron), dp-5),
('magge', 'mogen', morph, wronginfl, '{} & {}'.format(overgen, infpron), mp(50)),
('magge', 'mag', pron, infpron, emphasis, dp),
('maggen', 'mogen', morph, wronginfl, overgen, dp),
('aleen', 'alleen', orth, typo, typorepl.format(wrong='aleen', correct='alleen'), dp),
('heef', 'heeft', pron, infpron, codared, dp),
('saan', 'staan', pron, wrongpron, onsetred, dp),
('saan', 'gaan', pron, wrongpron, wrongpron, dp + 2),
('saan', 'gaan', pron, wrongpron, wrongpron, mp(120)),
('jerke', 'werken', pron, wrongpron, wrongpron, dp),
('taan', 'staan', pron, wrongpron, onsetred, dp),
("a'maal", 'allemaal', pron, infpron, redpron, dp),
Expand Down Expand Up @@ -207,14 +207,14 @@
('naartoe', 'ernaartoe', pron, infpron, erdrop, dp),
('goe', 'goed', pron, infpron, codared, dp),
('geten', 'gegeten', morph, infpron, prefixdrop, dp),
('geten', 'vergeten', morph, infpron, prefixdrop, dp + 2),
('geten', 'vergeten', morph, infpron, prefixdrop, mp(120)),
('cirtus', 'circus', pron, wrongpron, typorepl.format(wrong='t', correct='c'), dp),
('ken', 'kan', pron, infpron, dial, dp),
('an', 'aan', pron, infpron, vowellaxing, dp),
('an', 'kan', pron, infpron, onsetred, dp),
('hoeve', 'hoef', pron, infpron, emphasis, dp),
('hoeve', 'hoeft', pron, infpron, emphasis, dp+2),
('hebbe', 'heb', pron, infpron, emphasis, dp+2),
('hoeve', 'hoeft', pron, infpron, emphasis, mp(120)),
('hebbe', 'heb', pron, infpron, emphasis, mp(120)),
('pot', 'kapot', pron, infpron, sylldrop, dp),
('kane', 'andere', pron, wrongpron, wrongpron, dp)
] + \
Expand Down Expand Up @@ -293,16 +293,16 @@
('as-t-ie', ['als', 'ie'], pron, infpron, t_ie, dp),
("dit's", ["dit", "is"], pron, infpron, contract, dp),
("dat's", ["dat", "is"], pron, infpron, contract, dp),
("datte", ['dat', 'ie'], pron, infpron, contract, dp + 2),
("omdatte", ['omdat', 'ie'], pron, infpron, contract, dp + 2),
("datte", ['dat', 'ie'], pron, infpron, contract, mp(120)),
("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(120)),
("isda", ['is', 'dat'], pron, infpron, contract, dp + 2),
("tisda", ['het', 'is', 'dat'], pron, infpron, contract, dp + 2),
("'savonds", ["'s", 'avonds'], pron, infpron, typo, dp + 2),
("savonds", ["'s", 'avonds'], pron, infpron, typo, dp + 2),
("jamaar", ['ja', 'maar'], pron, infpron, typo, dp + 2),
("jahoor", ['ja', 'hoor'], pron, infpron, typo, dp + 2),
("neehoor", ['nee', 'hoor'], pron, infpron, typo, dp + 2),
("kanne", ['kan', 'er'], pron, infpron, codared, dp + 2),
("tisda", ['het', 'is', 'dat'], pron, infpron, contract, mp(120)),
("'savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
("savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
("jamaar", ['ja', 'maar'], pron, infpron, typo, mp(120)),
("jahoor", ['ja', 'hoor'], pron, infpron, typo, mp(120)),
("neehoor", ['nee', 'hoor'], pron, infpron, typo, mp(120)),
("kanne", ['kan', 'er'], pron, infpron, codared, mp(120)),
("moek", ['moet', "'k"], pron, infpron, contract, dp)

]
Expand Down
4 changes: 2 additions & 2 deletions src/sastadev/corrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from sastadev.metadata import (Meta, bpl_word_delprec, bpl_indeze, bpl_node, bpl_none, bpl_word,
bpl_wordlemma, defaultbackplacement,
defaultpenalty, filled_pause, fstoken, intj,
janeenou, longrep, mkSASTAMeta, repeated,
janeenou, longrep, mkSASTAMeta, modifypenalty as mp, repeated,
repeatedjaneenou, repeatedseqtoken, shortrep,
substringrep, unknownsymbol)
from sastadev.sasta_explanation import explanationasreplacement
Expand Down Expand Up @@ -1110,7 +1110,7 @@ def getalternativetokenmds(tokenmd: TokenMD, method: MethodName, tokens: List[To
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Wrong pronunciation', value='Final n drop', cat='Pronunciation',
subcat='Coda reduction',
backplacement=bpl_word, penalty=.5*defaultpenalty)
backplacement=bpl_word, penalty=mp(50))



Expand Down
21 changes: 16 additions & 5 deletions src/sastadev/correcttreebank.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict
from copy import copy, deepcopy
import os
from typing import Dict, List, Optional, Set, Tuple

from lxml import etree
Expand All @@ -9,8 +10,8 @@
from sastadev.conf import settings
from sastadev.corrector import (Correction, disambiguationdict, getcorrections,
mkuttwithskips)
from sastadev.history import (gathercorrections, mergecorrections, putcorrections,
samplecorrections, samplecorrectionsfullname)
from sastadev.history import (donefiles, donefilesfullname, gathercorrections, mergecorrections, putcorrections,
putdonefilenames, samplecorrections, samplecorrectionsfullname)
from sastadev.lexicon import de, dets, known_word, nochildwords
from sastadev.macros import expandmacros
from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node,
Expand Down Expand Up @@ -293,13 +294,14 @@ def updateerrordict(errordict: ErrorDict, uttid: UttId, oldtree: SynTree, newtre
return errordict


def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, corr: CorrectionMode = corrn) \
-> Tuple[Treebank, ErrorDict, List[Optional[OrigandAlts]]]:
def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, treebankfullname,
corr: CorrectionMode = corrn ) -> Tuple[Treebank, ErrorDict, List[Optional[OrigandAlts]]]:
'''
The function *correcttreebank* takes as input:
* treebank: the treebank of the sample, parsed as is.
* targets: a specification of the utterances that have to be analysed
* treebankfullname: name (path) of the file that contains the treebank; its path relative to settings.DATAROOT is looked up in donefiles
* method: the method to be used. Some corrections are method-specific
* corr: indicates how the corrections should be done: no corrections at all; all corrections are considered but the last one (usually the one with the most adaptations) is selected; or all corrections are considered but the best one according to the evaluation criterion is selected.
Expand All @@ -311,12 +313,19 @@ def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, co
* a list of all original utterances and all alternatives that have been considered
'''
thissamplecorrections = gathercorrections(treebank)


allorandalts: List[Optional[OrigandAlts]] = []
errordict: ErrorDict = defaultdict(list)
if corr == corr0:
return treebank, errordict, allorandalts
else:
reducedtreebankfullname = os.path.relpath(treebankfullname, start=settings.DATAROOT)
if reducedtreebankfullname not in donefiles:
thissamplecorrections = gathercorrections(treebank)
else:
thissamplecorrections = {}

newtreebank: Treebank = etree.Element('treebank')
# errorlogrows = []
for stree in treebank:
Expand All @@ -340,6 +349,8 @@ def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, co
# merge the corrections from this sample with the samplecorrections and update the file
mergedsamplecorrections = mergecorrections(samplecorrections, thissamplecorrections)
putcorrections(mergedsamplecorrections, samplecorrectionsfullname)
donefiles.add(reducedtreebankfullname)
putdonefilenames(donefiles, donefilesfullname)

return newtreebank, errordict, allorandalts

Expand Down
Loading

0 comments on commit e6ed139

Please sign in to comment.