update van History, kleine update STAP vnw

UUDigitalHumanitieslab · Jul 25, 2024 · e6ed139 · e6ed139
1 parent 1ed94ea
commit e6ed139
Show file tree

Hide file tree

Showing 11 changed files with 893 additions and 871 deletions.
diff --git a/src/sastadev/CHAT_Annotation.py b/src/sastadev/CHAT_Annotation.py
@@ -586,43 +586,44 @@ def simplemetafunction(f):
     return lambda ann, pos, w: Meta(ann.name, [f(w)],
                                     annotatedposlist=[pos],
                                     annotatedwordlist=[w],
-                                    source=CHAT)
+                                    source=CHAT,
+                                    penalty= 0)
 
 
 def noncompletionmetafunction(ann, annotationword, annotationpos, annotatedword):
     return Meta(ann.name, annotationwordlist=[annotationword], annotationposlist=[annotationpos],
                 annotatedwordlist=[annotatedword], annotatedposlist=[
                     annotationpos], value=annotationword,
-                source=CHAT, backplacement=bpl_replacement)
+                source=CHAT, backplacement=bpl_replacement, penalty=0)
 
 
 def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
                                                                    annotatedwordlist=[
                                                                        w], source=CHAT,
-                                                                   backplacement=bpl_delete)
+                                                                   backplacement=bpl_delete, penalty=0)
 
 
 def simplescopedmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist):
     return Meta(ann.name, annotationwordlist,
                 annotationposlist=annotationposlist, annotatedposlist=annotatedposlist,
-                annotatedwordlist=annotatedwordlist, source=CHAT)
+                annotatedwordlist=annotatedwordlist, source=CHAT, penalty=0)
 
 
 def complexmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist):
     return Meta(ann.name, annotationwordlist,
                 annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
-                annotatedposlist=annotatedposlist, source=CHAT)
+                annotatedposlist=annotatedposlist, source=CHAT, penalty=0)
 
 
 def complexmetafunction_replbpl(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
     Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
-         annotatedposlist=annotatedposlist, source=CHAT, backplacement=bpl_replacement)
+         annotatedposlist=annotatedposlist, source=CHAT, backplacement=bpl_replacement, penalty=0)
 
 
 def charmetafunction(ann, annotationcharlist, annotatedcharlist, annotationcharposlist, annotatedcharposlist):
     return Meta(ann.name, annotationcharlist, annotationcharlist=annotationcharlist,
                 annotatedcharlist=annotatedcharlist,
-                annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist)
+                annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist, penalty=0)
 
 
 def epsf(w):

diff --git a/src/sastadev/SAFreader.py b/src/sastadev/SAFreader.py
@@ -35,7 +35,7 @@
 
 # @@next must be made dependent on the method
 literallevels = ['literal', 'lemma']
-commentslevels = ['comments', 'commentaar', 'opmerkingen']
+commentslevels = ['comments', 'commentaar', 'opmerkingen', 'qa']
 
 semicolon = ';'
 labelsep = semicolon

diff --git a/src/sastadev/__main__.py b/src/sastadev/__main__.py
@@ -1133,7 +1133,7 @@ def main():
         # add xsid to trees that should have one but do not
         treebank2 = tb_addxsid(treebank1, targets)
 
-        treebank, errordict, allorandalts = correcttreebank(treebank2, targets, methodname, corr)
+        treebank, errordict, allorandalts = correcttreebank(treebank2, targets, methodname, options.infilename, corr)
 
     allresults, samplesizetuple = sastacore(
         origtreebank, treebank, annotatedfileresults, scp)

diff --git a/src/sastadev/basicreplacements.py b/src/sastadev/basicreplacements.py
@@ -2,7 +2,7 @@
 from typing import Dict, List, Tuple
 
 from sastadev.deregularise import correctinflection
-from sastadev.metadata import bpl_word, defaultpenalty
+from sastadev.metadata import bpl_word, defaultpenalty, modifypenalty as mp
 from sastadev.sastatoken import Token
 from sastadev.sastatypes import ReplacementMode, SynTree, TokenTreePredicate
 from sastadev.treebankfunctions import find1
@@ -12,9 +12,9 @@
 KnownReplacement = Tuple[str, str, str, str, str, ReplacementMode]
 
 dp = defaultpenalty
-dp6 = dp + 6
-dp3 = dp + 3
-dp2 = dp + 2
+dp6 = mp(160)  # dp + 6
+dp3 = mp(130)  # dp + 3
+dp2 = mp(120)  # dp + 2
 pron = 'Pronunciation'
 orth = 'Orthography'
 infpron = 'Informal Pronunciation'
@@ -128,7 +128,7 @@
 #:      :no-value:
 #:
 basicreplacementlist: List[BasicReplacement] = [('as', 'als', pron, infpron, codared, dp),
-                                                ('isse', 'is', pron, infpron, addschwa, 1),
+                                                ('isse', 'is', pron, infpron, addschwa, mp(10)),
                                                 ('ooke', 'ook', pron, infpron, addschwa, dp),
                                                 ('t', "'t", orth, spellerr, apomiss, dp),
                                                 ('effjes', 'eventjes', pron, infpron, varpron, dp),
@@ -155,13 +155,13 @@
                                                 ('wiw', 'wil', pron, wrongpron, phonrepl.format(wrong='w', correct='l'), dp),
                                                 ('annug', 'ander', pron, wrongpron, phonrepl.format(wrong='nug', correct='der'), dp),
                                                 ('nohug', 'nodig', pron, wrongpron, phonrepl.format(wrong='hu', correct='di'), dp),
-                                                ('magge', 'mogen', morph, wronginfl, '{} & {}'.format(overgen, infpron), dp-5),
+                                                ('magge', 'mogen', morph, wronginfl, '{} & {}'.format(overgen, infpron), mp(50)),
                                                 ('magge', 'mag', pron, infpron, emphasis, dp),
                                                 ('maggen', 'mogen', morph, wronginfl, overgen, dp),
                                                 ('aleen', 'alleen', orth, typo, typorepl.format(wrong='aleen', correct='alleen'), dp),
                                                 ('heef', 'heeft', pron, infpron, codared, dp),
                                                 ('saan', 'staan', pron, wrongpron, onsetred, dp),
-                                                ('saan', 'gaan', pron, wrongpron, wrongpron, dp + 2),
+                                                ('saan', 'gaan', pron, wrongpron, wrongpron, mp(120)),
                                                 ('jerke', 'werken', pron, wrongpron, wrongpron, dp),
                                                 ('taan', 'staan', pron, wrongpron, onsetred, dp),
                                                 ("a'maal", 'allemaal', pron, infpron, redpron, dp),
@@ -207,14 +207,14 @@
                                                 ('naartoe', 'ernaartoe', pron, infpron, erdrop, dp),
                                                 ('goe', 'goed', pron, infpron, codared, dp),
                                                 ('geten', 'gegeten', morph, infpron, prefixdrop, dp),
-                                                ('geten', 'vergeten', morph, infpron, prefixdrop, dp + 2),
+                                                ('geten', 'vergeten', morph, infpron, prefixdrop, mp(120)),
                                                 ('cirtus', 'circus', pron, wrongpron, typorepl.format(wrong='t', correct='c'), dp),
                                                 ('ken', 'kan', pron, infpron, dial, dp),
                                                 ('an', 'aan', pron, infpron, vowellaxing, dp),
                                                 ('an', 'kan', pron, infpron, onsetred, dp),
                                                 ('hoeve', 'hoef', pron, infpron, emphasis, dp),
-                                                ('hoeve', 'hoeft', pron, infpron, emphasis, dp+2),
-                                                ('hebbe', 'heb', pron, infpron, emphasis, dp+2),
+                                                ('hoeve', 'hoeft', pron, infpron, emphasis, mp(120)),
+                                                ('hebbe', 'heb', pron, infpron, emphasis, mp(120)),
                                                 ('pot', 'kapot', pron, infpron, sylldrop, dp),
                                                 ('kane', 'andere', pron, wrongpron, wrongpron, dp)
                                                 ] + \
@@ -293,16 +293,16 @@
      ('as-t-ie', ['als', 'ie'], pron, infpron, t_ie, dp),
      ("dit's", ["dit", "is"], pron, infpron, contract, dp),
      ("dat's", ["dat", "is"], pron, infpron, contract, dp),
-     ("datte", ['dat', 'ie'], pron, infpron, contract, dp + 2),
-     ("omdatte", ['omdat', 'ie'], pron, infpron, contract, dp + 2),
+     ("datte", ['dat', 'ie'], pron, infpron, contract, mp(120)),
+     ("omdatte", ['omdat', 'ie'], pron, infpron, contract, mp(120)),
      ("isda", ['is', 'dat'], pron, infpron, contract, dp + 2),
-     ("tisda", ['het',  'is', 'dat'], pron, infpron, contract, dp + 2),
-     ("'savonds", ["'s", 'avonds'], pron, infpron, typo, dp + 2),
-     ("savonds", ["'s", 'avonds'], pron, infpron, typo, dp + 2),
-     ("jamaar", ['ja', 'maar'], pron, infpron, typo, dp + 2),
-     ("jahoor", ['ja', 'hoor'], pron, infpron, typo, dp + 2),
-     ("neehoor", ['nee', 'hoor'], pron, infpron, typo, dp + 2),
-     ("kanne", ['kan', 'er'], pron, infpron, codared, dp + 2),
+     ("tisda", ['het',  'is', 'dat'], pron, infpron, contract, mp(120)),
+     ("'savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
+     ("savonds", ["'s", 'avonds'], pron, infpron, typo, mp(120)),
+     ("jamaar", ['ja', 'maar'], pron, infpron, typo, mp(120)),
+     ("jahoor", ['ja', 'hoor'], pron, infpron, typo, mp(120)),
+     ("neehoor", ['nee', 'hoor'], pron, infpron, typo, mp(120)),
+     ("kanne", ['kan', 'er'], pron, infpron, codared, mp(120)),
      ("moek", ['moet', "'k"], pron, infpron, contract, dp)
 
      ]

diff --git a/src/sastadev/corrector.py b/src/sastadev/corrector.py
@@ -29,7 +29,7 @@
 from sastadev.metadata import (Meta, bpl_word_delprec, bpl_indeze, bpl_node, bpl_none, bpl_word,
                                bpl_wordlemma, defaultbackplacement,
                                defaultpenalty, filled_pause, fstoken, intj,
-                               janeenou, longrep, mkSASTAMeta, repeated,
+                               janeenou, longrep, mkSASTAMeta, modifypenalty as mp, repeated,
                                repeatedjaneenou, repeatedseqtoken, shortrep,
                                substringrep, unknownsymbol)
 from sastadev.sasta_explanation import explanationasreplacement
@@ -1110,7 +1110,7 @@ def getalternativetokenmds(tokenmd: TokenMD, method: MethodName, tokens: List[To
         newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
                                         name='Wrong pronunciation', value='Final n drop', cat='Pronunciation',
                                         subcat='Coda reduction',
-                                        backplacement=bpl_word, penalty=.5*defaultpenalty)
+                                        backplacement=bpl_word, penalty=mp(50))
 
 
 

diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from copy import copy, deepcopy
+import os
 from typing import Dict, List, Optional, Set, Tuple
 
 from lxml import etree
@@ -9,8 +10,8 @@
 from sastadev.conf import settings
 from sastadev.corrector import (Correction, disambiguationdict, getcorrections,
                                 mkuttwithskips)
-from sastadev.history import (gathercorrections, mergecorrections, putcorrections,
-                              samplecorrections, samplecorrectionsfullname)
+from sastadev.history import (donefiles, donefilesfullname, gathercorrections, mergecorrections, putcorrections,
+                              putdonefilenames, samplecorrections, samplecorrectionsfullname)
 from sastadev.lexicon import de, dets, known_word, nochildwords
 from sastadev.macros import expandmacros
 from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node,
@@ -293,13 +294,14 @@ def updateerrordict(errordict: ErrorDict, uttid: UttId, oldtree: SynTree, newtre
     return errordict
 
 
-def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, corr: CorrectionMode = corrn) \
-        -> Tuple[Treebank, ErrorDict, List[Optional[OrigandAlts]]]:
+def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, treebankfullname,
+                    corr: CorrectionMode = corrn )   -> Tuple[Treebank, ErrorDict, List[Optional[OrigandAlts]]]:
     '''
     The function *correcttreebank* takes as input:
 
     * treebank: the treebank of the sample, parsed as is.
     * targets: a specification of the utterances that have to be analysed
+    * treebankfullname: name of the file that contains the treebank
     * method: the method to be used. Some corrections are method-specific
     * corr: indicates how the corrections should be done: no corrections at all; all corrections are considered but the last one (usually the one with most adaptations) is selected; or all corrections are considered but the best one according to the evaluation criterion is selected.
 
@@ -311,12 +313,19 @@ def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, co
     * a list of all original utterances and all alternatives that have been considered
 
     '''
-    thissamplecorrections = gathercorrections(treebank)
+
+
     allorandalts: List[Optional[OrigandAlts]] = []
     errordict: ErrorDict = defaultdict(list)
     if corr == corr0:
         return treebank, errordict, allorandalts
     else:
+        reducedtreebankfullname = os.path.relpath(treebankfullname, start=settings.DATAROOT)
+        if reducedtreebankfullname not in donefiles:
+            thissamplecorrections = gathercorrections(treebank)
+        else:
+            thissamplecorrections = {}
+
         newtreebank: Treebank = etree.Element('treebank')
         # errorlogrows = []
         for stree in treebank:
@@ -340,6 +349,8 @@ def correcttreebank(treebank: Treebank, targets: Targets, method: MethodName, co
         # merge the corrections from this sample with the samplecorrections and update the file
         mergedsamplecorrections = mergecorrections(samplecorrections, thissamplecorrections)
         putcorrections(mergedsamplecorrections, samplecorrectionsfullname)
+        donefiles.add(reducedtreebankfullname)
+        putdonefilenames(donefiles, donefilesfullname)
 
         return newtreebank, errordict, allorandalts