Commit b6015fb
compounds updates, sas updates
JanOdijk committed Jan 1, 2025
1 parent 69cb882 commit b6015fb
Showing 21 changed files with 371 additions and 80 deletions.
4 changes: 2 additions & 2 deletions src/sastadev/__main__.py
@@ -179,7 +179,7 @@
post_process, query_exists, query_inform)
from sastadev.readcsv import writecsv
from sastadev.readmethod import itemseppattern, read_method
from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader
from sastadev.resultsbyutterance import getscoresbyutt, mkscoresbyuttrows, byuttheader, silverf1col
from sastadev.sas_impact import getcomparisoncounts, mksas_impactrows, sas_impact
from sastadev.sastatypes import (AltCodeDict, ExactResultsDict, FileName,
GoldTuple, MatchesDict, MethodName, QId,
@@ -1284,7 +1284,7 @@ def main():
# silverscoresbyutt = getscoresbyutt(allresults.coreresults, silverscores)

byuttrows = mkscoresbyuttrows(allresults, goldscores, silverscores, themethod)
not100count = len([row for row in byuttrows if row[9] != 100])
not100count = len([row for row in byuttrows if row[silverf1col] != 100])
scoresbyuttoutfullname = os.path.join(resultspath, corefilename + byuttscoressuffix + '.xlsx')
wb = mkworkbook(scoresbyuttoutfullname, [byuttheader], byuttrows, freeze_panes=(1,0) )
allbyuttscores = sas_impact(allresults, silverscores, themethod)
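The change above replaces the hard-coded column index 9 with silverf1col, imported from resultsbyutterance, so the F1 check keeps working if the by-utterance row layout changes. A minimal sketch of the pattern; the header names below are assumptions, not the real byuttheader contents:

byuttheader = ['sample', 'uttid', 'goldscore', 'silver F1']
silverf1col = byuttheader.index('silver F1')  # named index instead of a magic 9

byuttrows = [['TD01', 1, 95, 100], ['TD01', 2, 88, 75]]
not100count = len([row for row in byuttrows if row[silverf1col] != 100])
print(not100count)  # 1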
2 changes: 2 additions & 0 deletions src/sastadev/basicreplacements.py
@@ -52,6 +52,7 @@
vzdevoicing = 'v/z devoicing'
missingapostrophe = apomiss
finalschwadrop = 'Final schwa dropped'
schwadrop = 'schwa dropped'
u4schwa = 'u used for the schwa sound'
inithdrop = 'Initial h drop'
finaldevoicing = 'Final devoicing'
@@ -254,6 +255,7 @@ def combine(strlist: List[str]) -> str:
('lus', 'lust', pron, infpron, codared, dp),
('mij', 'mijn', pron, infpron, codared, dp),
('drinken', 'voedsel', avoidambiguity, wwnambiguity, wwnambiguity, dp ),
('heelboel', 'heleboel', pron, infpron, schwadrop, dp),
('jou', 'jouw', pron, infpron, codared, -dp), # Td 22, 30 ik wil ook keer naar jou huis find criterion
# ('kijke', 'kijk', pron, infpron, emphasis, dp), # TD05, 32 moved to disambuguationdict
# ('geel', 'mooi', avoidambiguity, adjnambiguity, dp), #TD05, 24
9 changes: 6 additions & 3 deletions src/sastadev/cleanCHILDEStokens.py
@@ -167,9 +167,12 @@ def cleantext(utt: str, repkeep: bool, tokenoutput: bool = False, verbose=False)
resultwordlist = [t.word for t in newtokens]
resultstring = smartjoin(resultwordlist)
resultposlist = [t.pos for t in newtokens]
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none,
penalty=0)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation',
backplacement=bpl_none, penalty=0)
newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list',
source='CHAT/Tokenisation', backplacement=bpl_none, penalty=0)
#newmeta4 = Meta('cleantext', 'done')
metadata += [newmeta1, newmeta2, newmeta3]
resultmetadata = metadata
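The three tokenisation Meta entries above now pass an explicit penalty=0. Assuming candidate corrections are ranked by summing the penalties of their metadata, this keeps pure bookkeeping entries from counting against a candidate; a small illustration of that assumption:

meta_penalties = [0, 0, 0, 15]  # three tokenisation metas plus one real correction
total = sum(meta_penalties)     # the tokenisation bookkeeping contributes nothing
print(total)                    # 15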
35 changes: 32 additions & 3 deletions src/sastadev/compounds.py
@@ -1,7 +1,7 @@
'''
The module *compounds*:
* initialisez the compound dictionary *compounds*, which is a multidimensional Python dictionary
* initialises the compound dictionary *compounds*, which is a multidimensional Python dictionary
Dict[str, Dict[int, str]], which maps a string (for a lemma in CELEX orthography HeadDiaNew)
and a column number to the value of the cell with this column number in the CSV file from which it is derived:
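A small usage sketch of the structure this docstring describes; the lemma and cell values below are invented, the real ones come from the CELEX-derived CSV file:

from typing import Dict

# Dict[str, Dict[int, str]]: lemma (CELEX HeadDiaNew orthography) -> column number -> cell value
compounds: Dict[str, Dict[int, str]] = {
    'appelboom': {0: 'appelboom', 1: 'N+N', 2: 'appel+boom'},  # invented cell values
}

if 'appelboom' in compounds:
    print(compounds['appelboom'][2])  # the cell in column 2 for this lemma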
@@ -31,8 +31,13 @@
from collections import defaultdict
from typing import Dict, List

from sastadev.CHAT_Annotation import CHAT_wordnoncompletion, CHAT_replacement
from sastadev.conf import settings
from sastadev.correctionlabels import contextcorrection
from sastadev.sasta_explanation import explanationasreplacementname
from sastadev.sastatypes import SynTree
from sastadev.smartcompoundcomparison import issmartcompound
from sastadev.stringfunctions import string2list
from sastadev.treebankfunctions import getattval

underscore = "_"
@@ -51,6 +56,11 @@
dictfile = open(dictfilename, 'r', encoding='utf8')

getwordsxpath = ".//node[@pt]"
correctionsmetaxpath = f""".//xmeta[@name = "{explanationasreplacementname}" or
@name = "{CHAT_replacement}" or
@name = "{CHAT_wordnoncompletion}" or
@name = "{contextcorrection}"
]"""


def getcompounds(syntree: SynTree) -> List[SynTree]:
@@ -65,14 +75,33 @@ def getcompounds(syntree: SynTree) -> List[SynTree]:
'''
results = []
tlist = syntree.xpath(getwordsxpath)
corrections = syntree.xpath(correctionsmetaxpath)
for t in tlist:
w = getattval(t, 'word')
lemma = getattval(t, 'lemma')
pt = getattval(t, 'pt')
if pt == 'n' and iscompound(lemma):
results.append(t)
if pt == 'n':
if lemma in compounds:
results.append(t)
else:
correction = getcorrection(t, corrections)
if issmartcompound(w, correction, lemma):
results.append(t)
return results


def getcorrection(t: SynTree, corrections) -> str:
w = getattval(t, 'word')
position = getattval(t, 'begin')
for correction in corrections:
annotationposlist = string2list(correction.attrib["annotationposlist"])
annotationwordlist = string2list(correction.attrib["annotationwordlist"], quoteignore=True)
if annotationposlist == [position]:
result = annotationwordlist[0]
return result
return w


# I do not know how to type this, because the nesting can be arbitrarily deep


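The matching in getcorrection above compares the token's begin position with each correction's annotationposlist and applies a correction only when it covers exactly that one token. A self-contained restatement of that logic, with plain lists standing in for the parsed xmeta attributes (string2list does the parsing in sastadev itself):

from typing import List, Tuple

def getcorrection_sketch(word: str, position: str,
                         corrections: List[Tuple[List[str], List[str]]]) -> str:
    # corrections: (annotationposlist, annotationwordlist) pairs, already parsed
    for poslist, wordlist in corrections:
        if poslist == [position]:   # the correction covers exactly this token
            return wordlist[0]      # return the corrected form
    return word                     # no applicable correction: keep the original word

print(getcorrection_sketch('heelboel', '5', [(['5'], ['heleboel'])]))  # heleboel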
12 changes: 5 additions & 7 deletions src/sastadev/context.py
@@ -4,6 +4,7 @@
import os
from sastadev.conf import settings
from sastadev.constants import intreebanksfolder, outtreebanksfolder
from sastadev.datasets import infiguresdatasets
from sastadev.filefunctions import getbasename
from sastadev.lexicon import known_word
from sastadev.sastatypes import TreeBank, SynTree
@@ -147,14 +148,10 @@ def nottargetchild(stree: SynTree) -> bool:


def main():
# read auristrain DLD03 in as test treebank
# filename = 'DLD03.xml'
# filename = 'DLD11.xml'
dataset = 'auristrain'
table = []
datasets = ['auristrain', 'vkltarsp', 'vklstap', 'vklasta', 'vklstapfase2', 'vklastafase2', 'auristest']
datasets = infiguresdatasets
for dataset in datasets:
fullpath = os.path.join(settings.DATAROOT, dataset, outtreebanksfolder)
fullpath = os.path.join(settings.DATAROOT, dataset.name, outtreebanksfolder)
filenames = os.listdir(fullpath)
# filenames= ['TD21.xml']
for filename in filenames:
@@ -183,7 +180,8 @@
# print(f'Preceding context: {comma.join(prevbestwords)}')
postbestwords = findbestwords(wrongword, postcontext, lambda x: True)
# print(f'Post context: {comma.join(postbestwords)}')
row = [dataset, sample, wrongword, comma.join(prevbestwords), comma.join(postbestwords), origutt]
row = [dataset.name, sample, wrongword, comma.join(prevbestwords),
comma.join(postbestwords), origutt]
table.append(row)

header = ['dataset', 'sample', 'wrongword', 'prev', 'post', 'origutt']
4 changes: 4 additions & 0 deletions src/sastadev/correctionlabels.py
@@ -0,0 +1,4 @@


contextcorrection = 'Context Correction'
repetition = 'Repetition'
18 changes: 9 additions & 9 deletions src/sastadev/corrector.py
@@ -16,6 +16,7 @@
from sastadev.correctionparameters import CorrectionParameters
from sastadev.cleanCHILDEStokens import cleantokens
from sastadev.conf import settings
from sastadev.correctionlabels import contextcorrection, repetition
from sastadev.dedup import (cleanwordofnort, find_duplicates2,
find_janeenouduplicates, find_simpleduplicates,
find_substringduplicates2, getfilledpauses,
@@ -30,7 +31,7 @@
from sastadev.iedims import getjeforms
from sastadev.lexicon import (WordInfo, de, dets, getwordinfo, het,
informlexicon, isa_namepart, isa_inf, isa_vd, known_word, nochildword,
tswnouns, validword, vuwordslexicon, wordsunknowntoalpinolexicondict)
tswnouns, validnotalpinocompoundword, validword, vuwordslexicon, wordsunknowntoalpinolexicondict)
from sastadev.macros import expandmacros
from sastadev.metadata import (Meta, bpl_word_delprec, bpl_indeze, bpl_node, bpl_none, bpl_word,
bpl_wordlemma, defaultbackplacement,
@@ -68,7 +69,6 @@
asta = 'asta'

hyphen = '-'
repetition = 'Repetition'

replacepattern = '{} [: {} ]'
metatemplate = '##META {} {} = {}'
@@ -92,7 +92,7 @@
#: The constant *wrongdet_excluded_words* contains words that lead to incorrect
#: replacement of uter determiners (e.g. *die zijn* would be replaced by *dat zijn*) and
#: therefore have to be excluded from determiner replacement.
wrongdet_excluded_words = ['zijn', 'dicht', 'met', 'ik', 'mee', 'wat', 'alles', 'niet']
wrongdet_excluded_words = ['zijn', 'dicht', 'met', 'ik', 'mee', 'wat', 'alles', 'niet', 'spelen']

#: The constant *e2een_excluded_nouns* contains words that lead to incorrect
#: replacement of e or schwa and
@@ -1115,8 +1115,8 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
name='Character Case', value='Lower case', cat='Orthography')

# dehyphenate
if not validword(token.word, methodname) and hyphen in token.word:
newwords = fullworddehyphenate(token.word, lambda x: validword(x, methodname))
if not validnotalpinocompoundword(token.word, methodname) and hyphen in token.word:
newwords = fullworddehyphenate(token.word, lambda x: validnotalpinocompoundword(x, methodname))
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Dehyphenation', value='Dehyphenation', cat='Pronunciation',
backplacement=bpl_word)
@@ -1132,7 +1132,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
# aha oho uhu ehe
ahapattern = r'([aeouy])h\1'
ahare = re.compile(ahapattern)
if not validword(token.word, methodname) and ahare.search(token.word):
if not validnotalpinocompoundword(token.word, methodname) and ahare.search(token.word):
newwords = [ahare.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
@@ -1323,7 +1323,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
penalty = basepenalties[CONTEXT]
newwords = [newcandidate]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Context Correction', value='Unknown word', cat='lexicon',
name=contextcorrection, value='Unknown word', cat='lexicon',
source=f'{SASTA}/{CONTEXT}', backplacement=bpl_word, penalty=penalty)

# find document specific replacements
@@ -1440,7 +1440,7 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
'moppie', 'punkie', 'saffie', 'stekkie', 'wijfie']


if (not validword(token.word, methodname) or token.word in knowniedimwords) and \
if (not validnotalpinocompoundword(token.word, methodname) or token.word in knowniedimwords) and \
(token.word.endswith('ie') or token.word.endswith('ies')):
newwords = getjeforms(token.word)
for newword in newwords:
@@ -1754,7 +1754,7 @@ def getwrongdetalternatives(tokensmd: TokenListMD, tree: SynTree, uttid: UttId)
backplacement=bpl_node)
metadata.append(meta)
correctiondone = True
elif token.word in dets[het] and dehet == de and infl in ['e']:
elif token.word in dets[het] and ((dehet == de and infl in ['e']) or infl in ['m', 'dm']):
# newcurtoken = replacement(token, swapdehet(token))
newcurtokenword = swapdehet(token.word)
newcurtoken = Token(newcurtokenword, token.pos)
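Several gates in this file switch from validword to validnotalpinocompoundword. Its definition is not part of this diff; a plausible reading, stated purely as an assumption, is that a word Alpino only recognises as a self-composed compound should not block corrections such as dehyphenation. A hedged sketch with stub helpers (isalpinocompound is hypothetical; the real predicate lives in sastadev.lexicon):

KNOWN = {'heleboel', 'appel', 'boom'}

def validword(word: str, methodname: str) -> bool:
    return word in KNOWN                # stub lexicon lookup, for illustration only

def isalpinocompound(word: str) -> bool:
    return word == 'appelboom'          # hypothetical compound check

def validnotalpinocompoundword(word: str, methodname: str) -> bool:
    # assumed semantics: known word, but an Alpino-only compound analysis does not count
    return validword(word, methodname) and not isalpinocompound(word)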
14 changes: 10 additions & 4 deletions src/sastadev/correcttreebank.py
@@ -235,17 +235,23 @@ def smartreplace(node: SynTree, word: str, mn: MethodName) -> SynTree:
newnode = find1(wordtree, './/node[@pt]')
newnodept = getattval(newnode, 'pt')
nodept = getattval(node, 'pt')
nodelemma = getattval(node, 'lemma')
newnodelemma = getattval(newnode, 'lemma')
if isvalidword(word, mn) and \
issamewordclass(node, newnode) and \
not isrobustnoun(newnode) and \
newnodelemma not in nochildwords:
result = newnode
result.attrib['begin'] = getattval(node, 'begin')
result.attrib['end'] = getattval(node, 'end')
result.attrib['rel'] = getattval(node, 'rel')
if nodept == 'ww' and '_' in nodelemma and newnodelemma in nodelemma and '_' not in newnodelemma:
# e.g. nodelemma == 'op_hebben', newnodelemma='hebben'
cpseppos = nodelemma.find('_')
prt = nodelemma[:cpseppos]
result.set('lemma', f'{prt}_{newnodelemma}')
result.set('begin', getattval(node, 'begin'))
result.set('end', getattval(node, 'end'))
result.set('rel', getattval(node, 'rel'))
if 'index' in node.attrib:
result.attrib['index'] = getattval(node, 'index')
result.set('index', getattval(node, 'index'))
if infpvpair(newnode, node):
adaptpv(result)
else:
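The new branch in smartreplace repairs the lemma of a separable-particle verb when the replacement parse dropped the particle; the diff's own example is nodelemma 'op_hebben' with newnodelemma 'hebben'. The string manipulation in isolation:

nodelemma = 'op_hebben'     # lemma on the node being replaced (particle verb)
newnodelemma = 'hebben'     # lemma Alpino assigned to the replacement word

if '_' in nodelemma and newnodelemma in nodelemma and '_' not in newnodelemma:
    cpseppos = nodelemma.find('_')
    prt = nodelemma[:cpseppos]          # 'op'
    print(f'{prt}_{newnodelemma}')      # op_hebben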
5 changes: 4 additions & 1 deletion src/sastadev/data/macros/sastamacros1.txt
@@ -278,7 +278,7 @@ hwwwithsvp = """(contains(@lemma,"_kunnen") or
contains(@lemma,"_zijn")
) """

hwwwithsvpexception = """(@lemma="terug_komen" or @lemma="vast_zitten" or @lemma="weg_gaan" or @lemma="aan_hebben")"""
hwwwithsvpexception = """(@lemma="terug_komen" or @lemma="vast_zitten" or @lemma="weg_gaan" or @lemma="aan_hebben" or @lemma="op_hebben")"""

complement = """((@rel="obj1" or @rel="obj2") or
(@rel="predc" and not(%predcB%)) or
@@ -831,6 +831,9 @@ intrmodalverb = """( @pt="ww" and


nodimlemma = """(@lemma="meisje" or @lemma="koek" or @lemma="beet")"""
dimword = """(ends-with(@word, "je") or ends-with(@word, "jes") or ends-with(@word, "ie") or ends-with(@word, "ies") or ends-with(@word, "ke") or ends-with(@word, "kes") ) """

mvword = """(ends-with(@word, "en") or ends-with(@word, "e") or ends-with(@word, "s"))"""

hequery = """((@lemma="hè" or @lemma="he") and @end != 1 and
( (@end = ancestor::node[@cat="top"]/@end) or
Binary file modified src/sastadev/data/methods/TARSP_Index_Current.xlsx
Binary file not shown.
7 changes: 6 additions & 1 deletion src/sastadev/data/nochildwords/nochildwords.txt
@@ -3,4 +3,9 @@ pee
's
pele
pelen
toeten
toeten
heelboel
pantoet
wintik
afpoelen
int
64 changes: 64 additions & 0 deletions src/sastadev/datasets.py
@@ -0,0 +1,64 @@
from dataclasses import dataclass
import os
from sastadev.conf import settings
from sastadev.sastatypes import MethodName
from sastadev.xlsx import getxlsxdata
from typing import List

space = ' '
MethodVariant = str


datasetfilename = 'DatasetOverview.xlsx'
datasetfolder = settings.DATAROOT
datasetfullname = os.path.join(datasetfolder, datasetfilename)


def robustint(x) -> int:
if x == '' or x == space:
result = 0
else:
result = int(x)
return result


@dataclass
class DataSet:
name: str
methodname: MethodName
use: str
infigures: bool
variant: MethodVariant
samples: int
bronzecount: int
source_org: str
sourcepersons: str
description: str


def row2dataset(row: List[str]) -> DataSet:
rawname = row[0]
lcname = rawname.strip()
rawmethodname = row[1]
methodname = rawmethodname.strip().lower()
infigures = "yes" in row[3].lower()
rawvariant = row[4]
variant = rawvariant.strip().lower()

result = DataSet(name=lcname, methodname=methodname, use=row[2], infigures=infigures, variant=variant,
samples=robustint(row[5]), bronzecount=robustint(row[6]), source_org=row[7], sourcepersons=row[8],
description=row[9])
return result


def getalldatasets():
datasets = []
header, data = getxlsxdata(datasetfullname)
for row in data:
newdataset = row2dataset(row)
datasets.append(newdataset)
return datasets

alldatasets = getalldatasets()
infiguresdatasets = [d for d in alldatasets if d.infigures]
dsname2method = {d.name: d.methodname for d in alldatasets}
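A short usage sketch for the new module; it assumes DatasetOverview.xlsx is present under settings.DATAROOT, and 'auristrain' is only an example name taken from context.py:

from sastadev.datasets import alldatasets, infiguresdatasets, dsname2method

# only the datasets flagged for inclusion in figures
for ds in infiguresdatasets:
    print(ds.name, ds.methodname, ds.variant, ds.samples)

# map a dataset name to its method, e.g. when selecting a query set
print(dsname2method.get('auristrain'))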
4 changes: 2 additions & 2 deletions src/sastadev/external_functions.py
@@ -28,7 +28,7 @@
from sastadev.dedup import correct, mlux, onvolledig, samplesize
from sastadev.imperatives import wond4, wond5plus, wondx, wx, wxy, wxyz, wxyz5
from sastadev.methods import allok, astalemmafilter
from sastadev.queryfunctions import hequery, VzN, vobij, voslashbij, vudivers, xneg_neg, xneg_x
from sastadev.queryfunctions import hequery, tarsp_mvzn, tarsp_verkl, VzN, vobij, voslashbij, vudivers, xneg_neg, xneg_x
from sastadev.stapforms import makestapform
from sastadev.STAPpostfunctions import GL5LVU, GLVU, BB_totaal
from sastadev.Sziplus import sziplus6, vr5plus
@@ -68,7 +68,7 @@ def oldgetfname(f: Callable) -> str:
# Initialisation
thetarspfunctions = [getcompounds, hequery, sziplus6, xenx, vr5plus, wx, wxy, wxyz, wxyz5, wondx, wond4, wond5plus,
tarsp_screening, vutotaal, gofase, gtotaal, pf2, pf3, pf4, pf5, pf6, pf7, pf, xneg_x, xneg_neg,
mktarspform, VzN, vobij, voslashbij, vudivers]
mktarspform, tarsp_mvzn, tarsp_verkl, VzN, vobij, voslashbij, vudivers]

thestapfunctions = [BB_totaal, GLVU, GL5LVU, makestapform]
