various tarsp updates, asta updates
JanOdijk committed Dec 19, 2024
1 parent f018a82 commit 472ae75
Showing 9 changed files with 184 additions and 14 deletions.
19 changes: 15 additions & 4 deletions src/sastadev/basicreplacements.py
@@ -13,6 +13,7 @@

ampersand = '&'

adjnambiguity = 'Avoiding adj - noun ambiguity'
dp = defaultpenalty
dp6 = mp(160) # dp + 6
dp3 = mp(130) # dp + 3
@@ -37,6 +38,7 @@
phonrepl = '/{wrong}/ instead of /{correct}/'
wronginfl = 'Incorrect inflection'
morph = 'Morphology'
wrongmorph = 'Wrong Morphology'
overgen = 'Overgeneralisation'
typo = 'Typo'
typorepl = '{wrong} instead of {correct}'
@@ -65,6 +67,7 @@
voweldel = 'vowel deletion'
avoidambiguity = 'Avoiding ambiguity'
wwnambiguity = 'Verb - Noun ambiguity'
pnnambiguity = 'Person name - combined surname ambiguity'

def combine(strlist: List[str]) -> str:
return ampersand.join(strlist)
@@ -155,8 +158,8 @@ def combine(strlist: List[str]) -> str:
('il', 'wil', pron, pronerr, onsetred, dp),
('tee', 'twee', pron, pronerr, onsetred, dp),
('nie', 'niet', pron, infpron, codared, dp),
('s', 'is', orth, spellerr, apomiss, dp),
("'s", 'is', pron, infpron, redpron, dp),
# ('s', 'is', orth, spellerr, apomiss, dp), # moved to corrector for context
# ("'s", 'is', pron, infpron, redpron, dp), # moved to corrector for context
('ooke', 'ook', pron, infpron, addschwa, dp),
('it', 'dit', pron, pronerr, onsetred, dp),
('da', 'dat', pron, infpron, codared, dp),
@@ -173,6 +176,7 @@ def combine(strlist: List[str]) -> str:
('maggen', 'mogen', morph, wronginfl, overgen, dp),
('aleen', 'alleen', orth, typo, typorepl.format(wrong='aleen', correct='alleen'), dp),
('heef', 'heeft', pron, infpron, codared, dp),
('heef', 'heb', morph, wrongmorph, phonrepl.format(wrong='heef', correct='heb'), dp),
('saan', 'staan', pron, wrongpron, onsetred, dp),
('saan', 'gaan', pron, wrongpron, wrongpron, mp(120)),
('jerke', 'werken', pron, wrongpron, wrongpron, dp),
@@ -250,7 +254,11 @@ def combine(strlist: List[str]) -> str:
('lus', 'lust', pron, infpron, codared, dp),
('mij', 'mijn', pron, infpron, codared, dp),
('drinken', 'voedsel', avoidambiguity, wwnambiguity, wwnambiguity, dp ),
('jou', 'jouw', pron, infpron, codared, -dp) # Td 22, 30 ik wil ook keer naar jou huis find criterion
('jou', 'jouw', pron, infpron, codared, -dp), # Td 22, 30 ik wil ook keer naar jou huis find criterion
# ('kijke', 'kijk', pron, infpron, emphasis, dp), # TD05, 32 moved to disambiguationdict
# ('geel', 'mooi', avoidambiguity, adjnambiguity, dp), #TD05, 24
# ('Roy', 'Jan', avoidambiguity, pnnambiguity, dp)
# ('surf', 'turf', avoidambiguity, wwnambiguity, dp), # TD05, 35
# ('leggen', 'liggen', lexical, dial, '', dp), # moved to corrector : only if parse is illformed
# ('legt', 'ligt', lexical, dial, '', dp), # moved to corrector : only if parse is illformed
# ('leg', 'lig', lexical, dial, '', dp) # moved to corrector : only if parse is illformed
@@ -430,8 +438,11 @@ def welnietttp(token: Token, stree: SynTree) -> bool:
(dtp, ['bomen', 'kussen', 'kaarten', 'beesten', 'weken', 'huizen', 'apen', 'poten',
'wieken', 'paarden', 'stoelen', 'ramen', 'strepen', 'planten', 'groeten',
'flessen', 'boeren', 'punten', 'tranen'], 'teilen'),
(dtp, ['snel', 'wit', 'kort', 'dicht'], 'mooi'),
(dtp, ['snel', 'wit', 'kort', 'dicht', 'geel'], 'mooi'),
(dtp, ['witte'], 'mooie'),
(dtp, ['Roy'], 'Jan'),
# (dtp, ['kijke'], 'he'),
(dtp, ['surf'], 'turf'),
(welnietttp, ['wel', 'niet'], 'ietsjes') # find a different adverb that does not get inside constituents (ietsjes?)
]

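
The replacement tuples in basicreplacements.py follow a six-field pattern: (observed form, replacement, category, error label, explanation, penalty). A minimal sketch of how such a table might be consulted; the list name, the literal category strings, and the flat penalty value of 100 are illustrative assumptions, not the sastadev API:

from typing import List, Tuple

# (wrong form, correct form, category, error label, explanation, penalty)
Replacement = Tuple[str, str, str, str, str, int]

basicreplacementlist: List[Replacement] = [
    ('heef', 'heeft', 'Pronunciation', 'Informal pronunciation', 'coda reduction', 100),
    ('heef', 'heb', 'Morphology', 'Wrong Morphology', '/heef/ instead of /heb/', 100),
]

def candidates(word: str) -> List[Replacement]:
    # every replacement whose wrong form matches the observed word;
    # an ambiguous form like 'heef' yields several competing corrections
    return [r for r in basicreplacementlist if r[0] == word]

for wrong, correct, cat, err, expl, penalty in candidates('heef'):
    print(correct, penalty)    # prints: heeft 100, then heb 100
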
1 change: 1 addition & 0 deletions src/sastadev/constants.py
@@ -25,6 +25,7 @@
testsetsfolder = 'testsets'
dataset_treebanksfolder = 'dataset_treebanksfolder'
errorsummaryfolder = 'errorsummaries'
lemmafolder = 'lemmas'

analysissuffix = '_analysis'
analysistsvsuffix = '_analysis.tsv'
15 changes: 14 additions & 1 deletion src/sastadev/corrector.py
@@ -77,6 +77,9 @@

enexceptions = {'inne', 'mette', 'omme', 'oppe', 'vanne'}
leggendict = {'leg': 'lig', 'legt': 'ligt', 'leggen': 'liggen'}
aposfollowers = {'ochtends', 'middags', 'avonds', 'nachts', 'morgens', 'werelds', 'lands', 'anderendaags',
'winters', 'zomers', 'namiddags',
'zondags', 'maandags', 'dinsdags', 'woensdags', 'donderdags', 'vrijdags', 'zaterdags'}

#: The constant *disambiguationdict* contains words that should be replaced by a
#: different word to avoid unwanted readings of the original word. It is filled by a
@@ -1241,7 +1244,17 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
name='Informal pronunciation', value='Final t-deletion', cat='Pronunciation',
backplacement=bpl_word)

# clause intial maar must be parsed as conjunction not as ana dverb: we replces it by "en" to avoid the ambiguity
# 's and s could be is, but do not try it when followed by ochtends etc
if token.word in ["'s", "s"] and nexttoken.word not in aposfollowers:
newwords = ['is']
valvalue = 'reduced pronunciation'
catval = 'Pronunciation'
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Informal pronunciation', value=valvalue, cat=catval,
backplacement=bpl_word)


# clause-initial maar must be parsed as a conjunction, not as an adverb: we replace it by "en" to avoid the ambiguity
if token.word == 'maar':
initialmaars = tree.xpath(initialmaarvgxpath)
for initialmaar in initialmaars:
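
The new aposfollowers guard is the core of this hunk: "'s" and "s" may be reduced forms of "is", but not in temporal genitives such as "'s ochtends". A stand-alone sketch of that condition; the helper name and the abridged follower set are illustrative:

# abridged from the aposfollowers set added above
aposfollowers = {'ochtends', 'middags', 'avonds', 'nachts'}

def maybe_expand_apos_s(word: str, nextword: str) -> list:
    # propose "is" only when the next word does not force the genitive reading
    if word in ("'s", "s") and nextword not in aposfollowers:
        return ['is']
    return []

print(maybe_expand_apos_s("'s", "mooi"))      # ['is']: "'s mooi" read as "is mooi"
print(maybe_expand_apos_s("'s", "ochtends"))  # []: "'s ochtends" is left alone
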
19 changes: 15 additions & 4 deletions src/sastadev/correcttreebank.py
@@ -42,7 +42,8 @@
showtree, simpleshow, subclasscompatible, transplant_node,
treeinflate, treewithtokenpos,
updatetokenpos)
from sastadev.treetransform import transformtagcomma, transformtreeld, transformtreenogeen, transformtreenogde
from sastadev.treetransform import adaptlemmas, transformtagcomma, transformtreeld, transformtreenogeen, \
transformtreenogde, nognietsplit

ampersand = '&'

@@ -571,6 +572,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
stree = transformbwinnp(stree)
stree = transformtreenogeen(stree)
stree = transformtreenogde(stree)
# stree = nognietsplit(stree) # put off because it should not be done

# adapt lemmas for words for which we know Alpino gets the lemma wrong
stree = adaptlemmas(stree)

allmetadata = []
# orandalts = []
@@ -593,7 +598,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
lmetadatalist = len(metadatalist)
if lmetadatalist == 0:
settings.LOGGER.error('Missing metadata in utterance {}'.format(uttid))
origmetadata = []
origmetadata = None
else:
if lmetadatalist > 1:
settings.LOGGER.error(
@@ -666,7 +671,8 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
# newstree = insertskips(newstree, correctiontokenlist, stree)
# simpleshow(stree)
mdcopy = deepcopy(origmetadata)
fatnewstree.insert(0, mdcopy)
if mdcopy is not None:
fatnewstree.insert(0, mdcopy)
# copy the sentid attribute
sentencenode = getsentencenode(fatnewstree)
if sentencenode is not None:
@@ -918,6 +924,11 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
fulltree = transformbwinnp(fulltree)
fulltree = transformtreenogeen(fulltree)
fulltree = transformtreenogde(fulltree)
# fulltree = nognietsplit(fulltree) # put off because it should not be done

# adapt lemmas for words for which we know Alpino gets the lemma wrong
fulltree = adaptlemmas(fulltree)


# fulltree = deflate(fulltree) # put off because there may be expanded elements
debug = False
@@ -1363,7 +1374,7 @@ def getmaaradvcount(nt: SynTree, md: List[Meta], mn: MethodName) -> int:
Criterion("supcount", getsupcount, positive, "Number of words that are superlatives"),
Criterion("compoundcount", localgetcompoundcount, positive, "Number of nouns that are compounds"),
Criterion("sucount", getsucount, positive, "Number of subjects"),
Criterion("svaok", getsvaokcount, positive, "Numbe rof time subject verb agreement is OK"),
Criterion("svaok", getsvaokcount, positive, "Number of time subject verb agreement is OK"),
Criterion("deplusneutcount", getdeplusneutcount, negative, "Number of deviant configuratios with de-determeine + neuiter noun"),
Criterion("dezebwcount", getdezebwcount, negative, "Count of 'deze' as adverb"),
Criterion("penalty", compute_penalty, negative, "Penalty for the changes made")
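
The Criterion entries suggest that candidate corrections are scored on positively and negatively weighted counts. A sketch of that idea; the NamedTuple layout and the plain summation are assumptions for illustration, since the diff does not show how sastadev combines the criteria:

from typing import Callable, NamedTuple

class Criterion(NamedTuple):
    name: str
    get: Callable[[dict], int]
    polarity: int              # +1 for "positive" criteria, -1 for "negative" ones
    description: str

criteria = [
    Criterion('svaok', lambda c: c['svaok'], +1, 'times subject-verb agreement is OK'),
    Criterion('penalty', lambda c: c['penalty'], -1, 'penalty for the changes made'),
]

candidate = {'svaok': 2, 'penalty': 130}
score = sum(crit.polarity * crit.get(candidate) for crit in criteria)
print(score)                   # 2 - 130 = -128
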
45 changes: 41 additions & 4 deletions src/sastadev/lexicon.py
@@ -19,7 +19,7 @@
namepart_isa_namepart_uc)
from sastadev.readcsv import readcsv
from sastadev.sastatypes import CELEX_INFL, DCOITuple, Lemma, SynTree, WordInfo
from sastadev.stringfunctions import ispunctuation
from sastadev.stringfunctions import ispunctuation, strip_accents

alpinoparse = settings.PARSE_FUNC
space = ' '
@@ -67,6 +67,17 @@ def initializelexicondict(lexiconfilename) -> Dict[str,str]:
lexicon[strippedword] = strippedreplacement
return lexicon


def geninitializelexicondict(lexiconfilename, key: int, header=True) -> Dict[str, List[str]]:
lexicon = {}
fptuples = readcsv(lexiconfilename, header=header)
for _, fp in fptuples:
strippedkey = fp[key].strip()
strippedentry = [el.strip() for el in fp]
lexicon[strippedkey] = strippedentry
return lexicon


def initializelexicondefdict(lexiconfilename) -> Dict[str,List[str]]:
lexicon = defaultdict(list)
fptuples = readcsv(lexiconfilename, header=False)
@@ -149,6 +160,9 @@ def getwordposinfo(word: str, pos: str) -> List[WordInfo]:
results = []
if lexicon == celex:
results = celexlexicon.getwordposinfo(word, pos)
if results == []:
cleanword = strip_accents(word)
results = celexlexicon.getwordposinfo(cleanword, pos)
return results


@@ -161,6 +175,9 @@ def getwordinfo(word: str) -> List[WordInfo]:
results = []
if lexicon == celex:
results = celexlexicon.getwordinfo(word)
if results == []:
cleanword = strip_accents(word)
results = celexlexicon.getwordinfo(cleanword)
return results


@@ -174,7 +191,11 @@ def informlexicon(word: str) -> bool:
result = True
for aword in allwords:
if lexicon == 'celex':
result = result and celexlexicon.incelexdmw(aword)
wordfound = celexlexicon.incelexdmw(aword)
if not wordfound:
cleanword = strip_accents(aword)
wordfound = celexlexicon.incelexdmw(cleanword)
result = result and wordfound
elif lexicon == 'alpino':
result = False
else:
@@ -194,7 +215,11 @@ def informlexiconpos(word: str, pos: str) -> bool:
result = True
for aword in allwords:
if lexicon == 'celex':
result = result and celexlexicon.incelexdmwpos(aword, pos)
wordfound = celexlexicon.incelexdmwpos(aword, pos)
if not wordfound:
cleanword = strip_accents(aword)
wordfound = celexlexicon.incelexdmwpos(cleanword, pos)
result = result and wordfound
elif lexicon == 'alpino':
result = False
else:
@@ -332,4 +357,16 @@ def getinflforms(thesubj: SynTree, thepv: SynTree, inversion: bool) -> List[str]
wrongposwordslexicon = initializelexicon(wrongposwordslexiconfullname)

# validnouns is intended for nouns that Alpino assigns frame (both, both, both) but that are valid Dutch words
validnouns = {'knijper', 'roosvicee'}
validnouns = {'knijper', 'roosvicee'}

lexiconfoldername = 'data/wordsunknowntoalpino'
lemmalexiconfilename = 'lemmalexicon.txt'
lemmalexiconfulname = os.path.join(settings.SD_DIR, lexiconfoldername, lemmalexiconfilename)
lemmalexicon = initializelexicondict(lemmalexiconfulname)

lexiconfoldername = 'data/wordsunknowntoalpino'
cardinallexiconfilename = 'cardinalnumerals.tsv'
cardinallexiconfullname = os.path.join(settings.SD_DIR, lexiconfoldername, cardinallexiconfilename)
cardinallexicon = geninitializelexicondict(cardinallexiconfullname, 0)

junk = 0 # to have a breakpoint after the last lexicon read
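
geninitializelexicondict generalises the existing loaders: it keys the whole (stripped) row on one chosen column. A file-free sketch of the same idea, with invented cardinal-numeral rows standing in for cardinalnumerals.tsv:

from typing import Dict, List

def rows_to_lexicon(rows: List[List[str]], key: int) -> Dict[str, List[str]]:
    # same logic as geninitializelexicondict, but over in-memory rows
    lexicon: Dict[str, List[str]] = {}
    for fp in rows:
        lexicon[fp[key].strip()] = [el.strip() for el in fp]
    return lexicon

rows = [['twee', '2'], ['drie', '3']]     # hypothetical file contents
cardinallexicon = rows_to_lexicon(rows, 0)
print('twee' in cardinallexicon)          # True, so iscardinal() in tblex.py would fire
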
38 changes: 38 additions & 0 deletions src/sastadev/nogniet.py
@@ -0,0 +1,38 @@
import copy
from sastadev.sastatypes import SynTree
from sastadev.treebankfunctions import find1


nognietxpath = """.//node[@cat="advp" and node[@rel="mod" and @lemma="nog"] and node[@rel="hd" and @lemma="niet"]]"""
zelfinnpmodxpath = """.//node[@rel="mod" and @lemma="zelf" and parent::node[@cat="np"]]"""

def nognietsplit(stree: SynTree) -> SynTree:
newstree = copy.deepcopy(stree)
nognietnodes = newstree.xpath(nognietxpath)
if nognietnodes == []:
return stree
for nognietnode in nognietnodes:
nog = find1(nognietnode, """./node[@lemma="nog"]""")
nognietnodeparent = nognietnode.getparent()
nognietnode.remove(nog)
for i, anode in enumerate(nognietnodeparent):
if anode == nognietnode:
nognietnodepos = i
break
nognietnodeparent[i] = nognietnode
return newstree


def ikzelfsplit(stree: SynTree) -> SynTree:
newstree = copy.deepcopy(stree)
zelfinnps = newstree.xpath(zelfinnpmodxpath)
if zelfinnps == []:
return stree
for zelfinnp in zelfinnps:
npnode = zelfinnp.getparent()
npparentnode = npnode.getparent()





4 changes: 3 additions & 1 deletion src/sastadev/stringfunctions.py
@@ -507,7 +507,9 @@ def realwordstring(w: str) -> bool:
result = not unicodedata.category(w).startswith('P')
return result


def strip_accents(s):
return ''.join(c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn')
def getallrealwords(allresults):
result = {}
for uttid in allresults.allutts:
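
strip_accents works by decomposing to NFD and dropping combining marks (Unicode category Mn), which is what lets the new lexicon.py lookups fall back to unaccented forms. A quick self-contained check:

import unicodedata

def strip_accents(s: str) -> str:
    # NFD splits 'e' + acute out of 'é'; the acute is category Mn and is dropped
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

print(strip_accents('één'))    # een
print(strip_accents('café'))   # cafe
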
12 changes: 12 additions & 0 deletions src/sastadev/tblex.py
@@ -175,9 +175,21 @@ def asta_recognised_nounnode(node: SynTree) -> bool:
result = result or recognised_lemmanodepos(node, pos)
result = result and not (all_lower_consonantsnode(node))
result = result and not (short_nucl_n(node))
result = result and not iscardinal(node)
return result


def iscardinal(node):
word = getattval(node, 'word')
wordlc = word.lower()
if wordlc == '':
result = False
elif wordlc in lex.cardinallexicon:
result = True
else:
result = False
return result

def asta_recognised_wordnode(node: SynTree) -> bool:
result = sasta_pseudonym(node)
result = result or spec_noun(node)
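
The new filter means a word found in cardinallexicon is no longer counted as an ASTA-recognised noun. A sketch of the test on a bare string; the toy lexicon replaces the TSV-loaded one:

cardinallexicon = {'twee', 'drie', 'vier'}    # stand-in for the loaded lexicon

def iscardinal_word(word: str) -> bool:
    # mirrors iscardinal() above, minus the getattval(node, 'word') step
    wordlc = word.lower()
    return wordlc != '' and wordlc in cardinallexicon

print(iscardinal_word('Twee'))   # True: excluded from the noun count
print(iscardinal_word('boom'))   # False: still eligible as a recognised noun
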
45 changes: 45 additions & 0 deletions src/sastadev/treetransform.py
@@ -1,5 +1,6 @@
import copy
from sastadev.conf import settings
from sastadev.lexicon import lemmalexicon
from sastadev.treebankfunctions import find1, getattval, getbeginend, getnodeyield, getyield, \
immediately_precedes, iswordnode, showtree
from sastadev.sastatypes import SynTree
@@ -23,6 +24,8 @@
@lemma="zo'n" or @pt="tw") and parent::node[@cat="np"]]"""
dexpath = """.//node[(@lemma="de" or @lemma="het" or @lemma="deze" or @lemma="die") and parent::node[@cat="np"]]"""

nognietxpath = """.//node[@cat="advp" and node[@rel="mod" and @lemma="nog"] and node[@rel="hd" and @lemma="niet"] and not(parent::node[@cat="top"])]"""
zelfinnpmodxpath = """.//node[@rel="mod" and @lemma="zelf" and parent::node[@cat="np"]]"""

def transformtreeld(stree:SynTree) -> SynTree:
debug = False
@@ -133,6 +136,48 @@ def transformtagcomma(stree: SynTree) -> SynTree:
return result


def nognietsplit(stree: SynTree) -> SynTree:
debug = False
if debug:
showtree(stree, 'nognietsplit: stree')
newstree = copy.deepcopy(stree)
nognietnodes = newstree.xpath(nognietxpath)
if nognietnodes == []:
return stree
for nognietnode in nognietnodes:
nog = find1(nognietnode, """./node[@lemma="nog"]""")
niet = find1(nognietnode, """./node[@lemma="niet"]""")
nognietnodeparent = nognietnode.getparent()
nognietnode.remove(nog)
nognietnode.remove(niet)
nognietnodeparent.remove(nognietnode)
nognietnodeparent.append(nog)
niet.attrib['rel'] = 'mod'
nognietnodeparent.append(niet)
if debug:
showtree(newstree, 'nognietsplit: newstree')
return newstree


def adaptlemmas(stree: SynTree) -> SynTree:
newlemmafound = False
newstree = copy.deepcopy(stree)
for node in newstree.iter():
if node.tag == 'node' and iswordnode(node):
nodeword = getattval(node, 'word')
nodelemma = getattval(node, 'lemma')
if nodeword == nodelemma and nodeword in lemmalexicon:
# node.attrib['lemma'] = lemmalexicon[nodeword]
node.set('lemma', lemmalexicon[nodeword])
newlemmafound = True

if newlemmafound:
result = newstree
else:
result = stree
return result


def isfiniteverbnode(node: SynTree) -> bool:
pt = getattval(node, 'pt')
wvorm = getattval(node, 'wvorm')
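
adaptlemmas only touches word nodes whose lemma is a verbatim copy of the word, the typical symptom of Alpino guessing. A runnable sketch on a toy lxml tree; the lemmalexicon entry is invented, and the word-attribute test stands in for iswordnode:

from lxml import etree

lemmalexicon = {'heef': 'hebben'}    # hypothetical word -> corrected lemma

tree = etree.fromstring('<node><node word="heef" lemma="heef" pt="ww"/></node>')
for node in tree.iter('node'):
    word, lemma = node.get('word'), node.get('lemma')
    if word is not None and word == lemma and word in lemmalexicon:
        node.set('lemma', lemmalexicon[word])    # the same node.set call as above

print(etree.tostring(tree, encoding='unicode'))
# <node><node word="heef" lemma="hebben" pt="ww"/></node>
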
