diff --git a/src/sastadev/alpinoparsing.py b/src/sastadev/alpinoparsing.py index ddbce0f..78f84b6 100644 --- a/src/sastadev/alpinoparsing.py +++ b/src/sastadev/alpinoparsing.py @@ -83,7 +83,11 @@ def parse(origsent: str, escape: bool = True): if 300 > r1.status >= 200: streebytes = r1.read() # print(streebytes.decode('utf8')) - stree = etree.fromstring(streebytes) + try: + stree = etree.fromstring(streebytes) + except etree.XMLSyntaxError as e: + sastadev.conf.settings.LOGGER.error(f'Error: {e} for {sent}') + stree = None return stree else: sastadev.conf.settings.LOGGER.error('parsing failed:', r1.status, r1.reason, sent) diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py index 46ba621..4214552 100644 --- a/src/sastadev/correcttreebank.py +++ b/src/sastadev/correcttreebank.py @@ -22,7 +22,7 @@ from sastadev.sva import phicompatible from sastadev.syllablecount import countsyllables from sastadev.targets import get_mustbedone -from sastadev.treebankfunctions import (adaptsentence, add_metadata, countav, deflate, +from sastadev.treebankfunctions import (adaptsentence, add_metadata, clausecats, countav, deflate, deletewordnodes, fatparse, find1, getattval, getbeginend, getcompoundcount, getneighbourwordnode, getnodeyield, getorigutt, @@ -55,7 +55,7 @@ ParsedCorrection = Tuple[List[str], SynTree, List[Meta]] TupleNint = Tuple[19 * (int,)] -altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'complsucount', 'dimcount', 'compcount', 'supcount', +altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'topclause', 'complsucount', 'dimcount', 'compcount', 'supcount', 'compoundcount', 'unknownwordcount', 'wrongposwordcount', 'smainsucount', 'sucount', 'svaokcount', 'deplusneutcount', 'badcatcount', 'hyphencount', 'lonelytoecount', 'basicreplaceecount', 'ambigcount', 'subjunctivecount', 'unknownnouncount', 'unknownnamecount', 'dezebwcount', 'noun1c_count'] @@ -70,7 +70,7 @@ class Alternative(): - 
def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount, + def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount, compcount, supcount, compoundcount, unknownwordcount, wrongposwordcount, smainsucount, sucount, svaok, deplusneutcount, badcatcount, hyphencount, lonelytoecount, basicreplaceecount, ambigcount, subjunctivecount, unknownnouncount, unknownnamecount, @@ -82,6 +82,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, maincl self.dpcount: int = int(dpcount) self.dhyphencount: int = int(dhyphencount) self.mainclausecount: int = int(mainclausecount) + self.topclause: int = int(topclause) self.complsucount: int = int(complsucount) self.dimcount: int = int(dimcount) self.compcount: int = int(compcount) @@ -117,7 +118,7 @@ def alt2row(self, uttid: UttId, base: str, user1: str = '', user2: str = '', use score = ampersand.join(scores) part4 = list( map(str, [self.altid, self.altsent, score, self.penalty, self.dpcount, self.dhyphencount, - self.mainclausecount, self.complsucount, + self.mainclausecount, self.topclause, self.complsucount, self.dimcount, self.compcount, self.supcount, self.compoundcount, self.unknownwordcount, self.wrongposwordcount, self.smainsucount, self.sucount, self.svaok, self.deplusneutcount, self.badcatcount, self.hyphencount, self.lonelytoecount, @@ -975,7 +976,7 @@ def oldgetuttid(stree: SynTree) -> UttId: def scorefunction(obj: Alternative) -> TupleNint: return (-obj.unknownwordcount, -obj.wrongposwordcount,-obj.unknownnouncount, -obj.unknownnamecount, -obj.ambigcount, -obj.dpcount, - -obj.dhyphencount, -obj.mainclausecount, + -obj.dhyphencount, -obj.mainclausecount, obj.topclause, -obj.complsucount, -obj.badcatcount, -obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, -obj.lonelytoecount, -obj.subjunctivecount, obj.smainsucount, obj.dimcount, @@ -1087,6 +1088,7 @@ def 
selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc sucount = countav(nt, 'rel', 'su') lonelytoecount = getlonelytoecount(nt) mainclausecount = getmainclausecount(nt) + topclause = gettopclause(nt) smainsucount = countsmainsu(nt) svaokcount = getsvaokcount(nt) deplusneutcount = getdeplusneutcount(nt) @@ -1112,7 +1114,7 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc # overregcount but these will mostly be unknown words # mwunamecount well maybe unknownpropernoun first - alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount, compcount, + alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount, compcount, supcount, compoundcount, unknownwordcount, wrongposwordcount, smainsucount, sucount, svaokcount, deplusneutcount, badcatcount, hyphencount, lonelytoecount, @@ -1167,6 +1169,21 @@ def getmainclausecount(nt: SynTree) -> int: result = lmatches return result +topxpath = './/node[@cat="top"]' +def gettopclause(nt: SynTree) -> int: + tops = nt.xpath(topxpath) + if tops == []: + return 0 + top = tops[0] + realchildren = [child for child in top if getattval(child, 'pt') not in ['let', 'tsw']] + if len(realchildren) != 1: + return 0 + else: + thechild = realchildren[0] + thechildcat = getattval(thechild, 'cat') + result = 1 if thechildcat in clausecats else 0 + return result + toexpath = './/node[@lemma="toe" or (@lemma="tot" and @vztype="fin")]' naarxpath = './/node[@lemma="naar"]' def getlonelytoecount(nt: SynTree) -> int: diff --git a/src/sastadev/smallclauses.py b/src/sastadev/smallclauses.py index e32e8c4..fa8db49 100644 --- a/src/sastadev/smallclauses.py +++ b/src/sastadev/smallclauses.py @@ -293,7 +293,7 @@ def getauxform(aux: str, node:SynTree) -> str: result = 'heeft' if aux == 'hebben' else 'is' return result -def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty): 
+def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty, cat=smallclause): insertposs = [token.pos + token.subpos for token in inserttokens] insertwordlist = [token.word for token in inserttokens] tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist] diff --git a/src/sastadev/toe.py b/src/sastadev/toe.py index c6589b0..4014647 100644 --- a/src/sastadev/toe.py +++ b/src/sastadev/toe.py @@ -7,6 +7,8 @@ from sastadev.smallclauses import bg, mkinsertmeta, realword, word from sastadev.tokenmd import TokenListMD +lonelytoecat = 'Lonely toe' + def isdet(node) -> bool: nodept = getattval(node, 'pt') nodepdtype = getattval(node, 'pdtype' ) @@ -50,7 +52,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]: if isdet(thisnode) and getattval(nextnode, 'pt') == 'n': naartoken = Token('naar', token.pos, subpos=5) inserttokens = [naartoken] - metadata += mkinsertmeta(inserttokens, newtokens) + metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoecat) naarfound = True newtokens.append(naartoken) insertiondone = True @@ -62,7 +64,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]: naarfound = True newtokens.append(naartoken) inserttokens = [naartoken] - metadata += mkinsertmeta(inserttokens, newtokens) + metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoecat) insertiondone = True newtokens.append(token) if insertiondone: diff --git a/src/sastadev/treebankfunctions.py b/src/sastadev/treebankfunctions.py index abc96ad..9c86243 100644 --- a/src/sastadev/treebankfunctions.py +++ b/src/sastadev/treebankfunctions.py @@ -610,7 +610,7 @@ def mktoken2nodemap(tokens: List[Token], tree: SynTree) -> Dict[int, SynTree]: tokennodes = tree.xpath('.//node[@pt or @pos or @word]') tokennodesdict = {int(getattval(n, 'begin')): n for n in tokennodes} token2nodemap = {token.pos: tokennodesdict[token.pos] - for token in tokens if keycheck(token.pos, tokennodesdict)} + for token in tokens if not 
token.skip and keycheck(token.pos, tokennodesdict)} return token2nodemap