From 7b095f4b67df5f49d67d309a9a84b09f39140da1 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Thu, 7 Nov 2024 21:05:56 -0800 Subject: [PATCH] When processing "not only" and similar phrases into UD, separate them from the CONJP (sometimes ADVP by error) that they show up in. This allows the later part of the converter to connect both of them to the parent with advmod. As part of this, turn the UPOS of "not" into PART. Also, update the corrector to make a few changes to the structure, which may help usages of the trees or of SD as well as the UD. The UD changes are written to accommodate the structural errors in the original PTB, though. --- .../stanford/nlp/trees/CoordinationTransformer.java | 5 +++++ .../trees/UniversalEnglishGrammaticalRelations.java | 1 - src/edu/stanford/nlp/trees/UniversalPOSMapper.java | 3 +++ .../trees/treebank/EnglishPTBTreebankCorrector.java | 10 ++++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/edu/stanford/nlp/trees/CoordinationTransformer.java b/src/edu/stanford/nlp/trees/CoordinationTransformer.java index debf10c151..4432581920 100644 --- a/src/edu/stanford/nlp/trees/CoordinationTransformer.java +++ b/src/edu/stanford/nlp/trees/CoordinationTransformer.java @@ -728,6 +728,10 @@ private static Tree findCCparent(Tree t, Tree root) { private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))"); private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]"); + /* "not only" is not a MWE, so break up the CONJP similar to "but also". 
*/ + private static final TregexPattern NOT_ONLY_PATTERN = TregexPattern.compile("CONJP|ADVP=conjp < (RB=not < /^(?i)not$/) < (RB=only < /^(?i)only|just|merely|even$/) ?$+ (__=nextNode < (__ < __))"); + private static final TsurgeonPattern NOT_ONLY_OPERATION = Tsurgeon.parseOperation("[move not $- conjp] [move only $- not] [if exists nextNode move only >1 nextNode] [if exists nextNode move not >1 nextNode] [createSubtree ADVP not] [createSubtree ADVP only] [delete conjp]"); + /* at least / at most / at best / at worst / ... should be treated as if "at" was a preposition and the RBS was a noun. Assumes that the MWE "at least" has already been extracted. */ @@ -749,6 +753,7 @@ public static Tree MWETransform(Tree t) { Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t); Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t); + Tsurgeon.processPattern(NOT_ONLY_PATTERN, NOT_ONLY_OPERATION, t); Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t); Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t); diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java index 2579127dee..22c8b96ade 100644 --- a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java +++ b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java @@ -1312,7 +1312,6 @@ private UniversalEnglishGrammaticalRelations() {} MODIFIER, "S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR|NP(?:-TMP|-ADV)?", tregexCompiler, "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ $++ CC)", - "NP|NP-TMP|NP-ADV|NX|NML < (CONJP=target < (RB < /^(?i:not)$/) < (RB|JJ < /^(?i:only|merely|just)$/) $++ CC|CONJP)", // This matches weird/wrong NP-internal preconjuncts where you get (NP PDT (NP NP CC NP)) or similar "NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ ) < (NP < CC)", 
"/^S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR$/ < (PDT|DT|CC=target < /^(?i:either|neither|both)$/ $++ CC)", diff --git a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java index dfc394dfd1..1840e583d8 100644 --- a/src/edu/stanford/nlp/trees/UniversalPOSMapper.java +++ b/src/edu/stanford/nlp/trees/UniversalPOSMapper.java @@ -134,6 +134,9 @@ public static void load() { // RB -> PART when it is verbal negation (not or its reductions) { "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" }, + // "not" as part of a phrase such as "not only", "not just", etc is tagged as PART in UD + { "@ADVP|CONJP <1 (RB=target < /^(?i:not|n't|nt|t|n)$/) <2 (__ < only|just|merely|even) !<3 __", "PART" }, + // Otherwise RB -> ADV { "RB=target <... {/.*/}", "ADV" }, diff --git a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java index 21d6c5aa64..da14689740 100644 --- a/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java +++ b/src/edu/stanford/nlp/trees/treebank/EnglishPTBTreebankCorrector.java @@ -168,6 +168,16 @@ private static BufferedReader getBufferedReader(String source) { "adjoin (NP NN@) newnp\n" + '\n') + + // Fix not_RB only_JJ, which should generally be not_RB only_RB + // and put it under a CONJP instead of an ADVP + ("ADVP|CONJP <1 (__ < /^(?i:not)$/) <2 (JJ=bad < only|just|merely|even) !<3 __\n" + + "relabel bad RB\n" + + '\n') + + + ("ADVP=bad <1 (__ < /^(?i:not)$/) <2 (RB < only|just|merely|even) !<3 __\n" + + "relabel bad CONJP\n" + + '\n') + + // Fix some cases of 'as well as' not made into a CONJP unit // There are a few other weird cases that should also be reviewed with the tregex // well|Well|WELL , as|AS|As . as|AS|As !>(__ > @CONJP)