Skip to content

Commit

Permalink
When processing "not only" and similar phrases into UD, separate them…
Browse files Browse the repository at this point in the history
… from the CONJP (sometimes an ADVP, by error) in which they appear. This allows the later part of the converter to connect both of them to the parent with advmod.

As part of this, turn the UPOS of "not" into PART

Also, update the corrector to make a few changes to the structure, which may help users of the trees or of SD as well as of UD. The UD changes, though, are written to accommodate the structural errors in the original PTB.
  • Loading branch information
AngledLuffa committed Nov 8, 2024
1 parent 2945cac commit 7b095f4
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/edu/stanford/nlp/trees/CoordinationTransformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,10 @@ private static Tree findCCparent(Tree t, Tree root) {
/* Matches a CONJP that has a CC child over "but" and an RB child over "also".
   The optional "?$+" clause additionally names the immediately following
   sibling as nextNode when that sibling has at least one grandchild.
   The operation (per Tsurgeon's documented relation syntax) moves cc to be
   the right sister of conjp and rb to be the right sister of cc; when
   nextNode was bound, rb is instead moved to be its first child.  rb is then
   wrapped in a new ADVP and the CONJP node is deleted. */
private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");

/* "not only" is not a MWE, so break up the CONJP similar to "but also".
   The pattern matches a CONJP (or an ADVP, which sometimes appears by error)
   with an RB child over "not" and an RB child over one of
   only / just / merely / even, compared case-insensitively.  The optional
   "?$+" clause names the immediately following sibling as nextNode when that
   sibling has at least one grandchild.

   The alternatives are wrapped in a single (?i:...) group so that ^ and $
   anchor every alternative.  Written as /^(?i)only|just|merely|even$/, the
   anchors bind only to the first and last alternatives, so, assuming Tregex
   uses find-style regex matching, labels such as "justly" or "seven" would
   also be accepted.

   The operation moves both words out of the matched node (as the first
   children of nextNode when it exists, in the order "not", "only"), wraps
   each one in its own ADVP, and deletes the original CONJP/ADVP node. */
private static final TregexPattern NOT_ONLY_PATTERN = TregexPattern.compile("CONJP|ADVP=conjp < (RB=not < /^(?i)not$/) < (RB=only < /^(?i:only|just|merely|even)$/) ?$+ (__=nextNode < (__ < __))");
private static final TsurgeonPattern NOT_ONLY_OPERATION = Tsurgeon.parseOperation("[move not $- conjp] [move only $- not] [if exists nextNode move only >1 nextNode] [if exists nextNode move not >1 nextNode] [createSubtree ADVP not] [createSubtree ADVP only] [delete conjp]");

/* at least / at most / at best / at worst / ... should be treated as if "at"
was a preposition and the RBS was a noun. Assumes that the MWE "at least"
has already been extracted. */
Expand All @@ -749,6 +753,7 @@ public static Tree MWETransform(Tree t) {

Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t);
Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t);
Tsurgeon.processPattern(NOT_ONLY_PATTERN, NOT_ONLY_OPERATION, t);
Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t);
Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1312,7 +1312,6 @@ private UniversalEnglishGrammaticalRelations() {}
MODIFIER,
"S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR|NP(?:-TMP|-ADV)?", tregexCompiler,
"NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ $++ CC)",
"NP|NP-TMP|NP-ADV|NX|NML < (CONJP=target < (RB < /^(?i:not)$/) < (RB|JJ < /^(?i:only|merely|just)$/) $++ CC|CONJP)",
// This matches weird/wrong NP-internal preconjuncts where you get (NP PDT (NP NP CC NP)) or similar
"NP|NP-TMP|NP-ADV|NX|NML < (PDT|CC|DT=target < /^(?i:either|neither|both)$/ ) < (NP < CC)",
"/^S|VP|ADJP|PP|ADVP|UCP(?:-TMP|-ADV)?|NX|NML|SBAR$/ < (PDT|DT|CC=target < /^(?i:either|neither|both)$/ $++ CC)",
Expand Down
3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/trees/UniversalPOSMapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ public static void load() {
// RB -> PART when it is verbal negation (not or its reductions)
{ "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" },

// "not" as part of a phrase such as "not only", "not just", etc is tagged as PART in UD
{ "@ADVP|CONJP <1 (RB=target < /^(?i:not|n't|nt|t|n)$/) <2 (__ < only|just|merely|even) !<3 __", "PART" },

// Otherwise RB -> ADV
{ "RB=target <... {/.*/}", "ADV" },

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,16 @@ private static BufferedReader getBufferedReader(String source) {
"adjoin (NP NN@) newnp\n" +
'\n') +

// Fix not_RB only_JJ, which should generally be not_RB only_RB
// and put it under a CONJP instead of an ADVP
("ADVP|CONJP <1 (__ < /^(?i:not)$/) <2 (JJ=bad < only|just|merely|even) !<3 __\n" +
"relabel bad RB\n" +
'\n') +

("ADVP=bad <1 (__ < /^(?i:not)$/) <2 (RB < only|just|merely|even) !<3 __\n" +
"relabel bad CONJP\n" +
'\n') +

// Fix some cases of 'as well as' not made into a CONJP unit
// There are a few other weird cases that should also be reviewed with the tregex
// well|Well|WELL , as|AS|As . as|AS|As !>(__ > @CONJP)
Expand Down

0 comments on commit 7b095f4

Please sign in to comment.