Skip to content

Commit

Permalink
Add an Ssurgeon feature which splits a word into pieces based on rege…
Browse files Browse the repository at this point in the history
…x matches. A word can be specified as the head of the new pieces, along with the relation. Other words are pushed down the sentence to make the indices line up
  • Loading branch information
AngledLuffa committed Jul 2, 2024
1 parent bf8ee06 commit 13ede5a
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 0 deletions.
135 changes: 135 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.GrammaticalRelation;

/**
 * Split a word into pieces based on the regular expressions provided by the -regex arguments.
 * <br>
 * Each regex must match the whole text of the node being split.  The piece produced by
 * a regex is the concatenation of all of its capturing groups which took part in the match.
 * If any regex fails to match, or produces an empty piece, the graph is left unchanged.
 * <br>
 * As an example of where this is useful, a tokenization dataset had "
 * stuck to each of the words.  We can separate that out by using two
 * regex, one which matches the " in a group, one which matches the
 * rest of the word without the "
 *
 * @author John Bauer
 */
public class SplitWord extends SsurgeonEdit {
  public static final String LABEL = "splitWord";

  /** Name (from the Semgrex pattern) of the node to split */
  final String node;
  /** One compiled regex per new word piece, in sentence order */
  final List<Pattern> nodeRegex;
  /** 0-based index into nodeRegex of the piece which keeps the original node */
  final int headIndex;
  /** Relation used to attach each non-head piece to the head piece */
  final GrammaticalRelation relation;

  /**
   * Build a splitWord edit.
   *
   * @param node name of the matched node to split
   * @param nodeRegex at least two regex, one for each piece the word is split into
   * @param headIndex 0-based index of the piece to use as the head of the other pieces
   * @param relation dependency used to connect the other pieces to the head
   * @throws SsurgeonParseException if an argument is missing, fewer than two regex
   *         are given, or headIndex does not refer to one of the regex
   */
  public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation) {
    if (node == null) {
      throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
    }
    this.node = node;

    if (nodeRegex == null || nodeRegex.size() == 0) {
      throw new SsurgeonParseException("SplitWord expected -regex with regex to determine which pieces to split the word into");
    }
    if (nodeRegex.size() == 1) {
      throw new SsurgeonParseException("SplitWord expected at least two -regex");
    }
    this.nodeRegex = new ArrayList<>(nodeRegex.size());
    for (String regex : nodeRegex) {
      this.nodeRegex.add(Pattern.compile(regex));
    }

    if (headIndex == null) {
      throw new SsurgeonParseException("SplitWord expected a -headIndex, 0-indexed for the word piece to use when chopping up the word");
    }
    // reject a bad index now, rather than failing with an
    // IndexOutOfBoundsException the first time the edit is applied
    if (headIndex < 0 || headIndex >= nodeRegex.size()) {
      throw new SsurgeonParseException("SplitWord expected a -headIndex between 0 and " + (nodeRegex.size() - 1) + " to match one of the -regex, but got " + headIndex);
    }
    this.headIndex = headIndex;

    if (relation == null) {
      throw new SsurgeonParseException("SplitWord expected a -reln to represent the dependency to use for the new words");
    }
    this.relation = relation;
  }

  /**
   * Writes the edit back out as a tab separated command line:
   * the label, then -node, each -regex, -reln, and -headIndex.
   */
  @Override
  public String toEditString() {
    StringBuilder buf = new StringBuilder();
    buf.append(LABEL);
    buf.append("\t");
    buf.append("-node ").append(node).append("\t");
    for (Pattern regex : nodeRegex) {
      buf.append("-regex ").append(regex).append("\t");
    }
    buf.append("-reln ").append(relation.toString()).append("\t");
    buf.append("-headIndex ").append(headIndex);
    return buf.toString();
  }

  /**
   * Splits the matched node into one word per regex.
   *
   * @return true if the word was split.  false, with the graph untouched,
   *         if the node or its text is missing, if any regex does not match
   *         the whole word, or if any regex produces an empty piece
   */
  @Override
  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
    IndexedWord matchedNode = sm.getNode(node);
    if (matchedNode == null) {
      // the name does not refer to a node in this match, so nothing to split
      return false;
    }
    String origWord = matchedNode.word();
    if (origWord == null) {
      return false;
    }

    // first, iterate over the regex patterns we had at creation time
    //
    // each new word created will be the concatenation of all of the
    // matching groups from this pattern
    List<String> words = new ArrayList<>(nodeRegex.size());
    for (Pattern regex : nodeRegex) {
      String newWord = extractPiece(regex, origWord);
      if (newWord == null) {
        return false;
      }
      words.add(newWord);
    }

    final int matchedIndex = matchedNode.index();
    final int shift = nodeRegex.size() - 1;

    // at this point, we can make new words out of each of the patterns

    // move all words down by nodeRegex.size() - 1
    // then move the original word down by headIndex
    AddDep.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x + shift, true);
    // the head node has its word replaced, and its index & links need
    // to be rearranged, but none of the links are added or removed
    if (headIndex > 0) {
      AddDep.moveNode(sg, sm, matchedNode, matchedIndex + headIndex);
    }
    // look the node up again after the moves above before editing it
    matchedNode = sm.getNode(node);
    matchedNode.setWord(words.get(headIndex));
    matchedNode.setValue(words.get(headIndex));

    for (int i = 0; i < words.size(); ++i) {
      if (i == headIndex) {
        continue;
      }

      // otherwise, add a word with the appropriate index,
      // then connect it to matchedNode
      // TODO: add the ability to set more values, such as POS?
      IndexedWord newNode = new IndexedWord();
      newNode.setDocID(matchedNode.docID());
      newNode.setIndex(matchedIndex + i);
      newNode.setSentIndex(matchedNode.sentIndex());
      newNode.setWord(words.get(i));
      newNode.setValue(words.get(i));

      sg.addVertex(newNode);
      sg.addEdge(matchedNode, newNode, relation, 0.0, false);
    }
    return true;
  }

  /**
   * Applies one regex to the full text of the word and joins its capturing groups.
   *
   * @return the concatenated groups, or null if the regex does not match
   *         the whole text or the result would be empty
   */
  private static String extractPiece(Pattern regex, String text) {
    Matcher regexMatcher = regex.matcher(text);
    if (!regexMatcher.matches()) {
      return null;
    }

    StringBuilder piece = new StringBuilder();
    for (int group = 1; group <= regexMatcher.groupCount(); ++group) {
      // group() is null for a group which did not take part in the match,
      // such as an optional group; appending it would add the text "null"
      String groupText = regexMatcher.group(group);
      if (groupText != null) {
        piece.append(groupText);
      }
    }
    return piece.length() == 0 ? null : piece.toString();
  }
}
24 changes: 24 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
* <li> {@code editNode -node node ...attributes...}
* <li> {@code lemmatize -node node}
* <li> {@code combineMWT -node node -word word}
* <li> {@code splitWord -node node -headIndex idx -reln depType -regex w1 -regex w2 ...}
* <li> {@code setRoots n1 (n2 n3 ...)}
* <li> {@code mergeNodes n1 n2}
* <li> {@code killAllIncomingEdges -node node}
Expand Down Expand Up @@ -146,6 +147,12 @@
* {@code -node} (repeated) is the nodes to edit.
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
*</p><p>
* {@code splitWord} will split a single word into multiple pieces, using the text of the current word.
* {@code -node} is the node to split.
* {@code -headIndex} is the index (counting from 0) of the word piece to make the head.
* {@code -reln} is the name of the dependency type to use.  Pieces other than the head will connect to the head using this relation.
* {@code -regex} (repeated) is a regex which must match the entire word of the matched node.  All of its matching groups will be concatenated to form a new word.  At least 2 are needed to split a word.
*</p><p>
* {@code setRoots} sets the roots of the sentence to a new root.
* {@code n1, n2, ...} are the names of the nodes from the Semgrex to use as the root(s).
* This is best done in conjunction with other operations which actually manipulate the structure
Expand Down Expand Up @@ -397,9 +404,12 @@ public Collection<SsurgeonWordlist> getResources() {
public static final String DEP_NODENAME_ARG = "-dep";
public static final String EDGE_NAME_ARG = "-edge";
public static final String NODENAME_ARG = "-node";
public static final String REGEX_ARG = "-regex";
public static final String RELN_ARG = "-reln";
public static final String NODE_PROTO_ARG = "-nodearg";
public static final String WEIGHT_ARG = "-weight";
public static final String HEAD_INDEX_ARG = "-headIndex";
public static final String HEAD_INDEX_LOWER_ARG = "-headindex";
public static final String NAME_ARG = "-name";
public static final String POSITION_ARG = "-position";
public static final String UPDATE_MORPHO_FEATURES = "-updateMorphoFeatures";
Expand All @@ -420,6 +430,8 @@ protected static class SsurgeonArgs {

public List<String> nodes = new ArrayList<>();

public List<String> regex = new ArrayList<>();

// below are string representations of the intended values
public String nodeString = null;

Expand All @@ -431,6 +443,8 @@ protected static class SsurgeonArgs {

public String updateMorphoFeatures = null;

public Integer headIndex = null;

public Map<String, String> annotations = new TreeMap<>();
}

Expand Down Expand Up @@ -489,12 +503,19 @@ private static SsurgeonArgs parseArgsBox(String args, Map<String, String> additi
case NODENAME_ARG:
argsBox.nodes.add(argsValue);
break;
case REGEX_ARG:
argsBox.regex.add(argsValue);
break;
case NODE_PROTO_ARG:
argsBox.nodeString = argsValue;
break;
case WEIGHT_ARG:
argsBox.weight = Double.valueOf(argsValue);
break;
case HEAD_INDEX_ARG:
case HEAD_INDEX_LOWER_ARG:
argsBox.headIndex = Integer.valueOf(argsValue);
break;
case NAME_ARG:
argsBox.name = argsValue;
break;
Expand Down Expand Up @@ -602,6 +623,9 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
return new KillAllIncomingEdges(argsBox.nodes.get(0));
} else if (command.equalsIgnoreCase(CombineMWT.LABEL)) {
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln);
}
throw new SsurgeonParseException("Error in SsurgeonEdit.parseEditLine: command '"+command+"' is not supported");
} catch (SsurgeonParseException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1954,6 +1954,84 @@ public void readXMLDeleteLeaf() {
assertEquals(newSg, expected);
}

/**
 * Test splitWord, which should split a word into pieces based on regex matches, with the head at position 0
 * <br>
 * "foobar" becomes "foo" and "bar", with "foo" keeping the incoming amod
 * and "bar" attached to it by dep.  The following word moves from index 3 to 4.
 */
@Test
public void readXMLSplitTwoWords() {
  String doc = String.join(newline,
                           "<ssurgeon-pattern-list>",
                           " <ssurgeon-pattern>",
                           " <uid>38</uid>",
                           " <notes>Test splitting a word into two pieces with the head at the start</notes>",
                           " <language>UniversalEnglish</language>",
                           " <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
                           " <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 0</edit-list>",
                           " </ssurgeon-pattern>",
                           "</ssurgeon-pattern-list>");
  Ssurgeon inst = Ssurgeon.inst();
  List<SsurgeonPattern> patterns = inst.readFromString(doc);
  // JUnit expects (expected, actual), which keeps failure messages accurate
  assertEquals(1, patterns.size());
  SsurgeonPattern pattern = patterns.get(0);

  SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
  SemanticGraph newSg = pattern.iterate(sg).first;
  SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [foo-2 dep> bar-3]]");
  assertEquals(expected, newSg);
}

/**
 * Test splitWord, which should split a word into pieces based on regex matches, with the head at position 1
 * <br>
 * "foobar" becomes "foo" and "bar", with "bar" keeping the incoming amod
 * and "foo" attached to it by dep.  The following word moves from index 3 to 4.
 */
@Test
public void readXMLSplitTwoWordsAfter() {
  String doc = String.join(newline,
                           "<ssurgeon-pattern-list>",
                           " <ssurgeon-pattern>",
                           " <uid>38</uid>",
                           // the note previously said "head at the start", copied from the headIndex 0 test
                           " <notes>Test splitting a word into two pieces with the head at the end</notes>",
                           " <language>UniversalEnglish</language>",
                           " <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
                           " <edit-list>splitWord -node split -regex ^(foo)bar$ -regex ^foo(bar)$ -reln dep -headIndex 1</edit-list>",
                           " </ssurgeon-pattern>",
                           "</ssurgeon-pattern-list>");
  Ssurgeon inst = Ssurgeon.inst();
  List<SsurgeonPattern> patterns = inst.readFromString(doc);
  // JUnit expects (expected, actual), which keeps failure messages accurate
  assertEquals(1, patterns.size());
  SsurgeonPattern pattern = patterns.get(0);

  SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
  SemanticGraph newSg = pattern.iterate(sg).first;
  SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [bar-3 dep> foo-2]]");
  assertEquals(expected, newSg);
}

/**
 * Test splitWord, which should split a word into pieces based on regex matches, with three pieces
 * <br>
 * "foobarbaz" becomes "foo", "bar", and "baz", with the middle piece "bar" keeping
 * the incoming amod and the other two pieces attached to it by dep.
 * The following word moves from index 3 to 5.
 */
@Test
public void readXMLSplitThreeWords() {
  String doc = String.join(newline,
                           "<ssurgeon-pattern-list>",
                           " <ssurgeon-pattern>",
                           " <uid>38</uid>",
                           // the note previously described the two piece, headIndex 0 test
                           " <notes>Test splitting a word into three pieces with the head in the middle</notes>",
                           " <language>UniversalEnglish</language>",
                           " <semgrex>" + XMLUtils.escapeXML("{word:/foobarbaz/}=split") + "</semgrex>",
                           " <edit-list>splitWord -node split -regex ^(foo)barbaz$ -regex ^foo(bar)baz$ -regex ^foobar(baz)$ -reln dep -headIndex 1</edit-list>",
                           " </ssurgeon-pattern>",
                           "</ssurgeon-pattern-list>");
  Ssurgeon inst = Ssurgeon.inst();
  List<SsurgeonPattern> patterns = inst.readFromString(doc);
  // JUnit expects (expected, actual), which keeps failure messages accurate
  assertEquals(1, patterns.size());
  SsurgeonPattern pattern = patterns.get(0);

  SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobarbaz-2]");
  SemanticGraph newSg = pattern.iterate(sg).first;
  SemanticGraph expected = SemanticGraph.valueOf("[example-5 det> the-1 amod> [bar-3 dep> foo-2 dep>baz-4]]");
  assertEquals(expected, newSg);
}

/**
* Simple test of an Ssurgeon edit script. This instances a simple semantic graph,
* a semgrex pattern, and then the resulting actions over the named nodes in the
Expand Down

0 comments on commit 13ede5a

Please sign in to comment.