Skip to content

Commit

Permalink
Allow more language models as input (Cleaning - Java part)
Browse files (browse the repository at this point in the history)
  • Loading branch information
loic-vial committed Jan 13, 2020
1 parent 3ae0a4c commit f4e244e
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 187 deletions.
18 changes: 13 additions & 5 deletions java/src/main/java/NeuralWSDDecode.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,6 +21,8 @@ public static void main(String[] args) throws Exception
new NeuralWSDDecode().decode(args);
}

private boolean filterLemma;

private boolean mfsBackoff;

private Disambiguator firstSenseDisambiguator;
Expand All @@ -42,9 +44,10 @@ private void decode(String[] args) throws Exception
parser.addArgument("sense_compression_instance_hypernyms", "false");
parser.addArgument("sense_compression_antonyms", "false");
parser.addArgument("sense_compression_file", "");
parser.addArgument("clear_text", "false");
parser.addArgument("clear_text", "true");
parser.addArgument("batch_size", "1");
parser.addArgument("truncate_max_length", "150");
parser.addArgument("filter_lemma", "true");
parser.addArgument("mfs_backoff", "true");
if (!parser.parse(args)) return;

Expand All @@ -59,8 +62,9 @@ private void decode(String[] args) throws Exception
boolean clearText = parser.getArgValueBoolean("clear_text");
int batchSize = parser.getArgValueInteger("batch_size");
int truncateMaxLength = parser.getArgValueInteger("truncate_max_length");
filterLemma = parser.getArgValueBoolean("filter_lemma");
mfsBackoff = parser.getArgValueBoolean("mfs_backoff");

Map<String, String> senseCompressionClusters = null;
if (senseCompressionHypernyms || senseCompressionAntonyms)
{
Expand All @@ -75,6 +79,7 @@ private void decode(String[] args) throws Exception
firstSenseDisambiguator = new FirstSenseDisambiguator(WordnetHelper.wn30());
neuralDisambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights, clearText, batchSize);
neuralDisambiguator.lowercaseWords = lowercase;
neuralDisambiguator.filterLemma = filterLemma;
neuralDisambiguator.reducedOutputVocabulary = senseCompressionClusters;

reader = new BufferedReader(new InputStreamReader(System.in));
Expand All @@ -85,9 +90,12 @@ private void decode(String[] args) throws Exception
Sentence sentence = new Sentence(line);
if (sentence.getWords().size() > truncateMaxLength)
{
sentence.getWords().stream().skip(truncateMaxLength).collect(Collectors.toList()).forEach(sentence::removeWord);
sentence.getWords().stream().skip(truncateMaxLength).collect(Collectors.toList()).forEach(sentence::removeWord);
}
if (filterLemma)
{
tagger.tag(sentence.getWords());
}
tagger.tag(sentence.getWords());
sentences.add(sentence);
if (sentences.size() >= batchSize)
{
Expand All @@ -113,7 +121,7 @@ private void decodeSentenceBatch(List<Sentence> sentences) throws IOException
for (Word word : sentence.getWords())
{
writer.write(word.getValue().replace("|", "/"));
if (word.hasAnnotation("lemma") && word.hasAnnotation("pos") && word.hasAnnotation("wsd"))
if (/*word.hasAnnotation("lemma") && word.hasAnnotation("pos") && */ word.hasAnnotation("wsd"))
{
writer.write("|" + word.getAnnotationValue("wsd"));
}
Expand Down
40 changes: 22 additions & 18 deletions java/src/main/java/NeuralWSDDecodeUFSAC.java
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
import getalp.wsd.common.wordnet.WordnetHelper;
import getalp.wsd.method.Disambiguator;
import getalp.wsd.method.FirstSenseDisambiguator;
import getalp.wsd.method.neural.NeuralDisambiguator;
import getalp.wsd.ufsac.core.Sentence;
import getalp.wsd.ufsac.streaming.modifier.StreamingCorpusModifierSentence;
import getalp.wsd.ufsac.utils.CorpusPOSTaggerAndLemmatizer;
import getalp.wsd.utils.ArgumentParser;
import getalp.wsd.common.utils.ArgumentParser;
import getalp.wsd.utils.WordnetUtils;
import getalp.wsd.common.utils.Wrapper;

import java.util.List;

public class NeuralWSDDecodeUFSAC
Expand All @@ -18,9 +20,11 @@ public static void main(String[] args) throws Exception
parser.addArgumentList("weights");
parser.addArgument("input");
parser.addArgument("output");
parser.addArgument("lowercase", "true");
parser.addArgument("lowercase", "false");
parser.addArgument("sense_reduction", "true");
parser.addArgument("lemma_pos_tagged", "false");
parser.addArgument("clear_text", "true");
parser.addArgument("batch_size", "1");
parser.addArgument("mfs_backoff", "true");
if (!parser.parse(args)) return;

String pythonPath = parser.getArgValue("python_path");
Expand All @@ -30,32 +34,32 @@ public static void main(String[] args) throws Exception
String outputPath = parser.getArgValue("output");
boolean lowercase = parser.getArgValueBoolean("lowercase");
boolean senseReduction = parser.getArgValueBoolean("sense_reduction");
boolean lemmaPOSTagged = parser.getArgValueBoolean("lemma_pos_tagged");
boolean clearText = parser.getArgValueBoolean("clear_text");
int batchSize = parser.getArgValueInteger("batch_size");
boolean mfsBackoff = parser.getArgValueBoolean("mfs_backoff");

Wrapper<CorpusPOSTaggerAndLemmatizer> lemmaPOSTagger = new Wrapper<>(null);
if (!lemmaPOSTagged)
{
lemmaPOSTagger.obj = new CorpusPOSTaggerAndLemmatizer();
}
NeuralDisambiguator disambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights);
disambiguator.lowercaseWords = lowercase;
if (senseReduction) disambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30());
else disambiguator.reducedOutputVocabulary = null;
CorpusPOSTaggerAndLemmatizer tagger = new CorpusPOSTaggerAndLemmatizer();
Disambiguator firstSenseDisambiguator = new FirstSenseDisambiguator(WordnetHelper.wn30());
NeuralDisambiguator neuralDisambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights, clearText, batchSize);
neuralDisambiguator.lowercaseWords = lowercase;
if (senseReduction) neuralDisambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30());
else neuralDisambiguator.reducedOutputVocabulary = null;

StreamingCorpusModifierSentence modifier = new StreamingCorpusModifierSentence()
{
public void modifySentence(Sentence sentence)
{
if (lemmaPOSTagger.obj != null)
tagger.tag(sentence.getWords());
neuralDisambiguator.disambiguate(sentence, "wsd");
if (mfsBackoff)
{
lemmaPOSTagger.obj.tag(sentence.getWords());
firstSenseDisambiguator.disambiguate(sentence, "wsd");
}
disambiguator.disambiguate(sentence, "wsd");
}
};

modifier.load(inputPath, outputPath);
disambiguator.close();
neuralDisambiguator.close();
}
}

3 changes: 3 additions & 0 deletions java/src/main/java/NeuralWSDPrepare.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -27,6 +27,7 @@ public static void main(String[] args) throws Exception
parser.addArgument("exclude_line_length", "150");
parser.addArgument("line_length_tokenizer", "null");
parser.addArgument("lowercase", "false");
parser.addArgument("filter_lemma", "true");
parser.addArgument("uniform_dash", "false");
parser.addArgument("sense_compression_hypernyms", "true");
parser.addArgument("sense_compression_instance_hypernyms", "false");
Expand All @@ -53,6 +54,7 @@ public static void main(String[] args) throws Exception
int outputFeatureVocabularyLimit = parser.getArgValueInteger("output_feature_vocabulary_limit");
int maxLineLength = parser.getArgValueInteger("truncate_line_length");
boolean lowercase = parser.getArgValueBoolean("lowercase");
boolean filterLemma = parser.getArgValueBoolean("filter_lemma");
boolean uniformDash = parser.getArgValueBoolean("uniform_dash");
boolean senseCompressionHypernyms = parser.getArgValueBoolean("sense_compression_hypernyms");
boolean senseCompressionInstanceHypernyms = parser.getArgValueBoolean("sense_compression_instance_hypernyms");
Expand Down Expand Up @@ -124,6 +126,7 @@ public static void main(String[] args) throws Exception

preparator.maxLineLength = maxLineLength;
preparator.lowercaseWords = lowercase;
preparator.filterLemma = filterLemma;
preparator.uniformDash = uniformDash;
preparator.multisenses = false;
preparator.removeAllCoarseGrained = true;
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -116,6 +116,8 @@ public class NeuralDataPreparator

public boolean lowercaseWords = true;

public boolean filterLemma = true;

public boolean addWordKeyFromSenseKey = false;

public boolean uniformDash = false;
Expand Down
164 changes: 0 additions & 164 deletions java/src/main/java/getalp/wsd/utils/ArgumentParser.java

This file was deleted.

0 comments on commit f4e244e

Please sign in to comment.