From 69089b4ad1aa13b761f0ab579dae5a222e33119e Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Wed, 3 Jul 2019 12:44:18 -0500 Subject: [PATCH 1/3] Added pruneUnusedLabels to SparseNetworkLearner; this method will discard any classifiers for labels not in the list passed in. This is intended to allow users to discard classifiers for labels they are not interested in, but may have unintended side effects, like GPE labels being classified as PERSON. --- lbjava-examples/pom.xml | 8 ++-- lbjava-mvn-plugin/pom.xml | 4 +- lbjava/pom.xml | 2 +- .../lbjava/learn/SparseNetworkLearner.java | 40 ++++++++++++++++++- .../SparseNetworkLearningPruneTest.java | 1 - pom.xml | 2 +- 6 files changed, 47 insertions(+), 10 deletions(-) diff --git a/lbjava-examples/pom.xml b/lbjava-examples/pom.xml index 804e89c8..f0bef592 100755 --- a/lbjava-examples/pom.xml +++ b/lbjava-examples/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.2 4.0.0 @@ -27,12 +27,12 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.1 + 1.3.2 @@ -63,7 +63,7 @@ edu.illinois.cs.cogcomp lbjava-maven-plugin - 1.3.1 + 1.3.2 ${project.basedir}/src/main/java ${project.basedir}/target/classes diff --git a/lbjava-mvn-plugin/pom.xml b/lbjava-mvn-plugin/pom.xml index 71bfa199..c0db736f 100644 --- a/lbjava-mvn-plugin/pom.xml +++ b/lbjava-mvn-plugin/pom.xml @@ -5,7 +5,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.2 lbjava-maven-plugin @@ -76,7 +76,7 @@ edu.illinois.cs.cogcomp LBJava - 1.3.1 + 1.3.2 jar compile diff --git a/lbjava/pom.xml b/lbjava/pom.xml index 13a58757..152e297d 100644 --- a/lbjava/pom.xml +++ b/lbjava/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.1 + 1.3.2 4.0.0 diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java index 822fc1fd..d89bd59f 100644 --- 
a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/SparseNetworkLearner.java @@ -8,9 +8,9 @@ package edu.illinois.cs.cogcomp.lbjava.learn; import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; -import java.util.Map.Entry; import edu.illinois.cs.cogcomp.core.datastructures.vectors.ExceptionlessInputStream; import edu.illinois.cs.cogcomp.core.datastructures.vectors.ExceptionlessOutputStream; @@ -688,6 +688,44 @@ public void read(ExceptionlessInputStream in) { for (int i = 0; i < N; ++i) network.add(Learner.readLearner(in)); } + + /** + * This method will discard learners not associated with the provided labels. For labels that are + * not needed at runtime, this would improve performance as well as memory footprint. For example, + * imagine you have a 4-class model, PER, ORG, LOC and OTHER, but you do not care about OTHER. + * In this case, you could eliminate that label and improve the performance of the model proportionally. + *

+ * Use of this feature may cause terms previously classified by a discarded classifier to be labeled + * as one of the remaining classes. + *

+ * @param keepers A list of the only labels to keep. + */ + public void pruneUnusedLabels(ArrayList keepers) { + int N = network.size(); + for (int i = 0; i < N; ++i) { + LinearThresholdUnit ltu = (LinearThresholdUnit) network.get(i); + if (ltu == null) + continue; + + // get the label and determine if it should be pruned. + String label = labelLexicon.lookupKey(i).getStringValue(); + if (label.length() > 2) { + // Take off the B-, I-, L- or U- + label = label.substring(2); + boolean keepit = false; + for (String checkme : keepers) { + if (label.equals(checkme)) { + keepit = true; + break; + } + } + if (!keepit) + network.set(i, null); + } else { + // keep other("O"), this is like a non-label to begin with. + } + } + } /** Returns a deep clone of this learning algorithm. */ public Object clone() { diff --git a/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java b/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java index e5202d51..e6e4aaf1 100644 --- a/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java +++ b/lbjava/src/test/java/edu/illinois/cs/cogcomp/lbjava/SparseNetworkLearningPruneTest.java @@ -16,5 +16,4 @@ public class SparseNetworkLearningPruneTest { @Test public void test() { } - } diff --git a/pom.xml b/pom.xml index db0a0881..d37c893e 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp lbjava-project pom - 1.3.1 + 1.3.2 lbjava From b6399c712a5c0a40867fa9640773f1f89668d608 Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Tue, 23 Jul 2019 10:40:45 -0500 Subject: [PATCH 2/3] Just a few performance optimizations. 
--- .../lbjava/classify/DiscretePrimitiveStringFeature.java | 5 ++--- .../java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java index 792d5032..0e1506ad 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/classify/DiscretePrimitiveStringFeature.java @@ -221,7 +221,7 @@ public Feature encode(String e) { * @return The hash code of this feature. **/ public int hashCode() { - return 31 * super.hashCode() + 17 * identifier.hashCode() + value.hashCode(); + return super.hashCode() + 17 * identifier.hashCode() + value.hashCode(); } @@ -237,8 +237,7 @@ public boolean equals(Object o) { return false; if (o instanceof DiscretePrimitiveStringFeature) { DiscretePrimitiveStringFeature f = (DiscretePrimitiveStringFeature) o; - return identifier.equals(f.identifier) && valueIndex > -1 ? 
valueIndex == f.valueIndex - : value.equals(f.value); + return identifier.equals(f.identifier) && value.equals(f.value); } DiscretePrimitiveFeature f = (DiscretePrimitiveFeature) o; diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java index bb55b6a7..4f753473 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/learn/Lexicon.java @@ -22,6 +22,7 @@ import edu.illinois.cs.cogcomp.lbjava.util.ClassUtils; import edu.illinois.cs.cogcomp.lbjava.util.FVector; import edu.illinois.cs.cogcomp.lbjava.util.TableFormat; +import gnu.trove.map.hash.THashMap; /** @@ -132,7 +133,7 @@ public static Lexicon readLexicon(ExceptionlessInputStream in, boolean readCount // Member variables. /** The map of features to integer keys. */ - protected Map lexicon; + protected Map lexicon; /** The inverted map of integer keys to their features. */ protected FVector lexiconInv; /** The encoding to use for new features added to this lexicon. */ @@ -182,7 +183,7 @@ public Lexicon(String e) { /** Clears the data structures associated with this instance. */ public void clear() { - lexicon = new HashMap(); + lexicon = new THashMap(); lexiconInv = new FVector(); lexiconChildren = null; pruneCutoff = -1; @@ -709,7 +710,7 @@ public Object clone() { } if (lexicon != null) { - clone.lexicon = new HashMap(); + clone.lexicon = new THashMap(); clone.lexicon.putAll(lexicon); } clone.lexiconInv = (FVector) lexiconInv.clone(); From faa4df57e6f0df345ad72d8b62c1bd31e264c49f Mon Sep 17 00:00:00 2001 From: "Thomas L. Redman" Date: Thu, 5 Dec 2019 20:31:49 -0600 Subject: [PATCH 3/3] ArrayFileParser never closed the ZipFile internally. 
--- lbjava-examples/pom.xml | 2 +- lbjava-mvn-plugin/pom.xml | 2 +- lbjava/pom.xml | 2 +- .../cs/cogcomp/lbjava/parse/ArrayFileParser.java | 13 ++++++++----- pom.xml | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/lbjava-examples/pom.xml b/lbjava-examples/pom.xml index f0bef592..a3f485f6 100755 --- a/lbjava-examples/pom.xml +++ b/lbjava-examples/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.2 + 1.3.3 4.0.0 diff --git a/lbjava-mvn-plugin/pom.xml b/lbjava-mvn-plugin/pom.xml index c0db736f..339b6ab8 100644 --- a/lbjava-mvn-plugin/pom.xml +++ b/lbjava-mvn-plugin/pom.xml @@ -5,7 +5,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.2 + 1.3.3 lbjava-maven-plugin diff --git a/lbjava/pom.xml b/lbjava/pom.xml index 152e297d..b6162f54 100644 --- a/lbjava/pom.xml +++ b/lbjava/pom.xml @@ -3,7 +3,7 @@ lbjava-project edu.illinois.cs.cogcomp - 1.3.2 + 1.3.3 4.0.0 diff --git a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java index cbae0aa4..ea163d86 100644 --- a/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java +++ b/lbjava/src/main/java/edu/illinois/cs/cogcomp/lbjava/parse/ArrayFileParser.java @@ -36,6 +36,8 @@ public class ArrayFileParser implements Parser { /** Reader for file currently being parsed. */ protected DataInputStream in; + /** The zip file must also be closed if this is a compressed file. */ + protected ZipFile zipFile=null; /** The name of the file to parse. */ protected String exampleFileName; /** A single array from which all examples can be parsed. 
*/ @@ -190,13 +192,11 @@ public void reset() { try { if (exampleFileName != null) { if (zipped) { - ZipFile zip = new ZipFile(exampleFileName); - in = - new DataInputStream(new BufferedInputStream(zip.getInputStream(zip + zipFile = new ZipFile(exampleFileName); + in = new DataInputStream(new BufferedInputStream(zipFile.getInputStream(zipFile .getEntry(ExceptionlessInputStream.zipEntryName)))); } else - in = - new DataInputStream(new BufferedInputStream(new FileInputStream( + in = new DataInputStream(new BufferedInputStream(new FileInputStream( exampleFileName))); } else if (zipped) { ZipInputStream zip = new ZipInputStream(new ByteArrayInputStream(exampleData)); @@ -218,6 +218,9 @@ public void close() { return; try { in.close(); + if (zipFile != null) { + zipFile.close(); + } } catch (Exception e) { System.err.println("Can't close '" + exampleFileName + "':"); e.printStackTrace(); diff --git a/pom.xml b/pom.xml index d37c893e..8fcd6b74 100644 --- a/pom.xml +++ b/pom.xml @@ -7,7 +7,7 @@ edu.illinois.cs.cogcomp lbjava-project pom - 1.3.2 + 1.3.3 lbjava