From b78bbafce12569150355ce9e05f48876a5fa48ae Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Thu, 25 May 2017 13:40:29 -0500 Subject: [PATCH 1/5] Version bump. --- pom.xml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 186fb68..2eae1e6 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ edu.umn.biomedicus biomedicus-gpl jar - 1.6.0 + 1.7.0-SNAPSHOT biomedicus-gpl BioMedICUS Annotation System - GPL Extensions @@ -31,19 +31,18 @@ UTF-8 UTF-8 - 1.6.0 edu.umn.biomedicus biomedicus-core - ${biomedicus.version} + 1.7.0-SNAPSHOT edu.umn.biomedicus biomedicus-uima - ${biomedicus.version} + 1.7.0-SNAPSHOT edu.stanford.nlp From 2b1703a759c31d037e2cf9d366699cbd47d7a7fe Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Thu, 25 May 2017 13:56:57 -0500 Subject: [PATCH 2/5] README update. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7d10633..610f9a0 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ The system is being developed by our biomedical NLP/IE program at the University This is a collaborative project that aims to serve biomedical and clinical researchers, allowing for customization with different texts. -This project is a collection of GPL-licensed extensions and utilities for the BioMedICUS system. +This project is a collection of GPL-licensed extensions and utilities for the BioMedICUS system. +Any extensions or use of the BioMedICUS project with this extension installed must be compliant with +the GPLv3 license. Wiki From 1f625a653f277168ada024b9163e9ffed9b10996 Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Tue, 11 Jul 2017 15:21:27 -0500 Subject: [PATCH 3/5] Google style guide update. --- pom.xml | 232 ++++---- src/assembly/descriptor.xml | 110 ++-- .../SHStanfordConstituencyParser.xml | 212 ++++---- .../desc/ae/annotator/SeverityClassifier.xml | 196 +++---- .../annotator/StanfordConstituencyParser.xml | 213 ++++---- .../ae/training/SeverityClassifierTrainer.xml | 284 +++++----- .../parser/SHStanfordConstituencyParser.java | 2 +- .../parser/StanfordConstituencyParser.java | 3 +- .../internal/docclass/SeverityClassifier.java | 4 +- .../docclass/SeverityClassifierModel.java | 163 +++--- .../docclass/SeverityClassifierTrainer.java | 147 ++--- .../docclass/SeverityTrainerProcessor.java | 39 -- .../docclass/SeverityWekaProcessor.java | 502 +++++++++--------- 13 files changed, 1054 insertions(+), 1053 deletions(-) delete mode 100644 src/main/java/edu/umn/biomedicus/internal/docclass/SeverityTrainerProcessor.java diff --git a/pom.xml b/pom.xml index 2eae1e6..17b043b 100644 --- a/pom.xml +++ b/pom.xml @@ -16,127 +16,127 @@ ~ along with this program. If not, see . --> - - 4.0.0 - edu.umn.biomedicus - biomedicus-gpl - jar - 1.7.0-SNAPSHOT + + 4.0.0 + edu.umn.biomedicus + biomedicus-gpl + jar + 1.7.0-SNAPSHOT - biomedicus-gpl - BioMedICUS Annotation System - GPL Extensions + biomedicus-gpl + BioMedICUS Annotation System - GPL Extensions - - UTF-8 - UTF-8 - + + UTF-8 + UTF-8 + - - - edu.umn.biomedicus - biomedicus-core - 1.7.0-SNAPSHOT - - - edu.umn.biomedicus - biomedicus-uima - 1.7.0-SNAPSHOT - - - edu.stanford.nlp - stanford-corenlp - 3.6.0 - - - nz.ac.waikato.cms.weka - weka-stable - 3.8.0 - - - com.google.code.findbugs - jsr305 - 3.0.0 - true - - + + + edu.umn.biomedicus + biomedicus-core + 1.7.0-SNAPSHOT + + + edu.umn.biomedicus + biomedicus-uima + 1.7.0-SNAPSHOT + + + edu.stanford.nlp + stanford-corenlp + 3.6.0 + + + nz.ac.waikato.cms.weka + weka-stable + 3.8.0 + + + com.google.code.findbugs + jsr305 + 3.0.0 + true + + - - - - src/main/resources - true - - **/*.xml - - - - src/main/resources - false - - **/*.xml - - - - - - maven-assembly-plugin - 2.5.2 - - - make-bundles - - single - - package - - - src/assembly/descriptor.xml - - - - - - - - - - maven-compiler-plugin - - 1.8 - 1.8 - - - - - + + + + src/main/resources + true + + **/*.xml + + + + src/main/resources + false + + **/*.xml + + + + + + maven-assembly-plugin + 2.5.2 + + + make-bundles + + single + + package + + + src/assembly/descriptor.xml + + + + + + + + + + maven-compiler-plugin + + 1.8 + 1.8 + + + + + - - University of Minnesota Institute for Health Informatics NLP/IE Program - http://www.bmhi.umn.edu/ihi/research/nlpie/index.htm - - - - Ben Knoll - benknoll@umn.edu - NLP/IE Group at the University of Minnesota Institute for Health Informatics - - http://www.bmhi.umn.edu/ihi/research/nlpie/index.htm - - - - - GNU General Public License, Version 3.0 (GPLv3) - https://www.gnu.org/licenses/gpl-3.0.en.html - repo - - - - https://github.com/NLPIE/BioMedICUS - scm:git:https://github.com/NLPIE/BioMedICUS.git - scm:git:https://github.com/NLPIE/BioMedICUS.git - + + University of Minnesota Institute for Health Informatics NLP/IE Program + http://www.bmhi.umn.edu/ihi/research/nlpie/index.htm + + + + Ben Knoll + benknoll@umn.edu + NLP/IE Group at the University of Minnesota Institute for Health Informatics + + http://www.bmhi.umn.edu/ihi/research/nlpie/index.htm + + + + + GNU General Public License, Version 3.0 (GPLv3) + https://www.gnu.org/licenses/gpl-3.0.en.html + repo + + + + https://github.com/NLPIE/BioMedICUS + scm:git:https://github.com/NLPIE/BioMedICUS.git + scm:git:https://github.com/NLPIE/BioMedICUS.git + \ No newline at end of file diff --git a/src/assembly/descriptor.xml b/src/assembly/descriptor.xml index 5fbe75e..dfbf380 100644 --- a/src/assembly/descriptor.xml +++ b/src/assembly/descriptor.xml @@ -15,59 +15,59 @@ ~ along with this program. If not, see . --> - - release - - zip - - false - - - true - true - lib - false - - - - - 0755 - 0755 - src/main/bin - bin - - **/* - - - - src/main/config - config - - **/* - - - - src/main/desc - desc - - **/* - - - - src/main/top - / - - **/* - - - - . - - LICENSE.txt - README.md - - - + + release + + zip + + false + + + true + true + lib + false + + + + + 0755 + 0755 + src/main/bin + bin + + **/* + + + + src/main/config + config + + **/* + + + + src/main/desc + desc + + **/* + + + + src/main/top + / + + **/* + + + + . + + LICENSE.txt + README.md + + + diff --git a/src/main/desc/ae/annotator/SHStanfordConstituencyParser.xml b/src/main/desc/ae/annotator/SHStanfordConstituencyParser.xml index a4a4e09..5bdfa48 100644 --- a/src/main/desc/ae/annotator/SHStanfordConstituencyParser.xml +++ b/src/main/desc/ae/annotator/SHStanfordConstituencyParser.xml @@ -17,114 +17,116 @@ --> - org.apache.uima.java - true - - - edu.umn.biomedicus.uima.adapter.DocumentProcessorRunnerAnnotator - - - - Social History Stanford Constituency Parser + org.apache.uima.java + true + + + edu.umn.biomedicus.uima.adapter.DocumentProcessorUimaAdapter + + + + Social History Stanford Constituency Parser + + Uses Stanford's Shift-reduce parser to parse social history candidates for constituency. + + ${project.version} + ${organization.name} + + + documentProcessor + The document processor class to instantiate. + String + true + + + viewName + The name of the UIMA view to use. + String + + + eagerLoad - Uses Stanford's Shift-reduce parser to parse social history candidates for constituency. + The name of any classes that need to be eagerly loaded by the Guice injector. Classes + which are + instances of LoadableDataModel will have the loadData method called. - ${project.version} - ${organization.name} - - - documentProcessor - The document processor class to instantiate. - String - true - - - viewName - The name of the UIMA view to use. - String - - - eagerLoad - - The name of any classes that need to be eagerly loaded by the Guice injector. Classes which are - instances of LoadableDataModel will have the loadData method called. - - String - true - false - - - postProcessors - - The class names of any post processors that should be run after all documents have been processed. - - String - true - false - - - - - documentProcessor - - edu.umn.biomedicus.gpl.stanford.parser.SHStanfordConstituencyParser - - - - viewName - - SystemView - - - - eagerLoad - - - edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel - - - - - - - - - - - - - - - - true - true - false - - + String + true + false + + + postProcessors + + The class names of any post processors that should be run after all documents have been + processed. + + String + true + false + + + + + documentProcessor + + edu.umn.biomedicus.gpl.stanford.parser.SHStanfordConstituencyParser + + + + viewName + + SystemView + + + + eagerLoad + + + edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel + + + + + + + + + + + + + + + + true + true + false + + - - - guiceInjector - The guice injector resource. - - + + + guiceInjector + The guice injector resource. + + - - - - guiceInjector - The guice resource. - - edu.umn.biomedicus.uima.adapter.GuiceInjector - - - - - - guiceInjector - guiceInjector - - - + + + + guiceInjector + The guice resource. + + edu.umn.biomedicus.uima.adapter.GuiceInjector + + + + + + guiceInjector + guiceInjector + + + diff --git a/src/main/desc/ae/annotator/SeverityClassifier.xml b/src/main/desc/ae/annotator/SeverityClassifier.xml index 4fa85d6..32f823f 100644 --- a/src/main/desc/ae/annotator/SeverityClassifier.xml +++ b/src/main/desc/ae/annotator/SeverityClassifier.xml @@ -17,104 +17,106 @@ --> - org.apache.uima.java - true - - - edu.umn.biomedicus.uima.adapter.DocumentProcessorRunnerAnnotator - - - - Severity Classifier - Severity Classifier - ${project.version} - ${organization.name} - - - documentProcessor - The document processor class to instantiate. - String - true - - - viewName - The name of the UIMA view to use. - String - - - eagerLoad - - The name of any classes that need to be eagerly loaded by the Guice injector. Classes which are - instances of LoadableDataModel will have the loadData method called. - - String - true - false - - - postProcessors - - The class names of any post processors that should be run after all documents have been processed. - - String - true - false - - - - - documentProcessor - - edu.umn.biomedicus.internal.docclass.SeverityClassifier - - - - viewName - - SystemView - - - - - - - - - - - - - - - true - true - false - - + org.apache.uima.java + true + + + edu.umn.biomedicus.uima.adapter.DocumentProcessorUimaAdapter + + + + Severity Classifier + Severity Classifier + ${project.version} + ${organization.name} + + + documentProcessor + The document processor class to instantiate. + String + true + + + viewName + The name of the UIMA view to use. + String + + + eagerLoad + + The name of any classes that need to be eagerly loaded by the Guice injector. Classes + which are + instances of LoadableDataModel will have the loadData method called. + + String + true + false + + + postProcessors + + The class names of any post processors that should be run after all documents have been + processed. + + String + true + false + + + + + documentProcessor + + edu.umn.biomedicus.internal.docclass.SeverityClassifier + + + + viewName + + SystemView + + + + + + + + + + + + + + + true + true + false + + - - - guiceInjector - The guice injector resource. - - + + + guiceInjector + The guice injector resource. + + - - - - guiceInjector - The guice resource. - - edu.umn.biomedicus.uima.adapter.GuiceInjector - - - - - - guiceInjector - guiceInjector - - - + + + + guiceInjector + The guice resource. + + edu.umn.biomedicus.uima.adapter.GuiceInjector + + + + + + guiceInjector + guiceInjector + + + diff --git a/src/main/desc/ae/annotator/StanfordConstituencyParser.xml b/src/main/desc/ae/annotator/StanfordConstituencyParser.xml index af9bb25..a3b0a12 100644 --- a/src/main/desc/ae/annotator/StanfordConstituencyParser.xml +++ b/src/main/desc/ae/annotator/StanfordConstituencyParser.xml @@ -17,112 +17,115 @@ --> - org.apache.uima.java - true - - - edu.umn.biomedicus.uima.adapter.DocumentProcessorRunnerAnnotator - - - - Stanford Constituency Parser - Uses Stanford's Shift-reduce parser to parse all sentences for constituency. - ${project.version} - ${organization.name} - - - documentProcessor - The document processor class to instantiate. - String - true - - - viewName - The name of the UIMA view to use. - String - - - eagerLoad - - The name of any classes that need to be eagerly loaded by the Guice injector. Classes which are - instances of LoadableDataModel will have the loadData method called. - - String - true - false - - - postProcessors - - The class names of any post processors that should be run after all documents have been processed. - - String - true - false - - - - - documentProcessor - - edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParser - - - - viewName - - SystemView - - - - eagerLoad - - - edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel - - - - - - - - - - - - - - - - true - true - false - - + org.apache.uima.java + true + + + edu.umn.biomedicus.uima.adapter.DocumentProcessorUimaAdapter + + + + Stanford Constituency Parser + Uses Stanford's Shift-reduce parser to parse all sentences for constituency. + + ${project.version} + ${organization.name} + + + documentProcessor + The document processor class to instantiate. + String + true + + + viewName + The name of the UIMA view to use. + String + + + eagerLoad + + The name of any classes that need to be eagerly loaded by the Guice injector. Classes + which are + instances of LoadableDataModel will have the loadData method called. + + String + true + false + + + postProcessors + + The class names of any post processors that should be run after all documents have been + processed. + + String + true + false + + + + + documentProcessor + + edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParser + + + + viewName + + SystemView + + + + eagerLoad + + + edu.umn.biomedicus.gpl.stanford.parser.StanfordConstituencyParserModel + + + + + + + + + + + + + + + + true + true + false + + - - - guiceInjector - The guice injector resource. - - + + + guiceInjector + The guice injector resource. + + - - - - guiceInjector - The guice resource. - - edu.umn.biomedicus.uima.adapter.GuiceInjector - - - - - - guiceInjector - guiceInjector - - - + + + + guiceInjector + The guice resource. + + edu.umn.biomedicus.uima.adapter.GuiceInjector + + + + + + guiceInjector + guiceInjector + + + diff --git a/src/main/desc/ae/training/SeverityClassifierTrainer.xml b/src/main/desc/ae/training/SeverityClassifierTrainer.xml index 7124b97..aa3201c 100644 --- a/src/main/desc/ae/training/SeverityClassifierTrainer.xml +++ b/src/main/desc/ae/training/SeverityClassifierTrainer.xml @@ -17,148 +17,150 @@ --> - org.apache.uima.java - true - - - edu.umn.biomedicus.uima.adapter.DocumentProcessorRunnerAnnotator - - - - Severity Trainer - Trains the severity classification model. - ${project.version} - ${organization.name} - - - docclass.severity.model.path - Where to write the model to. - String - false - - - docclass.stopwords.path - Location of stopwords file - String - false - - - documentProcessor - The document processor class to instantiate. - String - true - - - docclass.severity.attributesToKeep - The number of attributes to keep. - Integer - true - - - docclass.severity.minWordCount - Minimum word count - Integer - true - - - viewName - The name of the UIMA view to use. - String - - - eagerLoad - - The name of any classes that need to be eagerly loaded by the Guice injector. Classes which are - instances of LoadableDataModel will have the loadData method called. - - String - true - false - - - postProcessors - - The class names of any post processors that should be run after all documents have been processed. - - String - true - false - - - - - docclass.severity.attributesToKeep - - 1000 - - - - docclass.severity.minWordCount - - 2 - - - - documentProcessor - - edu.umn.biomedicus.internal.docclass.SeverityTrainerProcessor - - - - viewName - - SystemView - - - - postProcessors - - - edu.umn.biomedicus.internal.docclass.SeverityClassifierTrainer - - - - - - - - - - - - - - - - false - false - false - - + org.apache.uima.java + true + + + edu.umn.biomedicus.uima.adapter.DocumentProcessorUimaAdapter + + + + Severity Trainer + Trains the severity classification model. + ${project.version} + ${organization.name} + + + docclass.severity.model.path + Where to write the model to. + String + false + + + docclass.stopwords.path + Location of stopwords file + String + false + + + documentProcessor + The document processor class to instantiate. + String + true + + + docclass.severity.attributesToKeep + The number of attributes to keep. + Integer + true + + + docclass.severity.minWordCount + Minimum word count + Integer + true + + + viewName + The name of the UIMA view to use. + String + + + eagerLoad + + The name of any classes that need to be eagerly loaded by the Guice injector. Classes + which are + instances of LoadableDataModel will have the loadData method called. + + String + true + false + + + postProcessors + + The class names of any post processors that should be run after all documents have been + processed. + + String + true + false + + + + + docclass.severity.attributesToKeep + + 1000 + + + + docclass.severity.minWordCount + + 2 + + + + documentProcessor + + edu.umn.biomedicus.internal.docclass.SeverityTrainerProcessor + + + + viewName + + SystemView + + + + postProcessors + + + edu.umn.biomedicus.internal.docclass.SeverityClassifierTrainer + + + + + + + + + + + + + + + + false + false + false + + - - - guiceInjector - The guice injector resource. - - + + + guiceInjector + The guice injector resource. + + - - - - guiceInjector - The guice resource. - - edu.umn.biomedicus.uima.adapter.GuiceInjector - - - - - - guiceInjector - guiceInjector - - - + + + + guiceInjector + The guice resource. + + edu.umn.biomedicus.uima.adapter.GuiceInjector + + + + + + guiceInjector + guiceInjector + + + diff --git a/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/SHStanfordConstituencyParser.java b/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/SHStanfordConstituencyParser.java index 8f7b1a3..61a5655 100644 --- a/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/SHStanfordConstituencyParser.java +++ b/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/SHStanfordConstituencyParser.java @@ -51,7 +51,7 @@ public SHStanfordConstituencyParser( } @Override - public void process() throws BiomedicusException { + public void process(Document document) throws BiomedicusException { for (Label label : labelIndex) { stanfordConstituencyParserModel.parseSentence(label, parseTokenLabelIndex, partOfSpeechLabelIndex, constituencyParseLabeler); diff --git a/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/StanfordConstituencyParser.java b/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/StanfordConstituencyParser.java index a48988a..0b40239 100644 --- a/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/StanfordConstituencyParser.java +++ b/src/main/java/edu/umn/biomedicus/gpl/stanford/parser/StanfordConstituencyParser.java @@ -23,6 +23,7 @@ import edu.umn.biomedicus.common.types.text.Sentence; import edu.umn.biomedicus.exc.BiomedicusException; import edu.umn.biomedicus.framework.DocumentProcessor; +import edu.umn.biomedicus.framework.store.Document; import edu.umn.biomedicus.framework.store.Label; import edu.umn.biomedicus.framework.store.LabelIndex; import edu.umn.biomedicus.framework.store.Labeler; @@ -48,7 +49,7 @@ public StanfordConstituencyParser(TextView textView, } @Override - public void process() throws BiomedicusException { + public void process(Document document) throws BiomedicusException { for (Label sentenceLabel : sentenceLabelIndex) { stanfordConstituencyParserModel.parseSentence(sentenceLabel, parseTokenLabelIndex, partOfSpeechLabelIndex, constituencyParseLabeler); diff --git a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifier.java b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifier.java index d4ba811..fee8d5c 100644 --- a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifier.java +++ b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifier.java @@ -38,8 +38,8 @@ public SeverityClassifier(SeverityClassifierModel severityClassifierModel, Docum } @Override - public void process() throws BiomedicusException { + public void process(Document document) throws BiomedicusException { String prediction = severityClassifierModel.predict(textView); - document.putMetadata("Severity", prediction); + this.document.putMetadata("Severity", prediction); } } diff --git a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierModel.java b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierModel.java index 98a747e..886e62c 100644 --- a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierModel.java +++ b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierModel.java @@ -24,16 +24,15 @@ import edu.umn.biomedicus.exc.BiomedicusException; import edu.umn.biomedicus.framework.DataLoader; import edu.umn.biomedicus.framework.store.TextView; -import weka.classifiers.Classifier; -import weka.core.Instance; -import weka.filters.Filter; - import java.io.FileInputStream; import java.io.ObjectInputStream; import java.io.Serializable; import java.nio.file.Path; import java.util.HashMap; import java.util.Map; +import weka.classifiers.Classifier; +import weka.core.Instance; +import weka.filters.Filter; /** * Classify documents based on symptom severity @@ -43,93 +42,95 @@ */ @ProvidedBy(SeverityClassifierModel.Loader.class) public class SeverityClassifierModel implements Serializable { - // For unknown classes (test data or poorly formatted training data) - static final String UNK = "unknown"; - private final Classifier classifier; - private final Filter attSel; - private final SeverityWekaProcessor severityWekaProcessor; - private final Map severityMap; + // For unknown classes (test data or poorly formatted training data) + static final String UNK = "unknown"; + private final Classifier classifier; + private final Filter attSel; + private final SeverityWekaProcessor severityWekaProcessor; - /** - * Initialize this model - * All training happens in the trainer; just store what we need to keep for classification - * @param classifier a Weka Classifier object - * @param attSel an attribute selection object - * @param severityWekaProcessor a processor to convert Document objects into Weka Instance objects - * @throws BiomedicusException - */ - SeverityClassifierModel(Classifier classifier, - Filter attSel, - SeverityWekaProcessor severityWekaProcessor) throws BiomedicusException { - severityMap = new HashMap<>(); - severityMap.put(0., "ABSENT"); - severityMap.put(1., "MILD"); - severityMap.put(2., "MODERATE"); - severityMap.put(3., "SEVERE"); - severityMap.put(4., UNK); - this.classifier = classifier; - this.attSel = attSel; - this.severityWekaProcessor = severityWekaProcessor; - } + private final Map severityMap; + + /** + * Initialize this model + * All training happens in the trainer; just store what we need to keep for classification + * + * @param classifier a Weka Classifier object + * @param attSel an attribute selection object + * @param severityWekaProcessor a processor to convert Document objects into Weka Instance + * objects + */ + SeverityClassifierModel(Classifier classifier, + Filter attSel, + SeverityWekaProcessor severityWekaProcessor) throws BiomedicusException { + severityMap = new HashMap<>(); + severityMap.put(0., "ABSENT"); + severityMap.put(1., "MILD"); + severityMap.put(2., "MODERATE"); + severityMap.put(3., "SEVERE"); + severityMap.put(4., UNK); + this.classifier = classifier; + this.attSel = attSel; + this.severityWekaProcessor = severityWekaProcessor; + } - /** - * Perform attribute selection and then classification using the stored Weka objects - * Where classes are tied, err on the side of higher class - * @param textView the textView - * @return a string (from the predefined classes) representing this textView's symptom severity - * @throws BiomedicusException - */ - public String predict(TextView textView) throws BiomedicusException { - Instance inst = severityWekaProcessor.getTestData(textView); - double result; - try { - if(attSel.input(inst)) { - inst = attSel.output(); - double[] dist = classifier.distributionForInstance(inst); - result=-1; - double max=-Double.MAX_VALUE; - for(int i=0; i= max) { - max = dist[i]; - result = i; - } - } - } else { - throw new Exception(); - } - } catch(Exception e) { - throw new BiomedicusException(); + /** + * Perform attribute selection and then classification using the stored Weka objects + * Where classes are tied, err on the side of higher class + * + * @param textView the textView + * @return a string (from the predefined classes) representing this textView's symptom severity + */ + public String predict(TextView textView) throws BiomedicusException { + Instance inst = severityWekaProcessor.getTestData(textView); + double result; + try { + if (attSel.input(inst)) { + inst = attSel.output(); + double[] dist = classifier.distributionForInstance(inst); + result = -1; + double max = -Double.MAX_VALUE; + for (int i = 0; i < dist.length; i++) { + if (dist[i] >= max) { + max = dist[i]; + result = i; + } } - return severityMap.get(result); + } else { + throw new Exception(); + } + } catch (Exception e) { + throw new BiomedicusException(); } + return severityMap.get(result); + } - public String getMetadataKey() { - return "Severity"; - } + public String getMetadataKey() { + return "Severity"; + } - /** - * Load a serialized model - */ - @ProcessorScoped - static class Loader extends DataLoader { + /** + * Load a serialized model + */ + @ProcessorScoped + static class Loader extends DataLoader { - private final Path modelPath; + private final Path modelPath; - @Inject - public Loader(@ProcessorSetting("docclass.severity.model.path") Path modelPath) { - this.modelPath = modelPath; - } + @Inject + public Loader(@ProcessorSetting("docclass.severity.model.path") Path modelPath) { + this.modelPath = modelPath; + } - @Override - protected SeverityClassifierModel loadModel() throws BiomedicusException { - try { - ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath.toFile())); - return (SeverityClassifierModel) ois.readObject(); - } catch(Exception e) { - throw new BiomedicusException(); - } - } + @Override + protected SeverityClassifierModel loadModel() throws BiomedicusException { + try { + ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelPath.toFile())); + return (SeverityClassifierModel) ois.readObject(); + } catch (Exception e) { + throw new BiomedicusException(); + } } + } } diff --git a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierTrainer.java b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierTrainer.java index 2a92c1b..74b992b 100644 --- a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierTrainer.java +++ b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityClassifierTrainer.java @@ -20,10 +20,19 @@ import com.google.inject.Inject; import edu.umn.biomedicus.annotations.ProcessorScoped; import edu.umn.biomedicus.annotations.ProcessorSetting; +import edu.umn.biomedicus.common.StandardViews; import edu.umn.biomedicus.exc.BiomedicusException; -import edu.umn.biomedicus.framework.PostProcessor; +import edu.umn.biomedicus.framework.Aggregator; import edu.umn.biomedicus.framework.store.Document; import edu.umn.biomedicus.framework.store.TextView; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import weka.attributeSelection.ASEvaluation; @@ -36,15 +45,6 @@ import weka.filters.Filter; import weka.filters.unsupervised.attribute.Remove; -import javax.annotation.Nullable; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Set; -import java.util.stream.Collectors; - /** * Train a Weka model to classify documents according to symptom severity * Created for the 2016 i2b2 NLP Shared Task @@ -52,78 +52,79 @@ * @author Greg Finley */ @ProcessorScoped -public class SeverityClassifierTrainer implements PostProcessor { +public class SeverityClassifierTrainer implements Aggregator { - private static final Logger LOGGER = LoggerFactory.getLogger(SeverityClassifierTrainer.class); + private static final Logger LOGGER = LoggerFactory.getLogger(SeverityClassifierTrainer.class); - private final Path outPath; - private final SeverityWekaProcessor wekaProcessor; - private final int attributesToKeep; + private final Path outPath; + private final SeverityWekaProcessor wekaProcessor; + private final int attributesToKeep; - /** - * Initialize this trainer. If the stopwords file is not present or can't be read from, trainer will still work - * @param outPath the path to write the model to - * @param stopWordsPath path to a stopwords file - */ - @Inject - public SeverityClassifierTrainer(@ProcessorSetting("docclass.severity.output.path") Path outPath, - @ProcessorSetting("docclass.stopwords.path") @Nullable Path stopWordsPath, - @ProcessorSetting("docclass.severity.attributesToKeep") Integer attributesToKeep, - @ProcessorSetting("docclass.severity.minWordCount") Integer minWordCount) { - Set stopWords = null; - if(stopWordsPath != null) { - try { - stopWords = Files.lines(stopWordsPath).collect(Collectors.toSet()); - } catch (IOException e) { - LOGGER.warn("Could not load stopwords file; will not exclude stopwords"); - } - } - this.outPath = outPath; - this.attributesToKeep = attributesToKeep; - wekaProcessor = new SeverityWekaProcessor(stopWords, minWordCount, true); + /** + * Initialize this trainer. If the stopwords file is not present or can't be read from, trainer + * will still work + * + * @param outPath the path to write the model to + * @param stopWordsPath path to a stopwords file + */ + @Inject + public SeverityClassifierTrainer(@ProcessorSetting("docclass.severity.output.path") Path outPath, + @ProcessorSetting("docclass.stopwords.path") @Nullable Path stopWordsPath, + @ProcessorSetting("docclass.severity.attributesToKeep") Integer attributesToKeep, + @ProcessorSetting("docclass.severity.minWordCount") Integer minWordCount) { + Set stopWords = null; + if (stopWordsPath != null) { + try { + stopWords = Files.lines(stopWordsPath).collect(Collectors.toSet()); + } catch (IOException e) { + LOGGER.warn("Could not load stopwords file; will not exclude stopwords"); + } } + this.outPath = outPath; + this.attributesToKeep = attributesToKeep; + wekaProcessor = new SeverityWekaProcessor(stopWords, minWordCount, true); + } - /** - * Add the document to the collection, which will be trained all at once at the end - * @param textView a document - */ - public void processDocument(TextView textView) { - wekaProcessor.addTrainingDocument(textView); - } + @Override + public void addDocument(Document document) throws BiomedicusException { + TextView textView = document.getTextView(StandardViews.ORIGINAL_DOCUMENT) + .orElseThrow(() -> new BiomedicusException("No original document view")); + wekaProcessor.addTrainingDocument(textView); + } - @Override - public void afterProcessing() throws BiomedicusException { - Instances trainSet = wekaProcessor.getTrainingData(); - Classifier classifier = new SMO(); - AttributeSelection sel = new AttributeSelection(); - ASEvaluation infogain = new InfoGainAttributeEval(); - Ranker ranker = new Ranker(); - Remove remove = new Remove(); + @Override + public void done() throws BiomedicusException { + Instances trainSet = wekaProcessor.getTrainingData(); + Classifier classifier = new SMO(); + AttributeSelection sel = new AttributeSelection(); + ASEvaluation infogain = new InfoGainAttributeEval(); + Ranker ranker = new Ranker(); + Remove remove = new Remove(); - ranker.setNumToSelect(attributesToKeep); - sel.setEvaluator(infogain); - sel.setSearch(ranker); + ranker.setNumToSelect(attributesToKeep); + sel.setEvaluator(infogain); + sel.setSearch(ranker); - try { - sel.SelectAttributes(trainSet); - int[] selected = sel.selectedAttributes(); - remove.setInvertSelection(true); - remove.setAttributeIndicesArray(selected); - remove.setInputFormat(trainSet); - trainSet = Filter.useFilter(trainSet, remove); - classifier.buildClassifier(trainSet); - } catch (Exception e) { - throw new BiomedicusException(); - } + try { + sel.SelectAttributes(trainSet); + int[] selected = sel.selectedAttributes(); + remove.setInvertSelection(true); + remove.setAttributeIndicesArray(selected); + remove.setInputFormat(trainSet); + trainSet = Filter.useFilter(trainSet, remove); + classifier.buildClassifier(trainSet); + } catch (Exception e) { + throw new BiomedicusException(); + } - SeverityClassifierModel model = new SeverityClassifierModel(classifier, remove, wekaProcessor); + SeverityClassifierModel model = new SeverityClassifierModel(classifier, remove, wekaProcessor); - try { - ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outPath.toFile())); - oos.writeObject(model); - oos.close(); - } catch(IOException e) { - throw new BiomedicusException(); - } + try { + ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outPath.toFile())); + oos.writeObject(model); + oos.close(); + } catch (IOException e) { + throw new BiomedicusException(); } + } } diff --git a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityTrainerProcessor.java b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityTrainerProcessor.java deleted file mode 100644 index 787a0c9..0000000 --- a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityTrainerProcessor.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2016 Regents of the University of Minnesota - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -package edu.umn.biomedicus.internal.docclass; - -import com.google.inject.Inject; -import edu.umn.biomedicus.exc.BiomedicusException; -import edu.umn.biomedicus.framework.DocumentProcessor; -import edu.umn.biomedicus.framework.store.TextView; - -public class SeverityTrainerProcessor implements DocumentProcessor { - private final SeverityClassifierTrainer severityClassifierTrainer; - private final TextView textView; - - @Inject - public SeverityTrainerProcessor(SeverityClassifierTrainer severityClassifierTrainer, TextView textView) { - this.severityClassifierTrainer = severityClassifierTrainer; - this.textView = textView; - } - - @Override - public void process() throws BiomedicusException { - severityClassifierTrainer.processDocument(textView); - } -} diff --git a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityWekaProcessor.java b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityWekaProcessor.java index 74db22e..fc36bab 100644 --- a/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityWekaProcessor.java +++ b/src/main/java/edu/umn/biomedicus/internal/docclass/SeverityWekaProcessor.java @@ -17,278 +17,306 @@ package edu.umn.biomedicus.internal.docclass; -import edu.umn.biomedicus.framework.store.Document; import edu.umn.biomedicus.framework.store.TextView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import weka.core.*; - -import javax.annotation.Nullable; import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import javax.annotation.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import weka.core.Attribute; +import weka.core.DenseInstance; +import weka.core.Instance; +import weka.core.Instances; +import weka.core.SparseInstance; /** - * Text processing used for the symptom severity annotator, as written for the 2016 i2b2 NLP Shared Task - * Currently works on raw document text; could be modified to work on richer data (i.e., biomedicus's NLP results) + * Text processing used for the symptom severity annotator, as written for the 2016 i2b2 NLP Shared + * Task Currently works on raw document text; could be modified to work on richer data (i.e., + * biomedicus's NLP results) * * @author Greg Finley */ class SeverityWekaProcessor implements Serializable { - private static final Logger LOGGER = LoggerFactory.getLogger(SeverityWekaProcessor.class); - // Build this incrementally with each added document - private Instances trainingTextInstances; + private static final Logger LOGGER = LoggerFactory.getLogger(SeverityWekaProcessor.class); + // de-weight instances in the 'absent' and 'mild' classes to deal with sparsity issues + private final double[] byClassWeights = {1, .3, 3, 3}; + private final List classValues; + private final boolean sortWordsByDescendingFreq; + private final int minTermCount; + private final Set stopWords; + private final Attribute classAttribute; + // Created specifically for the i2b2-format XML files + private final Pattern fileTextPattern = Pattern + .compile("\\|(.*)\\[report_end\\]", Pattern.DOTALL); + private final Pattern scorePattern = Pattern.compile("score=\"(\\w+)\""); + private final Pattern annotatedBy = Pattern.compile("annotated_by=\"(.)\""); + // Build this incrementally with each added document + private Instances trainingTextInstances; + private Map dictionary; + // Empty Instances objects used to maintain consistent format between individual Instance objects + private Instances textTemplate; + private Instances vectorTemplate; - // de-weight instances in the 'absent' and 'mild' classes to deal with sparsity issues - private final double[] byClassWeights = {1, .3, 3, 3}; + /** + * Initialize this processor + * + * @param stopWords an optional set of words to exclude from the vector space + * @param minTermCount minimum number of occurrences to use a term in the vector space (2 is a + * good value) + * @param sortWordsByDescendingFreq whether to sort words by global frequency (this helps + * attribute selection) + */ + SeverityWekaProcessor(@Nullable Set stopWords, int minTermCount, + boolean sortWordsByDescendingFreq) { + this.stopWords = stopWords == null ? new HashSet<>() : stopWords; + this.sortWordsByDescendingFreq = sortWordsByDescendingFreq; + this.minTermCount = minTermCount; - private final List classValues; + ArrayList textInstanceAttributes = new ArrayList<>(); + classValues = Arrays + .asList("ABSENT", "MILD", "MODERATE", "SEVERE", SeverityClassifierModel.UNK); + classAttribute = new Attribute("_class", classValues); + textInstanceAttributes.add(classAttribute); + textInstanceAttributes.add(new Attribute("text", (List) null)); - private final boolean sortWordsByDescendingFreq; - private final int minTermCount; - private final Set stopWords; - private Map dictionary; + textTemplate = new Instances("textTemplate", textInstanceAttributes, 0); + textTemplate.setClassIndex(0); - // Empty Instances objects used to maintain consistent format between individual Instance objects - private Instances textTemplate; - private Instances vectorTemplate; - private final Attribute classAttribute; + trainingTextInstances = new Instances(textTemplate); + } - // Created specifically for the i2b2-format XML files - private final Pattern fileTextPattern = Pattern.compile("\\|(.*)\\[report_end\\]", Pattern.DOTALL); - private final Pattern scorePattern = Pattern.compile("score=\"(\\w+)\""); - private final Pattern annotatedBy = Pattern.compile("annotated_by=\"(.)\""); + /** + * Once all documents have been passed, vectorize the text and return the real-valued feature + * vectors + * + * @return Instances containing all training data + */ + Instances getTrainingData() { + buildDictionary(trainingTextInstances); + return vectorizeInstances(trainingTextInstances); + } - /** - * Initialize this processor - * @param stopWords an optional set of words to exclude from the vector space - * @param minTermCount minimum number of occurrences to use a term in the vector space (2 is a good value) - * @param sortWordsByDescendingFreq whether to sort words by global frequency (this helps attribute selection) - */ - SeverityWekaProcessor(@Nullable Set stopWords, int minTermCount, boolean sortWordsByDescendingFreq) { - this.stopWords = stopWords == null ? new HashSet<>() : stopWords; - this.sortWordsByDescendingFreq = sortWordsByDescendingFreq; - this.minTermCount = minTermCount; - - ArrayList textInstanceAttributes = new ArrayList<>(); - classValues = Arrays.asList("ABSENT", "MILD", "MODERATE", "SEVERE", SeverityClassifierModel.UNK); - classAttribute = new Attribute("_class", classValues); - textInstanceAttributes.add(classAttribute); - textInstanceAttributes.add(new Attribute("text", (List) null)); + /** + * Add a document for training. Will extract this doc's text but will not train on it until + * getTrainingData called + * + * @param textView a document + */ + void addTrainingDocument(TextView textView) { + Instance trainingInstance = getTextInstance(textView.getText()); + if (trainingInstance != null) { + trainingTextInstances.add(trainingInstance); + } + } - textTemplate = new Instances("textTemplate", textInstanceAttributes, 0); - textTemplate.setClassIndex(0); + /** + * Convert a document into a vector instance. buildDictionary() needs to have been run. + * + * @param textView a document + * @return an Instance with real-valued data + */ + Instance getTestData(TextView textView) { + Instance textInstance = getTextInstance(textView.getText()); + Instance vectorInstance = vectorizeInstance(textInstance); + vectorInstance.setDataset(vectorTemplate); + return vectorInstance; + } - trainingTextInstances = new Instances(textTemplate); + /** + * Process the text and class of this document and put it into an Instance + * + * @param docText raw text from the file (assumed XML format) + * @return an Instance with two attributes: class, and doctext + */ + @Nullable + private Instance getTextInstance(String docText) { + String fileText; + String docClass; + double weight = 1; + Matcher matcher = fileTextPattern.matcher(docText); + if (matcher.find()) { + fileText = matcher.group(1); + } else { + fileText = docText; } - - /** - * Once all documents have been passed, vectorize the text and return the real-valued feature vectors - * @return Instances containing all training data - */ - Instances getTrainingData() { - buildDictionary(trainingTextInstances); - return vectorizeInstances(trainingTextInstances); + fileText = processText(fileText); + matcher = scorePattern.matcher(docText); + if (matcher.find() && classValues.contains(docClass = matcher.group(1))) { + weight *= byClassWeights[classValues.indexOf(docClass)]; + } else { + docClass = SeverityClassifierModel.UNK; + if (dictionary == null) { + LOGGER.warn("Added document with unknown class during training; will ignore!"); + return null; + } } - - /** - * Add a document for training. Will extract this doc's text but will not train on it until getTrainingData called - * @param textView a document - */ - void addTrainingDocument(TextView textView) { - Instance trainingInstance = getTextInstance(textView.getText()); - if (trainingInstance != null) { - trainingTextInstances.add(trainingInstance); - } + // add the annotator as a word (this helps classification a little) + matcher = annotatedBy.matcher(docText); + if (matcher.find()) { + fileText += " thisNoteAnnotatedBy" + matcher.group(1); + if (matcher.group(1).equals("1")) { + weight /= 2; + } } - /** - * Convert a document into a vector instance. buildDictionary() needs to have been run. - * @param textView a document - * @return an Instance with real-valued data - */ - Instance getTestData(TextView textView) { - Instance textInstance = getTextInstance(textView.getText()); - Instance vectorInstance = vectorizeInstance(textInstance); - vectorInstance.setDataset(vectorTemplate); - return vectorInstance; - } + Instance inst = new DenseInstance(2); + inst.setDataset(textTemplate); + inst.setValue(0, docClass); + inst.attribute(1).addStringValue(fileText); + inst.setValue(1, fileText); + inst.setWeight(weight); + return inst; + } - /** - * Process the text and class of this document and put it into an Instance - * @param docText raw text from the file (assumed XML format) - * @return an Instance with two attributes: class, and doctext - */ - @Nullable - private Instance getTextInstance(String docText) { - String fileText; - String docClass; - double weight = 1; - Matcher matcher = fileTextPattern.matcher(docText); - if(matcher.find()) { - fileText = matcher.group(1); - } else { - fileText = docText; - } - fileText = processText(fileText); - matcher = scorePattern.matcher(docText); - if(matcher.find() && classValues.contains(docClass = matcher.group(1))) { - weight *= byClassWeights[classValues.indexOf(docClass)]; - } else { - docClass = SeverityClassifierModel.UNK; - if(dictionary == null) { - LOGGER.warn("Added document with unknown class during training; will ignore!"); - return null; - } - } - // add the annotator as a word (this helps classification a little) - matcher = annotatedBy.matcher(docText); - if(matcher.find()) { - fileText += " thisNoteAnnotatedBy" + matcher.group(1); - if (matcher.group(1).equals("1")) { - weight /= 2; - } - } + /** + * Prepare text for vectorization (lowercasing, fixing bad line breaks, rough tokenization) + * + * @param origText entire text of a document + * @return the processed text + */ + private String processText(String origText) { + String text = fixTableRows(origText); + text = text.toLowerCase(); + String[] words = text.split("\\W+"); + if (words.length > 0) { + StringBuilder builder = new StringBuilder(); + builder.append(words[0]); + for (int i = 1; i < words.length; i++) { + builder.append(" "); + builder.append(words[i]); + } + addBigrams(words, builder); + return builder.toString(); + } else { + LOGGER.warn("Empty document"); + return ""; + } + } - Instance inst = new DenseInstance(2); - inst.setDataset(textTemplate); - inst.setValue(0, docClass); - inst.attribute(1).addStringValue(fileText); - inst.setValue(1, fileText); - inst.setWeight(weight); - return inst; + /** + * Given a list of words and a StringBuilder, continue to build bigrams/trigrams/etc. onto the + * builder + * + * @param words array of words in their natural order + * @param builder an active StringBuilder + */ + private void addBigrams(String[] words, StringBuilder builder) { + for (int i = 1; i < words.length; i++) { + builder.append(" "); + builder.append(words[i - 1]); + builder.append("_"); + builder.append(words[i]); } + } - /** - * Prepare text for vectorization (lowercasing, fixing bad line breaks, rough tokenization) - * @param origText entire text of a document - * @return the processed text - */ - private String processText(String origText) { - String text = fixTableRows(origText); - text = text.toLowerCase(); - String[] words = text.split("\\W+"); - if(words.length > 0) { - StringBuilder builder = new StringBuilder(); - builder.append(words[0]); - for (int i = 1; i < words.length; i++) { - builder.append(" "); - builder.append(words[i]); - } - addBigrams(words, builder); - return builder.toString(); - } else { - LOGGER.warn("Empty document"); - return ""; + /** + * Fixes a problem common in the i2b2 text: sometimes new table lines start without any whitespace + * + * @param origText text with table problems + * @return text with line breaks inserted + */ + private String fixTableRows(String origText) { + String pattern = "(:.*)\n+(.*[^A-Z\\-\\( \\s/])([A-Z].*:)"; + String repl = "$1 $2\n$3"; + // have to run this a few times to be sure we get it all (adjacent ones won't both be matched) + String fixed = origText.replaceAll(pattern, repl); + fixed = fixed.replaceAll(pattern, repl); + fixed = fixed.replaceAll(pattern, repl); + return fixed; + } + + /** + * Builds a dictionary from known text and set the attributes for vector instances In current + * implementation, this is done all at once, not incrementally, since total word counts must be + * known This function must be called before vectorizing any text instances + * + * @param textInstances Instances containing text (whitespace-delimited words) + */ + private void buildDictionary(Instances textInstances) { + Map globalCounts = new LinkedHashMap<>(); + for (Instance inst : textInstances) { + String processed = inst.stringValue(1); + String[] words = processed.split("\\s+"); + for (String uni : words) { + if (!stopWords.contains(uni)) { + if (!globalCounts.containsKey(uni)) { + globalCounts.put(uni, 0); + } + globalCounts.put(uni, globalCounts.get(uni) + 1); } + } } - /** - * Given a list of words and a StringBuilder, continue to build bigrams/trigrams/etc. onto the builder - * @param words array of words in their natural order - * @param builder an active StringBuilder - */ - private void addBigrams(String[] words, StringBuilder builder) { - for(int i=1; i sortedWords = new ArrayList<>(); + sortedWords.addAll(globalCounts.keySet()); + if (sortWordsByDescendingFreq) { + sortedWords.sort((word1, word2) -> { + int cmp = Integer.compare(globalCounts.get(word2), globalCounts.get(word1)); + return cmp == 0 ? word2.compareTo(word1) : cmp; + }); } - - /** - * Fixes a problem common in the i2b2 text: sometimes new table lines start without any whitespace - * @param origText text with table problems - * @return text with line breaks inserted - */ - private String fixTableRows(String origText) { - String pattern = "(:.*)\n+(.*[^A-Z\\-\\( \\s/])([A-Z].*:)"; - String repl = "$1 $2\n$3"; - // have to run this a few times to be sure we get it all (adjacent ones won't both be matched) - String fixed = origText.replaceAll(pattern, repl); - fixed = fixed.replaceAll(pattern, repl); - fixed = fixed.replaceAll(pattern, repl); - return fixed; + dictionary = new HashMap<>(); + ArrayList vectorInstanceAttributes = new ArrayList<>(); + vectorInstanceAttributes.add(classAttribute); + for (String word : sortedWords) { + if (globalCounts.get(word) >= minTermCount) { + dictionary.put(word, dictionary.size()); + vectorInstanceAttributes.add(new Attribute(word)); + } } + vectorTemplate = new Instances("vectorTemplate", vectorInstanceAttributes, 0); + vectorTemplate.setClassIndex(0); + } - /** - * Builds a dictionary from known text and set the attributes for vector instances - * In current implementation, this is done all at once, not incrementally, since total word counts must be known - * This function must be called before vectorizing any text instances - * @param textInstances Instances containing text (whitespace-delimited words) - */ - private void buildDictionary(Instances textInstances) { - Map globalCounts = new LinkedHashMap<>(); - for (Instance inst : textInstances) { - String processed = inst.stringValue(1); - String[] words = processed.split("\\s+"); - for (String uni : words) { - if (!stopWords.contains(uni)) { - if (!globalCounts.containsKey(uni)) { - globalCounts.put(uni, 0); - } - globalCounts.put(uni, globalCounts.get(uni) + 1); - } - } - } - List sortedWords = new ArrayList<>(); - sortedWords.addAll(globalCounts.keySet()); - if (sortWordsByDescendingFreq) { - sortedWords.sort((word1, word2) -> { - int cmp = Integer.compare(globalCounts.get(word2), globalCounts.get(word1)); - return cmp == 0 ? word2.compareTo(word1) : cmp; - }); - } - dictionary = new HashMap<>(); - ArrayList vectorInstanceAttributes = new ArrayList<>(); - vectorInstanceAttributes.add(classAttribute); - for (String word : sortedWords) { - if (globalCounts.get(word) >= minTermCount) { - dictionary.put(word, dictionary.size()); - vectorInstanceAttributes.add(new Attribute(word)); - } - } - vectorTemplate = new Instances("vectorTemplate", vectorInstanceAttributes, 0); - vectorTemplate.setClassIndex(0); + /** + * Vectorize a bunch of text instances and put them into a single Instances object, probably to + * train a classifier + * + * @param textInstances Instances that have a class and text attribute + * @return Instances that have a class and many real-valued attributes + */ + private Instances vectorizeInstances(Instances textInstances) { + List listInstance = new ArrayList<>(); + for (Instance textInstance : textInstances) { + listInstance.add(vectorizeInstance(textInstance)); } - - /** - * Vectorize a bunch of text instances and put them into a single Instances object, probably to train a classifier - * @param textInstances Instances that have a class and text attribute - * @return Instances that have a class and many real-valued attributes - */ - private Instances vectorizeInstances(Instances textInstances) { - List listInstance = new ArrayList<>(); - for(Instance textInstance : textInstances) { - listInstance.add(vectorizeInstance(textInstance)); - } - Instances vectorized = new Instances(vectorTemplate, textInstances.numInstances()); - for(Instance inst : listInstance) vectorized.add(inst); - return vectorized; + Instances vectorized = new Instances(vectorTemplate, textInstances.numInstances()); + for (Instance inst : listInstance) { + vectorized.add(inst); } + return vectorized; + } - /** - * Vectorize a text instance, probably for a classifier to evaluate - * @param textInstance Instance that has a class and text attribute - * @return Instance that has a class and many real-valued attributes - */ - private Instance vectorizeInstance(Instance textInstance) { - // Put the class and word counts for this doc into an array, then build an Instance from that - // counts[0] is the doc class, not actually a word count - double[] counts = new double[dictionary.size() + 1]; - counts[0] = textInstance.classValue(); - String processed = textInstance.stringValue(1); - String[] words = processed.split("\\s+"); - for (String uni : words) { - if (!stopWords.contains(uni) && dictionary.containsKey(uni)) { - counts[dictionary.get(uni) + 1]++; - } - } - Instance vec = new SparseInstance(1, counts); - vec.setWeight(textInstance.weight()); - return vec; + /** + * Vectorize a text instance, probably for a classifier to evaluate + * + * @param textInstance Instance that has a class and text attribute + * @return Instance that has a class and many real-valued attributes + */ + private Instance vectorizeInstance(Instance textInstance) { + // Put the class and word counts for this doc into an array, then build an Instance from that + // counts[0] is the doc class, not actually a word count + double[] counts = new double[dictionary.size() + 1]; + counts[0] = textInstance.classValue(); + String processed = textInstance.stringValue(1); + String[] words = processed.split("\\s+"); + for (String uni : words) { + if (!stopWords.contains(uni) && dictionary.containsKey(uni)) { + counts[dictionary.get(uni) + 1]++; + } } + Instance vec = new SparseInstance(1, counts); + vec.setWeight(textInstance.weight()); + return vec; + } } From 0a7580f1ee2ce48b90e400c4a20f8576f9e70918 Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Fri, 21 Jul 2017 18:21:23 -0500 Subject: [PATCH 4/5] =?UTF-8?q?Wrote=20our=20own=20version=20of=20a=20PTB?= =?UTF-8?q?=20reader=20so=20this=20doesn=E2=80=99t=20need=20to=20be=20GPL?= =?UTF-8?q?=20anymore.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../PennTreebankInputFileAdapter.java | 219 ------------------ 1 file changed, 219 deletions(-) delete mode 100644 src/main/java/edu/umn/biomedicus/gpl/penntree/PennTreebankInputFileAdapter.java diff --git a/src/main/java/edu/umn/biomedicus/gpl/penntree/PennTreebankInputFileAdapter.java b/src/main/java/edu/umn/biomedicus/gpl/penntree/PennTreebankInputFileAdapter.java deleted file mode 100644 index c2778ca..0000000 --- a/src/main/java/edu/umn/biomedicus/gpl/penntree/PennTreebankInputFileAdapter.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2016 Regents of the University of Minnesota - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -package edu.umn.biomedicus.gpl.penntree; - -import com.google.inject.Inject; -import edu.stanford.nlp.ling.TaggedWord; -import edu.stanford.nlp.trees.PennTreeReaderFactory; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.trees.TreeReader; -import edu.umn.biomedicus.common.types.syntax.PartOfSpeech; -import edu.umn.biomedicus.common.types.syntax.PartsOfSpeech; -import edu.umn.biomedicus.common.types.text.ImmutableParseToken; -import edu.umn.biomedicus.common.types.text.Sentence; -import edu.umn.biomedicus.framework.store.Document; -import edu.umn.biomedicus.framework.store.Label; -import edu.umn.biomedicus.framework.store.Span; -import edu.umn.biomedicus.framework.store.TextView; -import edu.umn.biomedicus.uima.adapter.UimaAdapters; -import edu.umn.biomedicus.uima.files.InputFileAdapter; -import edu.umn.biomedicus.uima.labels.LabelAdapters; -import java.io.IOException; -import java.io.Reader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.Random; -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; - -/** - * Adapts Penn treebank format files to CAS files. - * - * @author Ben Knoll - * @since 1.3.0 - */ -public final class PennTreebankInputFileAdapter implements InputFileAdapter { - - /** - * The penn tree reader factory. - */ - private final PennTreeReaderFactory pennTreeReaderFactory = new PennTreeReaderFactory(); - - private final LabelAdapters labelAdapters; - - /** - * The view name to load into. - */ - private String viewName; - - @Inject - public PennTreebankInputFileAdapter(LabelAdapters labelAdapters) { - this.labelAdapters = labelAdapters; - } - - @Override - public void adaptFile(CAS cas, Path path) throws CollectionException { - StringBuilder text = new StringBuilder(); - ArrayList sentences = new ArrayList<>(); - try (Reader reader = Files.newBufferedReader(path)) { - TreeReader treeReader = pennTreeReaderFactory.newTreeReader(reader); - Tree tree; - while ((tree = treeReader.readTree()) != null) { - int sentenceStart = text.length(); - - ArrayList taggedWords = tree.taggedYield(); - SentenceBuilder sentenceBuilder = new SentenceBuilder(); - sentenceBuilder.tokenBuilders = new ArrayList<>(taggedWords.size()); - for (TaggedWord taggedWord : taggedWords) { - String tag = taggedWord.tag(); - String word = taggedWord.word(); - text.append(" "); - - /** - * This -NONE- tag occurs in some documents when there is a assumed phrase. - */ - if (!"-NONE-".equals(tag)) { - int tokenStart = text.length(); - if ("-LRB-".equals(word)) { - text.append('('); - } else if ("-RRB-".equals(word)) { - text.append(')'); - } else if ("-LCB-".equals(word)) { - text.append('{'); - } else if ("-RCB-".equals(word)) { - text.append('}'); - } else if ("-LSB-".equals(word)) { - text.append('['); - } else if ("-RSB-".equals(word)) { - text.append(']'); - } else if ("``".equals(word)) { - text.append("\""); - } else if ("''".equals(word)) { - text.append("\""); - } else { - text.append(word); - } - int tokenEnd = text.length(); - - PartOfSpeech partOfSpeech; - if ("-LRB-".equals(tag)) { - partOfSpeech = PartOfSpeech.LEFT_PAREN; - } else if ("-RRB-".equals(tag)) { - partOfSpeech = PartOfSpeech.RIGHT_PAREN; - } else { - if (tag.contains("|")) { - String[] tags = tag.split("\\|"); - Random random = new Random(); - int randomIndex = random.nextInt(tags.length); - partOfSpeech = PartsOfSpeech.forTag(tags[randomIndex]); - } else { - partOfSpeech = PartsOfSpeech.forTag(tag); - } - - } - - TokenBuilder tokenBuilder = new TokenBuilder(); - tokenBuilder.tokenSpan = new Span(tokenStart, tokenEnd); - tokenBuilder.partOfSpeech = partOfSpeech; - if (partOfSpeech == null) { - throw new AssertionError("part of speech should not be null"); - } - sentenceBuilder.tokenBuilders.add(tokenBuilder); - } - } - - int sentenceEnd = text.length(); - - sentenceBuilder.sentenceSpan = new Span(sentenceStart, sentenceEnd); - sentences.add(sentenceBuilder); - } - } catch (IOException e) { - throw new CollectionException(e); - } - - Document document = UimaAdapters.createDocument(cas, labelAdapters, - path.getFileName().toString()); - TextView textView = document.newTextView() - .withText(text.toString()) - .withName(viewName) - .build(); - - for (SentenceBuilder sentence : sentences) { - Span sentenceSpan = sentence.sentenceSpan; - - textView.label(Label.create(sentenceSpan, Sentence.create())); - - for (TokenBuilder tokenBuilder : sentence.tokenBuilders) { - Span tokenSpan = tokenBuilder.tokenSpan; - - textView.label(Label.create(tokenSpan, - ImmutableParseToken.builder() - .text(tokenSpan.getCovered(text).toString()) - .hasSpaceAfter(true) - .build() - )); - - PartOfSpeech partOfSpeech = tokenBuilder.partOfSpeech; - if (partOfSpeech != null) { - String pos = partOfSpeech.toString(); - textView.label(Label.create(tokenSpan, PartsOfSpeech.forTag(pos))); - } - } - } - - } - - @Override - public void setTargetView(String viewName) { - this.viewName = viewName; - } - - /** - * Used to build sentences. - */ - private static class SentenceBuilder { - - /** - * The begin and end of the sentence. - */ - private Span sentenceSpan; - - /** - * The tokens of the sentence. - */ - private List tokenBuilders; - } - - /** - * Used to build tokens - */ - private static class TokenBuilder { - - /** - * The begin and end of the token - */ - private Span tokenSpan; - - /** - * The part of speech. - */ - private PartOfSpeech partOfSpeech; - } -} From a9d786fa1cc89207c91449e998039b77efc984ea Mon Sep 17 00:00:00 2001 From: Ben Knoll Date: Mon, 24 Jul 2017 09:16:22 -0500 Subject: [PATCH 5/5] 1.7.0 pom version change --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 17b043b..d0b5e25 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ edu.umn.biomedicus biomedicus-gpl jar - 1.7.0-SNAPSHOT + 1.7.0 biomedicus-gpl BioMedICUS Annotation System - GPL Extensions @@ -37,12 +37,12 @@ edu.umn.biomedicus biomedicus-core - 1.7.0-SNAPSHOT + 1.7.0 edu.umn.biomedicus biomedicus-uima - 1.7.0-SNAPSHOT + 1.7.0 edu.stanford.nlp