Upgraded Stanford NER tool to 2012-11-11 version.
 * Moved old version files into lib/nertools/stanford/stanford_old_library/
samet committed Dec 23, 2012
1 parent 3a9be83 commit e5696ec
Showing 39 changed files with 1,026 additions and 24 deletions.
3 changes: 1 addition & 2 deletions src/SocialNetworkExtractorNihai/.classpath
@@ -6,15 +6,14 @@
<classpathentry kind="lib" path="lib/httpclient-4.1.1.jar"/>
<classpathentry kind="lib" path="lib/httpcore-4.1.jar"/>
<classpathentry kind="lib" path="lib/json_simple-1.1.jar"/>
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner.jar"/>
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/filterbuilder.jar"/>
<classpathentry kind="lib" path="lib/sitecapturer.jar"/>
<classpathentry kind="lib" path="lib/thumbelina.jar"/>
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner-2009-01-16.jar"/>
<classpathentry kind="lib" path="lib/htmllexer.jar"/>
<classpathentry kind="lib" path="lib/nertools/illinois/LBJ2.jar"/>
<classpathentry kind="lib" path="lib/nertools/illinois/LBJ2Library.jar"/>
<classpathentry kind="lib" path="lib/nertools/illinois/bin"/>
<classpathentry kind="lib" path="lib/nertools/stanford/stanford-ner-2012-11-11.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
19 changes: 13 additions & 6 deletions src/SocialNetworkExtractorNihai/lib/nertools/stanford/NERDemo.java
@@ -1,8 +1,8 @@
-import edu.stanford.nlp.ie.crf.*;
 import edu.stanford.nlp.ie.AbstractSequenceClassifier;
+import edu.stanford.nlp.ie.crf.*;
+import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
-import edu.stanford.nlp.util.StringUtils;
 
 import java.util.List;
 import java.io.IOException;
@@ -11,7 +11,7 @@

 /** This is a demo of calling CRFClassifier programmatically.
  * <p>
- * Usage: <code> java -cp "stanford-ner.jar:." NERDemo [serializedClassifier [fileName]]</code>
+ * Usage: <code> java -mx400m -cp "stanford-ner.jar:." NERDemo [serializedClassifier [fileName]]</code>
  * <p>
  * If arguments aren't specified, they default to
  * ner-eng-ie.crf-3-all2006.ser.gz and some hardcoded sample text.
@@ -33,13 +33,13 @@ public class NERDemo {

   public static void main(String[] args) throws IOException {
 
-    String serializedClassifier = "classifiers/ner-eng-ie.crf-3-all2008.ser.gz";
+    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
 
     if (args.length > 0) {
       serializedClassifier = args[0];
     }
 
-    AbstractSequenceClassifier classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
+    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
 
     /* For either a file to annotate or for the hardcoded text example,
        this demo file shows two ways to process the output, for teaching
@@ -49,7 +49,7 @@ public static void main(String[] args) throws IOException {
        and produce an inline XML output format.
      */
     if (args.length > 1) {
-      String fileContents = StringUtils.slurpFile(args[1]);
+      String fileContents = IOUtils.slurpFile(args[1]);
       List<List<CoreLabel>> out = classifier.classify(fileContents);
       for (List<CoreLabel> sentence : out) {
         for (CoreLabel word : sentence) {
@@ -71,6 +71,13 @@ public static void main(String[] args) throws IOException {
       System.out.println(classifier.classifyToString(s1));
       System.out.println(classifier.classifyWithInlineXML(s2));
       System.out.println(classifier.classifyToString(s2, "xml", true));
+      int i=0;
+      for (List<CoreLabel> lcl : classifier.classify(s2)) {
+        for (CoreLabel cl : lcl) {
+          System.out.println(i++ + ":");
+          System.out.println(cl);
+        }
+      }
     }
   }

193 changes: 193 additions & 0 deletions src/SocialNetworkExtractorNihai/lib/nertools/stanford/README.txt
@@ -0,0 +1,193 @@
Stanford NER - v1.2.7 - 2012-11-11
----------------------------------------------

This package provides a high-performance machine learning based named
entity recognition system, including facilities to train models from
supervised training data and pre-trained models for English.

(c) 2002-2012. The Board of Trustees of The Leland
Stanford Junior University. All Rights Reserved.

Original CRF code by Jenny Finkel.
Additional modules, features, internationalization, compaction, and
support code by Christopher Manning, Dan Klein, Christopher Cox, Huy Nguyen,
Shipra Dingare, Anna Rafferty, and John Bauer.
This release prepared by John Bauer.

LICENSE

The software is licensed under the full GPL. Please see the file LICENCE.txt

For more information, bug reports, and fixes, contact:
Christopher Manning
Dept of Computer Science, Gates 1A
Stanford CA 94305-9010
USA
java-nlp-support@lists.stanford.edu
http://www-nlp.stanford.edu/software/CRF-NER.shtml

CONTACT

For questions about this distribution, please contact Stanford's JavaNLP group
at java-nlp-support@lists.stanford.edu. We provide assistance on a best-effort
basis.

TUTORIAL

Quickstart guidelines, primarily for end users who wish to use the included NER
models, are below. For further instructions on training your own NER model,
go to http://www-nlp.stanford.edu/software/crf-faq.shtml.

INCLUDED SERIALIZED MODELS / TRAINING DATA

The basic included serialized model is a 3 class NER tagger that can
label: PERSON, ORGANIZATION, and LOCATION entities. It is included as
english.all.3class.distsim.crf.ser.gz. It is trained on data from
CoNLL, MUC6, MUC7, and ACE. Because this model is trained on both US
and UK newswire, it is fairly robust across the two domains.

We have also included a 4 class NER tagger trained on the CoNLL 2003
Shared Task training data that labels for PERSON, ORGANIZATION,
LOCATION, and MISC. It is named
english.conll.4class.distsim.crf.ser.gz.

A third model is trained only on data from MUC and distinguishes
between 7 different classes. It is named
english.muc.7class.distsim.crf.ser.gz.

All of the serialized classifiers come in two versions, the second of
which uses a distributional similarity lexicon to improve performance
(by about 1.5% F-measure). These classifiers have additional features
which make them perform substantially better, but they require rather
more memory. The distsim models are included in the release package,
and nodistsim versions of the same models are available on the
Stanford NER webpage.

There are also case-insensitive versions of the three models available
on the webpage.

Finally, a package with two German models is also available for download.


QUICKSTART INSTRUCTIONS

This NER system requires Java 1.6 or later. We have only tested it on
the SUN JVM.

Provided java is on your PATH, you should be able to run an NER GUI
demonstration just by clicking. Double-clicking the stanford-ner.jar
archive might work, but it may well fail, since the operating system
does not give Java enough memory for our NER system; it is safer to
double-click the ner-gui.bat icon (Windows) or ner-gui.sh
(Linux/Unix/MacOSX) instead. Then, from the Classifier menu, either
load a CRF classifier from the classifiers directory of the
distribution or use the Load Default CRF option. You can then load a
text file or web page from the File menu, or use the default text in
the window. Finally, tag named entities in the text by pressing the
Run NER button.

From a command line, you need to have java on your PATH and the
stanford-ner.jar file in your CLASSPATH. (The way of doing this depends on
your OS/shell.) The supplied ner.bat and ner.sh should work to allow
you to tag a single file. For example, for Windows:

ner file

Or on Unix/Linux you should be able to tag the test file in the distribution
directory with the command:

java -mx600m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier classifiers/all.3class.crf.ser.gz -textFile sample.txt

When run from a jar file, you also have the option of using a serialized
classifier contained in the jar file.

If you use the -jar command, or double-click the jar file, NERGUI is
automatically started, and you will also be given the option (under the
'Classifier' menu item) to load a default supplied classifier:

java -mx1000m -jar stanford-ner.jar


PROGRAMMATIC USE

The NERDemo file illustrates a couple of ways of calling the system
programmatically. You should get the same results from

java -mx300m NERDemo classifiers/all.3class.crf.ser.gz sample.txt

as from using CRFClassifier. For more information on API calls, look in
the enclosed javadoc directory: load index.html in a browser and look
first at the edu.stanford.nlp.ie.crf package and CRFClassifier class.
If you wish to train your own NER systems, look also at the
edu.stanford.nlp.ie package NERFeatureFactory class.
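
As a concrete sketch of the API (the class name MinimalNER is ours and
is illustrative rather than part of the distribution; the calls mirror
NERDemo above):

  import edu.stanford.nlp.ie.AbstractSequenceClassifier;
  import edu.stanford.nlp.ie.crf.CRFClassifier;
  import edu.stanford.nlp.ling.CoreLabel;

  public class MinimalNER {
    public static void main(String[] args) {
      // Load the 3 class model shipped with this release.
      AbstractSequenceClassifier<CoreLabel> classifier =
          CRFClassifier.getClassifierNoExceptions(
              "classifiers/english.all.3class.distsim.crf.ser.gz");
      // classifyToString tags each token inline, e.g. Stanford/ORGANIZATION.
      System.out.println(classifier.classifyToString(
          "Jenny Finkel wrote the CRF code at Stanford University."));
    }
  }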


SERVER VERSION

The NER code may also be run as a server listening on a socket:

java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer 1234

You can specify which model to load with flags, either one on disk:

java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/all.3class.crf.ser.gz 1234

Or if you have put a model inside the jar file:

java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadJarClassifier all.3class.crf.ser.gz 1234
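
To call the server from code, here is a rough client sketch. It assumes
a simple line-based protocol (one line of text in, one line of tagged
text back); check NERServer's usage message before relying on it:

  import java.io.BufferedReader;
  import java.io.InputStreamReader;
  import java.io.OutputStreamWriter;
  import java.io.PrintWriter;
  import java.net.Socket;

  public class NERSocketClient {
    public static void main(String[] args) throws Exception {
      // Connect to a NERServer started on port 1234 as shown above.
      Socket sock = new Socket("localhost", 1234);
      PrintWriter out = new PrintWriter(
          new OutputStreamWriter(sock.getOutputStream()), true);
      BufferedReader in = new BufferedReader(
          new InputStreamReader(sock.getInputStream()));
      out.println("Stanford University is located in California.");
      System.out.println(in.readLine());  // the tagged sentence
      sock.close();
    }
  }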


RUNNING CLASSIFIERS FROM INSIDE A JAR FILE

The software can run any serialized classifier from within a jar file by
giving the flag -loadJarClassifier resourceName. An end user can make
their own jar files with the desired NER models contained inside. The
serialized classifier must be located immediately under classifiers/ in
the jar file, with the name given. This allows single jar file
deployment.
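
For example (the jar name my-models.jar is ours, for illustration),
packaging a model so that -loadJarClassifier can find it might look
like:

  jar -cf my-models.jar classifiers/english.all.3class.distsim.crf.ser.gz
  java -mx1000m -cp stanford-ner.jar:my-models.jar edu.stanford.nlp.ie.NERServer -loadJarClassifier english.all.3class.distsim.crf.ser.gz 1234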


PERFORMANCE GUIDELINES

Performance depends on many factors. Speed and memory use depend on
hardware, operating system, and JVM. Accuracy depends on the data
tested on. Nevertheless, in the belief that something is better than
nothing, here are some statistics from one machine on one test set, in
semi-realistic conditions (where the test data is somewhat varied).

ner-eng-ie.crf-3-all2006-distsim.ser.gz (older version of ner-eng-ie.crf-3-all2008-distsim.ser.gz)
Memory: 320MB (on a 32 bit machine)
PERSON ORGANIZATION LOCATION
91.88 82.91 88.21


--------------------
CHANGES
--------------------

2012-11-11 1.2.7 Improved English 3 class model, release
Chinese model

2012-07-09 1.2.6 Minor bug fixes

2012-05-22 1.2.5 Fix encoding issue

2012-04-07 1.2.4 Caseless version of English models supported

2012-01-06 1.2.3 Minor bug fixes

2011-09-14 1.2.2 Improved thread safety

2011-06-19 1.2.1 Models reduced in size but on average improved
in accuracy (improved distsim clusters)

2011-05-16 1.2 Normal download includes 3, 4, and 7
class models. Updated for compatibility
with other software releases.

2009-01-16 1.1.1 Minor bug and usability fixes, changed API

2008-05-07 1.1 Additional feature flags, various code updates

2006-09-18 1.0 Initial release

19 changes: 10 additions & 9 deletions src/SocialNetworkExtractorNihai/lib/nertools/stanford/build.xml
@@ -39,8 +39,8 @@
<property name="compile.debug" value="true"/>
<property name="compile.deprecation" value="false"/>
<property name="compile.optimize" value="true"/>
<property name="compile.source" value="1.5" />
<property name="compile.target" value="1.5" />
<property name="compile.source" value="1.6" />
<property name="compile.target" value="1.6" />



@@ -84,14 +84,14 @@

<target name="classpath" description="Sets the classpath">
<path id="compile.classpath">
<!--<fileset dir="${basedir}/lib">
<!-- <fileset dir="${basedir}/lib">
<include name="*.jar"/>
<exclude name="javanlp*"/>
</fileset>-->
</fileset> -->
</path>
</target>





@@ -114,6 +114,7 @@
 <javac srcdir="${src.home}"
        destdir="${build.home}"
        debug="${compile.debug}"
+       encoding="utf-8"
        deprecation="${compile.deprecation}"
        optimize="${compile.optimize}"
        source="${compile.source}"
@@ -152,14 +153,14 @@
 <javadoc sourcepath="${src.home}"
          destdir="${javadoc.home}"
          maxmemory="768m"
-         author="true"
-         source="1.5"
+         author="true"
+         source="1.6"
          Overview="${src.home}/edu/stanford/nlp/overview.html"
-         Doctitle="Stanford JavaNLP API Documentation"
+         Doctitle="Stanford JavaNLP API Documentation"
          Windowtitle="Stanford JavaNLP API"
          packagenames="*">
   <bottom><![CDATA[<FONT SIZE=2><A HREF=\"http://nlp.stanford.edu\">Stanford NLP Group</A></FONT>]]></bottom>
-  <link href="http://java.sun.com/j2se/1.5.0/docs/api/"/>
+  <link href="http://java.sun.com/j2se/1.6.0/docs/api/"/>
 </javadoc>
 
 </target>
Binary file not shown.
@@ -0,0 +1,52 @@
trainFile = /u/nlp/data/ner/column_data/all.3class.train
testFile = /u/nlp/data/ner/column_data/all.3class.test
serializeTo = english.all.3class.distsim.crf.ser.gz

type = crf

#distSimLexicon = /u/nlp/data/pos_tags_are_useless/englishGigaword.200.pruned
#distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw.bnc.200
distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters
useDistSim = true

map = word=0,answer=1

saveFeatureIndexToDisk = true

useClassFeature=true
useWord=true
#useWordPairs=true
useNGrams=true
noMidNGrams=true
maxNGramLeng=6
usePrev=true
useNext=true
#useTags=true
#useWordTag=true
useLongSequences=true
useSequences=true
usePrevSequences=true
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
useOccurrencePatterns=true
useLastRealWord=true
useNextRealWord=true
#useReverse=false
normalize=true
# normalizeTimex=true
wordShape=chris2useLC
useDisjunctive=true
disjunctionWidth=5
#useDisjunctiveShapeInteraction=true

maxLeft=1
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter

useObservedSequencesOnly=true

useQN = true
QNsize = 25

# makes it go faster
featureDiffThresh=0.05
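
# Note: this properties file is the recipe for the included 3 class
# model. The trainFile, testFile, and distSimLexicon paths above are
# Stanford-internal, so retraining requires substituting your own
# column-format data. The invocation then follows the CRF FAQ (the
# memory amount here is a guess that depends on your data):
#
#   java -mx4g -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier -prop all.3class.distsim.prop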
Binary file not shown.