precision/recall slda

dmrd · May 10, 2014 · 87643f1 · 87643f1
2 parents 7dee928 + 32baa2b
commit 87643f1
Show file tree

Hide file tree

Showing 53 changed files with 51,797 additions and 401 deletions.
diff --git a/features/analyzer.py b/features/analyzer.py
@@ -32,6 +32,17 @@ def cmu_lookup(s, APPROX=True):
             return d[match][0]
         raise
 
+#
+#  Utility methods
+#
+
+
+def to_ngrams(items, n, BODY=False):
+    grams = ngrams(items, n)
+    if BODY:
+        return items, grams
+    return grams
+
 
 #
 #  Part-of-Speech
@@ -53,17 +64,17 @@ def tag_text(text):
     return sum(tag_sentences(text), [])
 
 
-def pos_ngrams(text, n):
+def pos_ngrams(text, n, BODY=False):
     """Extracts POS ngrams for a body of text."""
     pos_tags = [tag for (w, tag) in tag_text(text)]
-    return ngrams(pos_tags, n)
+    return to_ngrams(pos_tags, n, BODY=BODY)
 
 
-def word_ngrams(text, n, PUNC=True):
+def word_ngrams(text, n, PUNC=True, BODY=False):
     def filter(tag):
         return PUNC or not tag in PUNCTUATION_TAGS
     tokenized_text = [t for (t, tag) in tag_text(text) if filter(tag)]
-    return ngrams(tokenized_text, n)
+    return to_ngrams(tokenized_text, n, BODY=BODY)
 
 
 #
@@ -142,7 +153,7 @@ def reset():
     return result
 
 
-def syllable_ngrams(text, n):
+def syllable_ngrams(text, n, BODY=False):
     """
     Returns the n-grams of syllable usage by breaking each word down into
     (# syllables) and then taking n-grams on that sequence. In this way,
@@ -151,10 +162,10 @@ def syllable_ngrams(text, n):
     >>> [(1, 1), (1, 4)]
     """
     syllables = syllabic_representation(text)
-    return ngrams(syllables, n)
+    return to_ngrams(syllables, n, BODY=BODY)
 
 
-def syllable_count_ngrams(text, n):
+def syllable_count_ngrams(text, n, BODY=False):
     """
     Returns the n-grams of syllable counts between punctuation. That is,
     it sums (# syllables) for all the words between punctuation marks and
@@ -164,7 +175,7 @@ def syllable_count_ngrams(text, n):
     """
     syllables = syllable_counts(text, TOTAL=True)
     syllables = list(sum(syllables, ()))
-    return ngrams(syllables, n)
+    return to_ngrams(syllables, n, BODY=BODY)
 
 
 def word_counts(text):
@@ -189,13 +200,13 @@ def word_counts(text):
     return result
 
 
-def word_count_ngrams(text, n):
+def word_count_ngrams(text, n, BODY=False):
     """
     Returns the n-grams of word counts between punctuation.
     """
     counts = word_counts(text)
     counts = list(sum(counts, ()))
-    return ngrams(counts, n)
+    return to_ngrams(counts, n, BODY=BODY)
 
 
 def stress_counts_by_syllable(text, SECONDARY=True):
@@ -232,9 +243,9 @@ def etymology_representation(text):
     return map(e.lookup, text)
 
 
-def etymology_ngrams(text, n):
+def etymology_ngrams(text, n, BODY=False):
     representation = etymology_representation(text)
-    return ngrams(representation, n)
+    return to_ngrams(representation, n, BODY=BODY)
 
 
 def etymology_frequencies(text, n):

diff --git a/features/extract.py b/features/extract.py
@@ -1,3 +1,4 @@
+import re
 import analyzer
 
 
@@ -6,7 +7,7 @@ def find_all_matching_ngrams(text, ngram, ngram_extractor, PUNC=True):
         return None
 
     n = len(ngram)
-    word_ngrams = analyzer.word_ngrams(text, n, PUNC=True)
+    word_ngrams = analyzer.word_ngrams(text, n, PUNC=PUNC)
     target_ngrams = ngram_extractor(text, n)
     return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])
 
@@ -16,4 +17,30 @@ def find_pos_ngram(text, ngram):
 
 
 def find_syllable_ngram(text, ngram):
-    return find_all_matching_ngrams(text, ngram, analyzer.syllable_ngrams, PUNC=False)
+    return find_all_matching_ngrams(text, ngram, analyzer.syllable_ngrams)
+
+
+def find_etymology_ngram(text, ngram):
+    return find_all_matching_ngrams(text, ngram, analyzer.etymology_ngrams, PUNC=False)
+
+
+def sentence_ngrams(text, n):
+    sents = []
+    for s in re.split('(?<=[.!?,\(\)-;:]) +', text):
+        sents.append(s[:-1])
+        sents.append(s[-1])
+    return analyzer.to_ngrams(sents, n)
+
+
+def find_syllable_count_ngram(text, ngram):
+    n = len(ngram)
+    sent_grams = sentence_ngrams(text, n)
+    target_ngrams = analyzer.syllable_count_ngrams(text, n)
+    return list(zip(*filter(lambda x: x[1] == ngram, zip(sent_grams, target_ngrams)))[0])
+
+
+def find_word_count_ngram(text, ngram):
+    n = len(ngram)
+    sent_grams = sentence_ngrams(text, n)
+    target_ngrams = analyzer.word_count_ngrams(text, n)
+    return list(zip(*filter(lambda x: x[1] == ngram, zip(sent_grams, target_ngrams)))[0])
diff --git a/liblbfgs-1.10/AUTHORS b/liblbfgs-1.10/AUTHORS
@@ -0,0 +1 @@
+Naoaki Okazaki <okazaki at chokkan org>
diff --git a/liblbfgs-1.10/COPYING b/liblbfgs-1.10/COPYING
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 1990 Jorge Nocedal
+Copyright (c) 2007-2010 Naoaki Okazaki
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/liblbfgs-1.10/ChangeLog b/liblbfgs-1.10/ChangeLog
@@ -0,0 +1,120 @@
+2010-xx-xx  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.10:
+	- Fixed compiling errors on Mac OS X; this patch was kindly submitted by Nic Schraudolph.
+	- Reduced compiling warnings on Mac OS X; this patch was kindly submitted by Tamas Nepusz.
+
+
+2010-01-29  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.9:
+	- Fixed a mistake in checking the validity of the parameters "ftol" and "wolfe"; this mistake was discovered by Kevin S. Van Horn.
+
+
+2009-07-13  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.8:
+	- Accepted the patch submitted by Takashi Imamichi; the backtracking method now has three criteria for choosing the step length.
+	- Updated the documentation to explain the above three criteria.
+
+
+2009-02-28  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.7:
+	- Improved OWL-QN routines for stability.
+	- Removed the support of OWL-QN method in MoreThuente algorithm
+	  because it accidentally fails in early stages of iterations for some
+	  objectives. Because of this change, the OWL-QN method must be used
+	  with the backtracking algorithm (LBFGS_LINESEARCH_BACKTRACKING), or
+	  the library returns LBFGSERR_INVALID_LINESEARCH.
+	- Renamed line search algorithms as follows:
+	    - LBFGS_LINESEARCH_BACKTRACKING: regular Wolfe condition.
+	    - LBFGS_LINESEARCH_BACKTRACKING_LOOSE: regular Wolfe condition.
+	    - LBFGS_LINESEARCH_BACKTRACKING_STRONG: strong Wolfe condition.
+	- Source code clean-up.
+
+
+2008-11-02  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.6:
+	- Improved line-search algorithm with strong Wolfe condition, which
+	  was contributed by Takashi Imamichi. This routine is now default for
+	  LBFGS_LINESEARCH_BACKTRACKING. The previous line search algorithm
+	  with regular Wolfe condition is still available as
+	  LBFGS_LINESEARCH_BACKTRACKING_LOOSE.
+	- Configurable stop index for L1-norm computation. A member variable
+	  lbfgs_parameter_t::orthantwise_end was added to specify the index
+	  number at which the library stops computing the L1 norm of the
+	  variables. This is useful to prevent some variables from being
+	  regularized by the OWL-QN method.
+	- A sample program written in C++ (sample/sample.cpp).
+
+
+2008-07-10  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.5:
+	- Configurable starting index for L1-norm computation. A member
+	  variable lbfgs_parameter_t::orthantwise_start was added to specify
+	  the index number from which the library computes the L1 norm of the
+	  variables.
+	- Fixed a zero-division error when the initial variables are already
+	  at a minimizer (reported by Takashi Imamichi). In this case, the
+	  library returns LBFGS_ALREADY_MINIMIZED status code.
+	- Defined LBFGS_SUCCESS status code as zero; removed unused constants,
+	  LBFGSFALSE and LBFGSTRUE.
+	- Fixed a compile error in an implicit down-cast.
+
+
+2008-04-25  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.4:
+	- Configurable line search algorithms. A member variable
+	  lbfgs_parameter_t::linesearch was added to choose either MoreThuente
+	  method (LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm
+	  (LBFGS_LINESEARCH_BACKTRACKING).
+	- Fixed a bug: the previous version did not compute pseudo-gradients
+	  properly in the line search routines for OWL-QN. This bug might end
+	  the iteration process too early when the OWL-QN routine was activated
+	  (0 < lbfgs_parameter_t::orthantwise_c).
+	- Configure script for POSIX environments.
+	- SSE/SSE2 optimizations with GCC.
+	- New functions lbfgs_malloc and lbfgs_free to use SSE/SSE2 routines
+	  transparently. It is unnecessary to use these functions for libLBFGS
+	  built without SSE/SSE2 routines; you can still use any memory
+	  allocators if SSE/SSE2 routines are disabled in libLBFGS.
+
+
+2007-12-16  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.3:
+	- An API change. An argument was added to lbfgs() function to receive
+	  the final value of the objective function. This argument can be set
+	  to NULL if the final value is unnecessary.
+	- Fixed a null-pointer bug in the sample code (reported by Takashi
+	  Imamichi).
+	- Added build scripts for Microsoft Visual Studio 2005 and GCC.
+	- Added README file.
+
+
+2007-12-13  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.2:
+	- Fixed a serious bug in orthant-wise L-BFGS. An important variable
+	  was used without initialization.
+	- Configurable L-BFGS parameters (number of limited memories, epsilon).
+
+
+2007-12-01  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.1:
+	- Implemented orthant-wise L-BFGS.
+	- Implemented lbfgs_parameter_init() function.
+	- Fixed several bugs.
+	- API documentation.
+
+
+2007-09-20  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* libLBFGS 1.0
+	- Initial release.
+