Skip to content

Commit

Permalink
precision/recall slda
Browse files Browse the repository at this point in the history
  • Loading branch information
msimchowitz committed May 10, 2014
2 parents 7dee928 + 32baa2b commit 87643f1
Show file tree
Hide file tree
Showing 53 changed files with 51,797 additions and 401 deletions.
35 changes: 23 additions & 12 deletions features/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def cmu_lookup(s, APPROX=True):
return d[match][0]
raise

#
# Utility methods
#


def to_ngrams(items, n, BODY=False):
    """
    Builds the n-grams of `items` via ngrams(items, n).

    By default only the n-grams are returned. When BODY is truthy the
    result is the pair (items, grams), so the caller also gets back the
    sequence the n-grams were built from.
    """
    grams = ngrams(items, n)
    return (items, grams) if BODY else grams


#
# Part-of-Speech
Expand All @@ -53,17 +64,17 @@ def tag_text(text):
return sum(tag_sentences(text), [])


def pos_ngrams(text, n):
def pos_ngrams(text, n, BODY=False):
"""Extracts POS ngrams for a body of text."""
pos_tags = [tag for (w, tag) in tag_text(text)]
return ngrams(pos_tags, n)
return to_ngrams(pos_tags, n, BODY=BODY)


def word_ngrams(text, n, PUNC=True):
def word_ngrams(text, n, PUNC=True, BODY=False):
def filter(tag):
return PUNC or not tag in PUNCTUATION_TAGS
tokenized_text = [t for (t, tag) in tag_text(text) if filter(tag)]
return ngrams(tokenized_text, n)
return to_ngrams(tokenized_text, n, BODY=BODY)


#
Expand Down Expand Up @@ -142,7 +153,7 @@ def reset():
return result


def syllable_ngrams(text, n):
def syllable_ngrams(text, n, BODY=False):
"""
Returns the n-grams of syllable usage by breaking each word down into
(# syllables) and then taking n-grams on that sequence. In this way,
Expand All @@ -151,10 +162,10 @@ def syllable_ngrams(text, n):
>>> [(1, 1), (1, 4)]
"""
syllables = syllabic_representation(text)
return ngrams(syllables, n)
return to_ngrams(syllables, n, BODY=BODY)


def syllable_count_ngrams(text, n):
def syllable_count_ngrams(text, n, BODY=False):
"""
Returns the n-grams of syllable counts between punctuation. That is,
it sums (# syllables) for all the words between punctuation marks and
Expand All @@ -164,7 +175,7 @@ def syllable_count_ngrams(text, n):
"""
syllables = syllable_counts(text, TOTAL=True)
syllables = list(sum(syllables, ()))
return ngrams(syllables, n)
return to_ngrams(syllables, n, BODY=BODY)


def word_counts(text):
Expand All @@ -189,13 +200,13 @@ def word_counts(text):
return result


def word_count_ngrams(text, n):
def word_count_ngrams(text, n, BODY=False):
"""
Returns the n-grams of word counts between punctuation.
"""
counts = word_counts(text)
counts = list(sum(counts, ()))
return ngrams(counts, n)
return to_ngrams(counts, n, BODY=BODY)


def stress_counts_by_syllable(text, SECONDARY=True):
Expand Down Expand Up @@ -232,9 +243,9 @@ def etymology_representation(text):
return map(e.lookup, text)


def etymology_ngrams(text, n):
def etymology_ngrams(text, n, BODY=False):
representation = etymology_representation(text)
return ngrams(representation, n)
return to_ngrams(representation, n, BODY=BODY)


def etymology_frequencies(text, n):
Expand Down
31 changes: 29 additions & 2 deletions features/extract.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import analyzer


Expand All @@ -6,7 +7,7 @@ def find_all_matching_ngrams(text, ngram, ngram_extractor, PUNC=True):
return None

n = len(ngram)
word_ngrams = analyzer.word_ngrams(text, n, PUNC=True)
word_ngrams = analyzer.word_ngrams(text, n, PUNC=PUNC)
target_ngrams = ngram_extractor(text, n)
return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])

Expand All @@ -16,4 +17,30 @@ def find_pos_ngram(text, ngram):


def find_syllable_ngram(text, ngram):
return find_all_matching_ngrams(text, ngram, analyzer.syllable_ngrams, PUNC=False)
return find_all_matching_ngrams(text, ngram, analyzer.syllable_ngrams)


def find_etymology_ngram(text, ngram):
    """
    Looks up the word n-grams of `text` whose etymology n-gram equals
    `ngram`.

    Delegates to find_all_matching_ngrams with analyzer.etymology_ngrams as
    the extractor and PUNC=False.
    """
    extractor = analyzer.etymology_ngrams
    return find_all_matching_ngrams(text, ngram, extractor, PUNC=False)


def sentence_ngrams(text, n):
    """
    Returns the n-grams over the punctuation-delimited pieces of `text`.

    The text is split at runs of spaces that follow one of the punctuation
    characters . ! ? , ( ) - ; : and each non-empty piece contributes two
    items to the sequence: the piece without its final character, then that
    final character on its own. The n-grams of the resulting sequence are
    built with analyzer.to_ngrams.
    """
    sents = []
    # The hyphen sits last in the class so it is a literal '-'. Written as
    # '\)-;' it formed a range from ')' to ';' that also matched digits,
    # '*', '+' and '/', splitting the text after ordinary numbers.
    for s in re.split(r'(?<=[.!?,();:-]) +', text):
        if not s:
            # Empty text, or text ending in punctuation followed by spaces,
            # yields an empty piece; s[-1] would raise IndexError on it.
            continue
        # NOTE(review): a final piece with no trailing punctuation still has
        # its last character split off here -- confirm this is intended.
        sents.append(s[:-1])
        sents.append(s[-1])
    return analyzer.to_ngrams(sents, n)


def find_syllable_count_ngram(text, ngram):
    """
    Finds the sentence-piece n-grams of `text` whose syllable-count n-gram
    equals `ngram`.

    With n = len(ngram), pairs sentence_ngrams(text, n) with
    analyzer.syllable_count_ngrams(text, n) position by position and returns
    a list of the sentence-piece n-grams at the matching positions. Returns
    an empty list when nothing matches.
    """
    n = len(ngram)
    sent_grams = sentence_ngrams(text, n)
    target_ngrams = analyzer.syllable_count_ngrams(text, n)
    # Filtering the pairs directly avoids indexing zip(*...)[0], which
    # raises IndexError when there are no matches.
    return [sent for sent, target in zip(sent_grams, target_ngrams)
            if target == ngram]


def find_word_count_ngram(text, ngram):
    """
    Finds the sentence-piece n-grams of `text` whose word-count n-gram
    equals `ngram`.

    With n = len(ngram), pairs sentence_ngrams(text, n) with
    analyzer.word_count_ngrams(text, n) position by position and returns a
    list of the sentence-piece n-grams at the matching positions. Returns an
    empty list when nothing matches.
    """
    n = len(ngram)
    sent_grams = sentence_ngrams(text, n)
    target_ngrams = analyzer.word_count_ngrams(text, n)
    # Filtering the pairs directly avoids indexing zip(*...)[0], which
    # raises IndexError when there are no matches.
    return [sent for sent, target in zip(sent_grams, target_ngrams)
            if target == ngram]
1 change: 1 addition & 0 deletions liblbfgs-1.10/AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Naoaki Okazaki <okazaki at chokkan org>
22 changes: 22 additions & 0 deletions liblbfgs-1.10/COPYING
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
The MIT License

Copyright (c) 1990 Jorge Nocedal
Copyright (c) 2007-2010 Naoaki Okazaki

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
120 changes: 120 additions & 0 deletions liblbfgs-1.10/ChangeLog
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
2010-xx-xx Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.10:
- Fixed compiling errors on Mac OS X; this patch was kindly submitted by Nic Schraudolph.
- Reduced compiling warnings on Mac OS X; this patch was kindly submitted by Tamas Nepusz.


2010-01-29 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.9:
- Fixed a mistake in checking the validity of the parameters "ftol" and "wolfe"; this mistake was discovered by Kevin S. Van Horn.


2009-07-13 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.8:
- Accepted the patch submitted by Takashi Imamichi; the backtracking method now has three criteria for choosing the step length.
- Updated the documentation to explain the above three criteria.


2009-02-28 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.7:
- Improved OWL-QN routines for stability.
- Removed the support of OWL-QN method in MoreThuente algorithm
because it accidentally fails in early stages of iterations for some
objectives. Because of this change, the OWL-QN method must be used
with the backtracking algorithm (LBFGS_LINESEARCH_BACKTRACKING), or
the library returns LBFGSERR_INVALID_LINESEARCH.
- Renamed line search algorithms as follows:
- LBFGS_LINESEARCH_BACKTRACKING: regular Wolfe condition.
- LBFGS_LINESEARCH_BACKTRACKING_LOOSE: regular Wolfe condition.
- LBFGS_LINESEARCH_BACKTRACKING_STRONG: strong Wolfe condition.
- Source code clean-up.


2008-11-02 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.6:
- Improved line-search algorithm with strong Wolfe condition, which
was contributed by Takashi Imamichi. This routine is now default for
LBFGS_LINESEARCH_BACKTRACKING. The previous line search algorithm
with regular Wolfe condition is still available as
LBFGS_LINESEARCH_BACKTRACKING_LOOSE.
- Configurable stop index for L1-norm computation. A member variable
lbfgs_parameter_t::orthantwise_end was added to specify the index
number at which the library stops computing the L1 norm of the
variables. This is useful to prevent some variables from being
regularized by the OWL-QN method.
- A sample program written in C++ (sample/sample.cpp).


2008-07-10 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.5:
- Configurable starting index for L1-norm computation. A member
variable lbfgs_parameter_t::orthantwise_start was added to specify
the index number from which the library computes the L1 norm of the
variables.
- Fixed a zero-division error when the initial variables are already
a minimizer (reported by Takashi Imamichi). In this case, the
library returns LBFGS_ALREADY_MINIMIZED status code.
- Defined LBFGS_SUCCESS status code as zero; removed unused constants,
LBFGSFALSE and LBFGSTRUE.
- Fixed a compile error in an implicit down-cast.


2008-04-25 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.4:
- Configurable line search algorithms. A member variable
lbfgs_parameter_t::linesearch was added to choose either MoreThuente
method (LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm
(LBFGS_LINESEARCH_BACKTRACKING).
- Fixed a bug: the previous version did not compute pseudo-gradients
properly in the line search routines for OWL-QN. This bug might quit
an iteration process too early when the OWL-QN routine was activated
(0 < lbfgs_parameter_t::orthantwise_c).
- Configure script for POSIX environments.
- SSE/SSE2 optimizations with GCC.
- New functions lbfgs_malloc and lbfgs_free to use SSE/SSE2 routines
transparently. It is unnecessary to use these functions for libLBFGS
built without SSE/SSE2 routines; you can still use any memory
allocators if SSE/SSE2 routines are disabled in libLBFGS.


2007-12-16 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.3:
- An API change. An argument was added to lbfgs() function to receive
the final value of the objective function. This argument can be set
to NULL if the final value is unnecessary.
- Fixed a null-pointer bug in the sample code (reported by Takashi
Imamichi).
- Added build scripts for Microsoft Visual Studio 2005 and GCC.
- Added README file.


2007-12-13 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.2:
- Fixed a serious bug in orthant-wise L-BFGS. An important variable
was used without initialization.
- Configurable L-BFGS parameters (number of limited memories, epsilon).


2007-12-01 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.1:
- Implemented orthant-wise L-BFGS.
- Implemented lbfgs_parameter_init() function.
- Fixed several bugs.
- API documentation.


2007-09-20 Naoaki Okazaki <okazaki at chokkan org>

* libLBFGS 1.0
- Initial release.

Loading

0 comments on commit 87643f1

Please sign in to comment.