Skip to content

Commit

Permalink
More feature extraction cleanup and visualization tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
Charles Marsh committed May 11, 2014
1 parent eed5321 commit 845de1e
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
slda_input_files/termite_files/*.txt
slda_input_files/termite_files/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
7 changes: 7 additions & 0 deletions features/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,13 @@ def stress_counts_by_syllable(text, SECONDARY=True):
#


def etymology_with_words(text):
    """Pair every word token of *text* with its etymology lookup result.

    Characters found in ``punctuation`` are removed from *text* first, the
    remainder is split with ``word_tokenize``, and each token ``w`` is
    mapped to the tuple ``(w, e.lookup(w))``.

    NOTE(review): ``punctuation`` is presumably ``string.punctuation`` and
    ``e`` the module-level etymology dictionary -- confirm against the
    imports at the top of this module.
    """
    banned = set(punctuation)
    kept_chars = [ch for ch in text if ch not in banned]
    tokens = word_tokenize(''.join(kept_chars))

    def with_origin(word):
        # Keep the word next to its lookup result so callers can align them.
        return (word, e.lookup(word))

    return map(with_origin, tokens)


def etymology_representation(text):
exclude = set(punctuation)
text = ''.join(ch for ch in text if ch not in exclude)
Expand Down
4 changes: 4 additions & 0 deletions features/etymology_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,18 @@ def lookup(self, s):
self.load()

try:
print "Using self"
return self.d[s.lower()]
except KeyError:
try:
print "Using stemmer: " + self.stemmer.stem(s).lower()
return self.d[self.stemmer.stem(s).lower()]
except KeyError:
try:
print "Using lemmatizer: " + self.lemmatizer.lemmatize(s).lower()
return self.d[self.lemmatizer.lemmatize(s).lower()]
except KeyError:
(score, match) = max((ratio(s, t), t) for t in self.d)
self.d[s] = self.d[match]
print "Using Levenshtein: " + match
return self.d[match]
15 changes: 13 additions & 2 deletions features/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,30 @@ def find_all_matching_ngrams(text, ngram, ngram_extractor, PUNC=True):
n = len(ngram)
word_ngrams = analyzer.word_ngrams(text, n, PUNC=PUNC)
target_ngrams = ngram_extractor(text, n)
print word_ngrams, target_ngrams
return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])


def find_matches_for_pairs(pairs, ngram):
    """Return the word n-grams whose aligned target n-gram equals *ngram*.

    *pairs* is an iterable of ``(word, target)`` tuples.  The words and the
    targets are each turned into n-grams of length ``len(ngram)`` with
    ``analyzer.to_ngrams``; the two n-gram sequences are walked in lockstep
    and the word n-gram is kept wherever the target n-gram equals *ngram*.

    Returns a list of word n-grams.  The list is empty when *pairs* is
    empty or when nothing matches (previously both cases raised
    ``IndexError`` from indexing an empty ``zip`` result).
    """
    n = len(ngram)
    # Materialise the transposed columns: indexing needs a real sequence,
    # and zip() keeps each column a tuple, as before.
    columns = list(zip(*pairs))
    if not columns:
        return []
    word_ngrams = analyzer.to_ngrams(columns[0], n)
    target_ngrams = analyzer.to_ngrams(columns[1], n)
    return [words
            for words, target in zip(word_ngrams, target_ngrams)
            if target == ngram]


def find_pos_ngram(text, ngram):
    """Return the word n-grams of *text* whose tag n-gram equals *ngram*.

    *text* is tagged with ``analyzer.tag_text`` and the result is handed to
    ``find_matches_for_pairs``, which aligns word n-grams with tag n-grams
    and keeps the words where the tags match *ngram*.

    NOTE(review): ``analyzer.tag_text`` is expected to yield
    ``(word, tag)`` pairs -- confirm against analyzer.py.
    """
    # The earlier ``find_all_matching_ngrams(..., analyzer.pos_ngrams)``
    # return was superseded by this pair-based lookup; leaving it in place
    # made the lines below unreachable.
    pairs = analyzer.tag_text(text)
    return find_matches_for_pairs(pairs, ngram)


def find_syllable_ngram(text, ngram):
    """Return the word n-grams of *text* whose syllable n-gram is *ngram*.

    Delegates to ``find_all_matching_ngrams`` with
    ``analyzer.syllable_ngrams`` as the n-gram extractor.
    """
    extractor = analyzer.syllable_ngrams
    matches = find_all_matching_ngrams(text, ngram, extractor)
    return matches


def find_etymology_ngram(text, ngram):
    """Return the word n-grams of *text* whose etymology n-gram is *ngram*.

    ``analyzer.etymology_with_words`` produces ``(word, etymology)`` pairs
    for the punctuation-stripped tokens of *text*; those pairs are handed
    to ``find_matches_for_pairs``, which keeps the word n-grams whose
    aligned etymology n-gram equals *ngram*.
    """
    # The earlier ``find_all_matching_ngrams(..., PUNC=False)`` return was
    # superseded by this pair-based lookup; leaving it in place made the
    # lines below unreachable.
    pairs = analyzer.etymology_with_words(text)
    return find_matches_for_pairs(pairs, ngram)


def sentence_ngrams(text, n):
Expand Down
Binary file modified poster/dendrogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 21 additions & 1 deletion visualization/examples/dendrogram_horizontal.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
<script>

var colors = { "syllable": "coral", "pos": "green", "etymology": "red", "word": "steelblue" };
var descriptions = { "syllable": "Syllables", "pos": "Part-of-Speech", "etymology": "Etymology", "word": "Words" };

var width = 800,
height = 700;
Expand Down Expand Up @@ -64,8 +65,27 @@

node.append("text")
.attr("dy", -8)
.style("text-anchor", function(d) { return d.children ? "middle" : "middle"; })
.style("text-anchor", "middle")
.text(function(d) { return d.body; });

node.filter(function(d) { return d.children })
.append("text")
.attr("dy", 22)
.style("text-anchor", "middle")
.text(function(d) { return descriptions[d.type]; });

var y = 175;
var lineData = [{'x': -100, 'y': y}, {'x': width + 100, 'y': y}];
var lineFunction = d3.svg.line()
.x(function(d) { return d.x; })
.y(function(d) { return d.y; })
.interpolate("linear");
console.log(lineData);
svg.append("path")
.style("stroke-dasharray", (3, 3))
.attr("d", lineFunction(lineData))
.attr("stroke", "firebrick")
.attr("stroke-width", 1);
});

d3.select(self.frameElement).style("height", height + "px");
Expand Down
2 changes: 1 addition & 1 deletion visualization/examples/output.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"body": "The cow jumped over the moon.", "type": "text", "children": [{"body": "[1, 1, 1, 2, 1, 1, '.']", "type": "syllable", "children": [{"body": "(1, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, 2)", "type": "syllable", "children": []}, {"body": "(1, 2, 1)", "type": "syllable", "children": []}, {"body": "(2, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, '.')", "type": "syllable", "children": []}]}, {"body": "['AS.', 'OE.', 'F.', 'AS.', 'AS.', 'OE.']", "type": "etymology", "children": [{"body": "('AS.', 'OE.', 'F.')", "type": "etymology", "children": []}, {"body": "('OE.', 'F.', 'AS.')", "type": "etymology", "children": []}, {"body": "('F.', 'AS.', 'AS.')", "type": "etymology", "children": []}, {"body": "('AS.', 'AS.', 'OE.')", "type": "etymology", "children": []}]}, {"body": "['The', 'cow', 'jumped', 'over', 'the', 'moon', '.']", "type": "word", "children": [{"body": "('The', 'cow', 'jumped')", "type": "word", "children": []}, {"body": "('cow', 'jumped', 'over')", "type": "word", "children": []}, {"body": "('jumped', 'over', 'the')", "type": "word", "children": []}, {"body": "('over', 'the', 'moon')", "type": "word", "children": []}, {"body": "('the', 'moon', '.')", "type": "word", "children": []}]}, {"body": "['DT', 'NN', 'VBD', 'IN', 'DT', 'NN', '.']", "type": "pos", "children": [{"body": "('DT', 'NN', 'VBD')", "type": "pos", "children": []}, {"body": "('NN', 'VBD', 'IN')", "type": "pos", "children": []}, {"body": "('VBD', 'IN', 'DT')", "type": "pos", "children": []}, {"body": "('IN', 'DT', 'NN')", "type": "pos", "children": []}, {"body": "('DT', 'NN', '.')", "type": "pos", "children": []}]}]}
{"body": "The cow jumped over the moon.", "children": [{"body": "['The', 'cow', 'jumped', 'over', 'the', 'moon', '.']", "type": "word", "children": [{"body": "('The', 'cow', 'jumped')", "type": "word", "children": []}, {"body": "('cow', 'jumped', 'over')", "type": "word", "children": []}, {"body": "('jumped', 'over', 'the')", "type": "word", "children": []}, {"body": "('over', 'the', 'moon')", "type": "word", "children": []}, {"body": "('the', 'moon', '.')", "type": "word", "children": []}]}, {"body": "[1, 1, 1, 2, 1, 1, '.']", "type": "syllable", "children": [{"body": "(1, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, 2)", "type": "syllable", "children": []}, {"body": "(1, 2, 1)", "type": "syllable", "children": []}, {"body": "(2, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, '.')", "type": "syllable", "children": []}]}, {"body": "['AS.', 'OE.', 'F.', 'AS.', 'AS.', 'OE.']", "type": "etymology", "children": [{"body": "('AS.', 'OE.', 'F.')", "type": "etymology", "children": []}, {"body": "('OE.', 'F.', 'AS.')", "type": "etymology", "children": []}, {"body": "('F.', 'AS.', 'AS.')", "type": "etymology", "children": []}, {"body": "('AS.', 'AS.', 'OE.')", "type": "etymology", "children": []}]}, {"body": "['DT', 'NN', 'VBD', 'IN', 'DT', 'NN', '.']", "type": "pos", "children": [{"body": "('DT', 'NN', 'VBD')", "type": "pos", "children": []}, {"body": "('NN', 'VBD', 'IN')", "type": "pos", "children": []}, {"body": "('VBD', 'IN', 'DT')", "type": "pos", "children": []}, {"body": "('IN', 'DT', 'NN')", "type": "pos", "children": []}, {"body": "('DT', 'NN', '.')", "type": "pos", "children": []}]}]}

0 comments on commit 845de1e

Please sign in to comment.