More feature extraction cleanup and visualization tweaks

dmrd · May 11, 2014 · 845de1e
1 parent eed5321
commit 845de1e
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-slda_input_files/termite_files/*.txt
+slda_input_files/termite_files/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/features/analyzer.py b/features/analyzer.py
@@ -236,6 +236,13 @@ def stress_counts_by_syllable(text, SECONDARY=True):
 #
 
 
+def etymology_with_words(text):
+    exclude = set(punctuation)
+    text = ''.join(ch for ch in text if ch not in exclude)
+    text = word_tokenize(text)
+    return map(lambda w: (w, e.lookup(w)), text)
+
+
 def etymology_representation(text):
     exclude = set(punctuation)
     text = ''.join(ch for ch in text if ch not in exclude)

diff --git a/features/etymology_dict.py b/features/etymology_dict.py
@@ -21,14 +21,18 @@ def lookup(self, s):
             self.load()
 
         try:
+            print "Using self"
             return self.d[s.lower()]
         except KeyError:
             try:
+                print "Using stemmer: " + self.stemmer.stem(s).lower()
                 return self.d[self.stemmer.stem(s).lower()]
             except KeyError:
                 try:
+                    print "Using lemmatizer: " + self.lemmatizer.lemmatize(s).lower()
                     return self.d[self.lemmatizer.lemmatize(s).lower()]
                 except KeyError:
                     (score, match) = max((ratio(s, t), t) for t in self.d)
                     self.d[s] = self.d[match]
+                    print "Using Levenshtein: " + match
                     return self.d[match]
diff --git a/features/extract.py b/features/extract.py
@@ -9,19 +9,30 @@ def find_all_matching_ngrams(text, ngram, ngram_extractor, PUNC=True):
     n = len(ngram)
     word_ngrams = analyzer.word_ngrams(text, n, PUNC=PUNC)
     target_ngrams = ngram_extractor(text, n)
+    print word_ngrams, target_ngrams
+    return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])
+
+
+def find_matches_for_pairs(pairs, ngram):
+    n = len(ngram)
+    split = zip(*pairs)
+    word_ngrams = analyzer.to_ngrams(split[0], n)
+    target_ngrams = analyzer.to_ngrams(split[1], n)
     return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])
 
 
 def find_pos_ngram(text, ngram):
-    return find_all_matching_ngrams(text, ngram, analyzer.pos_ngrams)
+    pairs = analyzer.tag_text(text)
+    return find_matches_for_pairs(pairs, ngram)
 
 
 def find_syllable_ngram(text, ngram):
     return find_all_matching_ngrams(text, ngram, analyzer.syllable_ngrams)
 
 
 def find_etymology_ngram(text, ngram):
-    return find_all_matching_ngrams(text, ngram, analyzer.etymology_ngrams, PUNC=False)
+    pairs = analyzer.etymology_with_words(text)
+    return find_matches_for_pairs(pairs, ngram)
 
 
 def sentence_ngrams(text, n):

diff --git a/poster/dendrogram.png b/poster/dendrogram.png
diff --git a/visualization/examples/dendrogram_horizontal.html b/visualization/examples/dendrogram_horizontal.html
@@ -24,6 +24,7 @@
 <script>
 
 var colors = { "syllable": "coral", "pos": "green", "etymology": "red", "word": "steelblue" };
+var descriptions = { "syllable": "Syllables", "pos": "Part-of-Speech", "etymology": "Etymology", "word": "Words" };
 
 var width = 800,
     height = 700;
@@ -64,8 +65,27 @@
 
   node.append("text")
       .attr("dy", -8)
-      .style("text-anchor", function(d) { return d.children ? "middle" : "middle"; })
+      .style("text-anchor", "middle")
       .text(function(d) { return d.body; });
+
+  node.filter(function(d) { return d.children })
+      .append("text")
+      .attr("dy", 22)
+      .style("text-anchor", "middle")
+      .text(function(d) { return descriptions[d.type]; });
+
+  var y = 175;
+  var lineData = [{'x': -100, 'y': y}, {'x': width + 100, 'y': y}];
+  var lineFunction = d3.svg.line()
+    .x(function(d) { return d.x; })
+    .y(function(d) { return d.y; })
+    .interpolate("linear");
+  console.log(lineData);
+  svg.append("path")
+    .style("stroke-dasharray", (3, 3))
+    .attr("d", lineFunction(lineData))
+    .attr("stroke", "firebrick")
+    .attr("stroke-width", 1);
 });
 
 d3.select(self.frameElement).style("height", height + "px");

diff --git a/visualization/examples/output.json b/visualization/examples/output.json
@@ -1 +1 @@
-{"body": "The cow jumped over the moon.", "type": "text", "children": [{"body": "[1,  1,  1,  2,  1,  1,  '.']", "type": "syllable", "children": [{"body": "(1,  1,  1)", "type": "syllable", "children": []}, {"body": "(1,  1,  2)", "type": "syllable", "children": []}, {"body": "(1,  2,  1)", "type": "syllable", "children": []}, {"body": "(2,  1,  1)", "type": "syllable", "children": []}, {"body": "(1,  1,  '.')", "type": "syllable", "children": []}]}, {"body": "['AS.',  'OE.',  'F.',  'AS.',  'AS.',  'OE.']", "type": "etymology", "children": [{"body": "('AS.',  'OE.',  'F.')", "type": "etymology", "children": []}, {"body": "('OE.',  'F.',  'AS.')", "type": "etymology", "children": []}, {"body": "('F.',  'AS.',  'AS.')", "type": "etymology", "children": []}, {"body": "('AS.',  'AS.',  'OE.')", "type": "etymology", "children": []}]}, {"body": "['The',  'cow',  'jumped',  'over',  'the',  'moon',  '.']", "type": "word", "children": [{"body": "('The',  'cow',  'jumped')", "type": "word", "children": []}, {"body": "('cow',  'jumped',  'over')", "type": "word", "children": []}, {"body": "('jumped',  'over',  'the')", "type": "word", "children": []}, {"body": "('over',  'the',  'moon')", "type": "word", "children": []}, {"body": "('the',  'moon',  '.')", "type": "word", "children": []}]}, {"body": "['DT',  'NN',  'VBD',  'IN',  'DT',  'NN',  '.']", "type": "pos", "children": [{"body": "('DT',  'NN',  'VBD')", "type": "pos", "children": []}, {"body": "('NN',  'VBD',  'IN')", "type": "pos", "children": []}, {"body": "('VBD',  'IN',  'DT')", "type": "pos", "children": []}, {"body": "('IN',  'DT',  'NN')", "type": "pos", "children": []}, {"body": "('DT',  'NN',  '.')", "type": "pos", "children": []}]}]}
+{"body": "The cow jumped over the moon.", "children": [{"body": "['The',  'cow',  'jumped',  'over',  'the',  'moon',  '.']", "type": "word", "children": [{"body": "('The',  'cow',  'jumped')", "type": "word", "children": []}, {"body": "('cow',  'jumped',  'over')", "type": "word", "children": []}, {"body": "('jumped',  'over',  'the')", "type": "word", "children": []}, {"body": "('over',  'the',  'moon')", "type": "word", "children": []}, {"body": "('the',  'moon',  '.')", "type": "word", "children": []}]}, {"body": "[1,  1,  1,  2,  1,  1,  '.']", "type": "syllable", "children": [{"body": "(1,  1,  1)", "type": "syllable", "children": []}, {"body": "(1,  1,  2)", "type": "syllable", "children": []}, {"body": "(1,  2,  1)", "type": "syllable", "children": []}, {"body": "(2,  1,  1)", "type": "syllable", "children": []}, {"body": "(1,  1,  '.')", "type": "syllable", "children": []}]}, {"body": "['AS.',  'OE.',  'F.',  'AS.',  'AS.',  'OE.']", "type": "etymology", "children": [{"body": "('AS.',  'OE.',  'F.')", "type": "etymology", "children": []}, {"body": "('OE.',  'F.',  'AS.')", "type": "etymology", "children": []}, {"body": "('F.',  'AS.',  'AS.')", "type": "etymology", "children": []}, {"body": "('AS.',  'AS.',  'OE.')", "type": "etymology", "children": []}]}, {"body": "['DT',  'NN',  'VBD',  'IN',  'DT',  'NN',  '.']", "type": "pos", "children": [{"body": "('DT',  'NN',  'VBD')", "type": "pos", "children": []}, {"body": "('NN',  'VBD',  'IN')", "type": "pos", "children": []}, {"body": "('VBD',  'IN',  'DT')", "type": "pos", "children": []}, {"body": "('IN',  'DT',  'NN')", "type": "pos", "children": []}, {"body": "('DT',  'NN',  '.')", "type": "pos", "children": []}]}]}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"body": "The cow jumped over the moon.", "type": "text", "children": [{"body": "[1, 1, 1, 2, 1, 1, '.']", "type": "syllable", "children": [{"body": "(1, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, 2)", "type": "syllable", "children": []}, {"body": "(1, 2, 1)", "type": "syllable", "children": []}, {"body": "(2, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, '.')", "type": "syllable", "children": []}]}, {"body": "['AS.', 'OE.', 'F.', 'AS.', 'AS.', 'OE.']", "type": "etymology", "children": [{"body": "('AS.', 'OE.', 'F.')", "type": "etymology", "children": []}, {"body": "('OE.', 'F.', 'AS.')", "type": "etymology", "children": []}, {"body": "('F.', 'AS.', 'AS.')", "type": "etymology", "children": []}, {"body": "('AS.', 'AS.', 'OE.')", "type": "etymology", "children": []}]}, {"body": "['The', 'cow', 'jumped', 'over', 'the', 'moon', '.']", "type": "word", "children": [{"body": "('The', 'cow', 'jumped')", "type": "word", "children": []}, {"body": "('cow', 'jumped', 'over')", "type": "word", "children": []}, {"body": "('jumped', 'over', 'the')", "type": "word", "children": []}, {"body": "('over', 'the', 'moon')", "type": "word", "children": []}, {"body": "('the', 'moon', '.')", "type": "word", "children": []}]}, {"body": "['DT', 'NN', 'VBD', 'IN', 'DT', 'NN', '.']", "type": "pos", "children": [{"body": "('DT', 'NN', 'VBD')", "type": "pos", "children": []}, {"body": "('NN', 'VBD', 'IN')", "type": "pos", "children": []}, {"body": "('VBD', 'IN', 'DT')", "type": "pos", "children": []}, {"body": "('IN', 'DT', 'NN')", "type": "pos", "children": []}, {"body": "('DT', 'NN', '.')", "type": "pos", "children": []}]}]}
		{"body": "The cow jumped over the moon.", "children": [{"body": "['The', 'cow', 'jumped', 'over', 'the', 'moon', '.']", "type": "word", "children": [{"body": "('The', 'cow', 'jumped')", "type": "word", "children": []}, {"body": "('cow', 'jumped', 'over')", "type": "word", "children": []}, {"body": "('jumped', 'over', 'the')", "type": "word", "children": []}, {"body": "('over', 'the', 'moon')", "type": "word", "children": []}, {"body": "('the', 'moon', '.')", "type": "word", "children": []}]}, {"body": "[1, 1, 1, 2, 1, 1, '.']", "type": "syllable", "children": [{"body": "(1, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, 2)", "type": "syllable", "children": []}, {"body": "(1, 2, 1)", "type": "syllable", "children": []}, {"body": "(2, 1, 1)", "type": "syllable", "children": []}, {"body": "(1, 1, '.')", "type": "syllable", "children": []}]}, {"body": "['AS.', 'OE.', 'F.', 'AS.', 'AS.', 'OE.']", "type": "etymology", "children": [{"body": "('AS.', 'OE.', 'F.')", "type": "etymology", "children": []}, {"body": "('OE.', 'F.', 'AS.')", "type": "etymology", "children": []}, {"body": "('F.', 'AS.', 'AS.')", "type": "etymology", "children": []}, {"body": "('AS.', 'AS.', 'OE.')", "type": "etymology", "children": []}]}, {"body": "['DT', 'NN', 'VBD', 'IN', 'DT', 'NN', '.']", "type": "pos", "children": [{"body": "('DT', 'NN', 'VBD')", "type": "pos", "children": []}, {"body": "('NN', 'VBD', 'IN')", "type": "pos", "children": []}, {"body": "('VBD', 'IN', 'DT')", "type": "pos", "children": []}, {"body": "('IN', 'DT', 'NN')", "type": "pos", "children": []}, {"body": "('DT', 'NN', '.')", "type": "pos", "children": []}]}]}