diff --git a/.gitignore b/.gitignore
index 853207f..be05216 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-slda_input_files/termite_files/*.txt
+slda_input_files/termite_files/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/features/analyzer.py b/features/analyzer.py
index 8c93933..bc0f864 100644
--- a/features/analyzer.py
+++ b/features/analyzer.py
@@ -236,6 +236,13 @@ def stress_counts_by_syllable(text, SECONDARY=True):
 #
 
 
+def etymology_with_words(text):
+    exclude = set(punctuation)
+    text = ''.join(ch for ch in text if ch not in exclude)
+    text = word_tokenize(text)
+    return map(lambda w: (w, e.lookup(w)), text)
+
+
 def etymology_representation(text):
     exclude = set(punctuation)
     text = ''.join(ch for ch in text if ch not in exclude)
diff --git a/features/etymology_dict.py b/features/etymology_dict.py
index 15ef12e..602200d 100644
--- a/features/etymology_dict.py
+++ b/features/etymology_dict.py
@@ -21,14 +21,18 @@ def lookup(self, s):
         self.load()
 
         try:
+            print "Using self"
             return self.d[s.lower()]
         except KeyError:
             try:
+                print "Using stemmer: " + self.stemmer.stem(s).lower()
                 return self.d[self.stemmer.stem(s).lower()]
             except KeyError:
                 try:
+                    print "Using lemmatizer: " + self.lemmatizer.lemmatize(s).lower()
                     return self.d[self.lemmatizer.lemmatize(s).lower()]
                 except KeyError:
                     (score, match) = max((ratio(s, t), t) for t in self.d)
                     self.d[s] = self.d[match]
+                    print "Using Levenshtein: " + match
                     return self.d[match]
diff --git a/features/extract.py b/features/extract.py
index 74623e0..7fb55f0 100644
--- a/features/extract.py
+++ b/features/extract.py
@@ -9,11 +9,21 @@ def find_all_matching_ngrams(text, ngram, ngram_extractor, PUNC=True):
     n = len(ngram)
     word_ngrams = analyzer.word_ngrams(text, n, PUNC=PUNC)
     target_ngrams = ngram_extractor(text, n)
+    print word_ngrams, target_ngrams
+    return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])
+
+
+def find_matches_for_pairs(pairs, ngram):
+    n = len(ngram)
+    split = zip(*pairs)
+    word_ngrams = analyzer.to_ngrams(split[0], n)
+    target_ngrams = analyzer.to_ngrams(split[1], n)
     return list(zip(*filter(lambda x: x[1] == ngram, zip(word_ngrams, target_ngrams)))[0])
 
 
 def find_pos_ngram(text, ngram):
-    return find_all_matching_ngrams(text, ngram, analyzer.pos_ngrams)
+    pairs = analyzer.tag_text(text)
+    return find_matches_for_pairs(pairs, ngram)
 
 
 def find_syllable_ngram(text, ngram):
@@ -21,7 +31,8 @@ def find_syllable_ngram(text, ngram):
 
 
 def find_etymology_ngram(text, ngram):
-    return find_all_matching_ngrams(text, ngram, analyzer.etymology_ngrams, PUNC=False)
+    pairs = analyzer.etymology_with_words(text)
+    return find_matches_for_pairs(pairs, ngram)
 
 
 def sentence_ngrams(text, n):
diff --git a/poster/dendrogram.png b/poster/dendrogram.png
index bc029fd..cdc2f8a 100644
Binary files a/poster/dendrogram.png and b/poster/dendrogram.png differ
diff --git a/visualization/examples/dendrogram_horizontal.html b/visualization/examples/dendrogram_horizontal.html
index 0bd9ef0..76b0898 100644
--- a/visualization/examples/dendrogram_horizontal.html
+++ b/visualization/examples/dendrogram_horizontal.html
@@ -24,6 +24,7 @@