-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpmi.py
54 lines (47 loc) · 2.25 KB
/
pmi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
# coding: utf-8
import nltk
import pandas as pd
import numpy as np
from nltk.collocations import *
from nltk.corpus import stopwords
import os
import codecs
bigram_measures = nltk.collocations.BigramAssocMeasures()
#trigram_measures = nltk.collocations.TrigramAssocMeasures()
files = {}
pmi_list = {}
input_folder = '2016_Data_Hillary Clinton'
for filename in os.listdir(input_folder):
with open("{}/{}".format(input_folder, filename), 'rb') as fp:
files[filename] = fp.read().decode('latin-1')
for filename,text in files.items():
tokens = nltk.wordpunct_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
custom_stopwords = {",", "'",";" ,".",":","-","\x96","t","m","!",'"',"%","3"
,"4","];","$",'â','\x80\x94', '(',"100", "33","000","/"
,"11","e","[", "?","]",'."',"said","audience","yes","know"
,"going","applause",'.!','u',"--","see","1","2","mean","get"
,"wants","thank","across","tell","us","say","words","let"
,'...[','k','12', '...','video','**[','click','\x80','\x93',')','also','10','matthews'}
stop_words = stop_words.union(custom_stopwords)
filtered_tokens = []
for word in tokens:
if word not in stop_words:
filtered_tokens.append(word)
finder = BigramCollocationFinder.from_words(filtered_tokens)
finder.apply_freq_filter(3)
pmi_list[filename] = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10]
# pmi_list
#finding collocations words
#extract n-grams from your data and then find the ones that have the highest point wise mutual information (PMI)
#Collocations are expressions of multiple words which commonly co-occur
# finder from manually-derived FreqDists:
word_fd = nltk.FreqDist(filtered_tokens)
bigram_fd = nltk.FreqDist(nltk.bigrams(filtered_tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
sorted(finder.nbest(bigram_measures.raw_freq, 10))
#spanning intervening words:
ignored_words = nltk.corpus.stopwords.words('english')
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
finder.nbest(bigram_measures.likelihood_ratio, 20)