from abc import ABCMeta, abstractmethod
import numpy as np
from gensim import corpora
from gensim.models import ldamodel
from scipy.sparse import vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from utils import *
from nltk.corpus import stopwords

# Max upvote score per subreddit; used by normalize_scores / denormalize_scores below.
topscores = {'Liberal': 106, 'videos': 10341, 'gentlemanboners': 1619,
             'books': 4914, 'Music': 7286, 'politics': 15133, 'nba': 4108,
             'pokemon': 3270, 'funny': 9633, 'technology': 10848,
             'Conservative': 438, 'food': 3358, 'WTF': 11107,
             'worldnews': 10559, 'soccer': 2985, 'gaming': 16413,
             'aww': 7656, 'circlejerk': 3069, 'LadyBoners': 1190,
             'news': 10995, 'television': 9274, 'science': 8965,
             'nfl': 5416, 'pics': 19196, 'movies': 93504}
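
# Illustrative arithmetic only (not from the original code): a 'politics' comment
# with a net score of 1500 would normalize to 1500 / 15133 ~= 0.099, and
# denormalizing multiplies it back up by the same 15133.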


class AbstractFeatureModel(object):
    """
    Interface for all feature extractors. Extend this class to
    create new models for analyzing comment data.
    """
    __metaclass__ = ABCMeta

    @abstractmethod
    def make_training_xy(self, data):
        """
        Extract a feature matrix X and value vector Y from the training data set.

        Args
        ----
        data : dataframe containing comments data and upvote data

        Returns
        -------
        X : numpy array (dims: ncomments x nfeatures)
            each row of X represents the features associated with each comment
        Y : numpy array (dims: ncomments)
            each entry corresponds to the value associated with each comment
            (e.g. normalized upvote score, subreddit id)

        Usage
        -----
        model = MyModel(param1, param2)
        X, Y = model.make_training_xy(data)
        """
        pass

    @abstractmethod
    def data_to_x(self, new_data):
        """
        Extract a feature matrix X from a new data set (for predictions), where
        the number of rows of X equals the number of items in new_data.

        Args
        ----
        new_data : dataframe containing comments data, but no label data (upvotes, subreddit)

        Returns
        -------
        X : feature matrix (num rows equal to number of entries in new_data)
        """
        pass

    @abstractmethod
    def y_to_label(self, data, Y):
        """
        Translate a Y value back into its true representation (e.g. the
        denormalized upvote score, the subreddit name).

        Args
        ----
        data : dataframe containing comments data and upvote data
        Y : array of predicted or target values

        Returns
        -------
        labels : the human readable labels for the given Y values, len(labels) == len(Y)
        """
        pass
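

# --- Usage sketch (illustrative; not part of the original module) ------------
# The concrete models below implement the interface above.  This sketch shows
# the intended train/predict/label round trip.  It assumes `data` and
# `new_data` are pandas DataFrames with 'body', 'net' and 'subreddit' columns
# (which is how the models below index them), and uses sklearn's Ridge purely
# as a placeholder estimator.
def _example_feature_model_usage(data, new_data):
    from sklearn.linear_model import Ridge
    model = BagOfWordsModel(min_df=2)         # any AbstractFeatureModel subclass
    X, Y = model.make_training_xy(data)       # features + normalized scores
    regressor = Ridge().fit(X, Y)             # placeholder estimator
    X_new = model.data_to_x(new_data)         # featurize unseen comments
    Y_pred = regressor.predict(X_new)         # predictions in normalized space
    return model.y_to_label(data, Y_pred)     # back to the raw upvote scale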


class BagOfWordsModel(AbstractFeatureModel):
    """
    Bag-of-words model over comment text, optionally re-weighted with TF-IDF.
    """

    def __init__(self, min_df=0, tfidf=True):
        self.vectorizer = CountVectorizer(min_df=min_df, stop_words='english')
        self.tfidf = tfidf
        self.tfidf_transformer = None

    def make_training_xy(self, data):
        X = self.vectorizer.fit_transform(data.body)
        if self.tfidf:
            # keep the fitted transformer so data_to_x applies the same weighting
            self.tfidf_transformer = TfidfTransformer()
            X = self.tfidf_transformer.fit_transform(X)
        X = X.tocsc()
        Y = normalize_scores(data.net, data.subreddit[0])
        return X, Y

    def data_to_x(self, new_data):
        X = self.vectorizer.transform(new_data.body)
        if self.tfidf_transformer is not None:
            X = self.tfidf_transformer.transform(X)
        return X

    def y_to_label(self, data, Y):
        return denormalize_scores(Y, data.subreddit[0])


class NGramModel(AbstractFeatureModel):
    """
    n-gram model for analyzing text.
    """

    def __init__(self, n, min_df=0, tfidf=True):
        self.n = n
        self.tfidf = tfidf
        self.tfidf_transformer = None
        self.vectorizer = CountVectorizer(ngram_range=(n, n), min_df=min_df, stop_words='english')

    def make_training_xy(self, data):
        X = self.vectorizer.fit_transform(data.body)
        if self.tfidf:
            # keep the fitted transformer so data_to_x applies the same weighting
            self.tfidf_transformer = TfidfTransformer()
            X = self.tfidf_transformer.fit_transform(X)
        X = X.tocsc()
        Y = normalize_scores(data.net, data.subreddit[0])
        return X, Y

    def data_to_x(self, new_data):
        X = self.vectorizer.transform(new_data.body)
        if self.tfidf_transformer is not None:
            X = self.tfidf_transformer.transform(X)
        return X

    def y_to_label(self, data, Y):
        return denormalize_scores(Y, data.subreddit[0])


class LdaFeatureModel(AbstractFeatureModel):
    """
    LDA topic model: each comment is represented by its topic distribution.
    """

    def __init__(self, num_topics=10, printing=False):
        self.lda = None
        self.num_topics = num_topics
        self.printing = printing
        self.stop = set(stopwords.words('english'))

    def make_training_xy(self, data):
        # convert comments to token lists ("docs") for LDA, dropping stopwords
        docs = []
        for post in data.body:
            docs.append(filter(lambda w: w not in self.stop, post.split(" ")))
        # build the dictionary, corpus and LDA model
        self.dictionary = corpora.Dictionary(docs)
        corpus = [self.dictionary.doc2bow(doc) for doc in docs]
        self.lda = ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.num_topics)
        if self.printing:
            print "LDA Topics:"
            for topic in xrange(self.num_topics):
                print "Topic #{}".format(topic),
                print self.lda.print_topic(topic)
        # make X and Y
        X = self.docs_to_lda_matrix(docs)
        Y = normalize_scores(data.net, data.subreddit[0])
        return np.array(X), Y

    def data_to_x(self, new_data):
        # convert comments to token lists ("docs") for LDA, dropping stopwords
        docs = []
        for post in new_data.body:
            docs.append(filter(lambda w: w not in self.stop, post.split(" ")))
        # infer topic proportions for each doc
        X = self.docs_to_lda_matrix(docs)
        return np.array(X)

    def y_to_label(self, data, Y):
        return denormalize_scores(Y, data.subreddit[0])

    def docs_to_lda_matrix(self, docs):
        # one row per doc: the weight assigned to each of the num_topics topics
        res = []
        for doc in docs:
            row = [0.0] * self.num_topics
            pred = self.lda[self.dictionary.doc2bow(doc)]
            # pred is a list of (topic_id, probability) tuples
            for t in pred:
                row[t[0]] = t[1]
            res.append(row)
        return res
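

# Sketch of the resulting feature layout (illustrative; _show_lda_features is
# not part of the original module): with num_topics=10, docs_to_lda_matrix
# turns N comments into an N x 10 dense matrix whose rows hold each comment's
# topic proportions (roughly summing to 1, with negligible topics left at 0.0).
def _show_lda_features(data):
    model = LdaFeatureModel(num_topics=10, printing=False)
    X, Y = model.make_training_xy(data)
    print X.shape       # (ncomments, 10)
    print X[0]          # topic proportions for the first comment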


class CooccurenceModel(AbstractFeatureModel):
    """
    Co-occurrence model for analyzing text: features are word-pair
    co-occurrence weights within each comment.
    """

    def __init__(self, min_df=0):
        # build bag of words model
        self.bow_model = BagOfWordsModel(min_df)

    def make_training_xy(self, data):
        # get X/Y from bow_model
        bow_X, bow_Y = self.bow_model.make_training_xy(data)
        num_rows = bow_X.shape[0]
        num_features = bow_X.shape[1]
        bow_X_col = bow_X.transpose(copy=True)
        # store sparse rows in a list
        rows = []
        # iterate over sparse rows
        for i in xrange(num_rows):
            # outer product of row i with itself gives its co-occurrence matrix;
            # reshape it into a single row and add it to the rows list
            cooc_matrix_row = coo_reshape(bow_X_col.getcol(i) * bow_X.getrow(i),
                                          (1, num_features * num_features)).tocsc()
            rows.append(cooc_matrix_row)
        # stack rows
        cooc_matrix = vstack(rows)
        # TODO: should we remove duplicates? e.g. A/B and B/A?
        return cooc_matrix, bow_Y

    def data_to_x(self, new_data):
        # get counts from bow model
        bow_X = self.bow_model.data_to_x(new_data)
        num_rows = bow_X.shape[0]
        num_features = bow_X.shape[1]
        bow_X_col = bow_X.transpose(copy=True)
        rows = []
        for i in xrange(num_rows):
            # outer product of row i with itself, reshaped into a single row
            cooc_matrix_row = coo_reshape(bow_X_col.getcol(i) * bow_X.getrow(i),
                                          (1, num_features * num_features)).tocsc()
            rows.append(cooc_matrix_row)
        return vstack(rows)

    def y_to_label(self, data, Y):
        return denormalize_scores(Y, data.subreddit[0])
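
# Note on dimensionality (illustrative arithmetic, not from the original code):
# co-occurrence features live in a num_features ** 2 space, so even a modest
# 10,000-word vocabulary yields 100,000,000 columns per row; the sparse
# row-by-row construction above is what keeps this representation tractable.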
"""
normalize_scores
Normalizes the score based on the max upvotes in the given subreddit.
@param: ups (array of upvote scores), subreddit (name of subreddit)
@ret: array of normalized scores
"""
def normalize_scores(ups, subreddit):
return [float(x)/topscores[subreddit] for x in ups]
def denormalize_scores(norms, subreddit):
return [x * topscores[subreddit] for x in norms]
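

# Minimal smoke test (a sketch, not part of the original module): builds a tiny
# pandas DataFrame in the shape the models expect ('body', 'net', 'subreddit')
# and runs the bag-of-words pipeline end to end.  The three example comments
# and their scores are made up purely for illustration.
if __name__ == '__main__':
    import pandas as pd
    toy = pd.DataFrame({
        'body': ['great goal in the last minute',
                 'the referee missed an obvious handball',
                 'what a great save by the keeper'],
        'net': [120, 45, 300],
        'subreddit': ['soccer', 'soccer', 'soccer'],
    })
    bow = BagOfWordsModel(min_df=1)
    X, Y = bow.make_training_xy(toy)
    print X.shape                      # (3, vocabulary size)
    print Y                            # scores normalized by topscores['soccer']
    print bow.y_to_label(toy, Y)       # back to the original upvote scale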