import math
import operator
import unicodedata as ucd  # inspect Latin-1/UTF-8 mojibake such as b'\xc2\xa37m'
'''
usage (Python 3): the byte string b'\xc2\xa37m' is UTF-8 for '£7m';
decoding it as Latin-1 first yields the mojibake 'Â£7m'.
strange_str = b'\xc2\xa37m'.decode('latin-1')
print(ucd.name(strange_str[0]))                       # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
print(strange_str.encode('latin-1').decode('utf-8'))  # '£7m'
'''
'''
read in 2D synthetic CSV data
'''
def read_csv(filename):
    GM_set = set()
    with open(filename, 'r') as file:
        for i, line in enumerate(file):
            # skip the two header lines
            if i < 2:
                continue
            # expected line format: "class_label,x1,x2"
            fields = line.split(",")
            if len(fields) != 3:
                raise Exception("raw data has incorrect number of fields: " + line)
            class_label = int(fields[0])
            x1 = float(fields[1])
            x2 = float(fields[2].strip())
            GM_set.add((class_label, x1, x2))
    return GM_set
################## process BBC news data ################################
'''
parse BBC .mtx
"termid docid frequency"
'''
def read_in_mtx(filename, has_header):
    """
    Reads a term-document matrix file into a nested dict.
    @param filename: string - name of file (with path) to read
    @param has_header: boolean - whether the file has a two-line header
    """
    documents = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            if i < 2 and has_header:
                continue
            parse_in_mtx(line, documents)
    return documents
def parse_in_mtx(line, documents):
    """
    Reads one line of a .mtx file into the documents hash.
    @param line: string - line to parse
    @param documents: dict - hash of documents (doc id -> term id -> count)
    """
    fields = line.split(" ")
    if len(fields) != 3:
        raise Exception("Matrix line has incorrect number of fields: " + line)
    term_id = int(fields[0])
    doc_id = int(fields[1])
    count = float(fields[2].rstrip())
    if doc_id not in documents:
        documents[doc_id] = dict()
    documents[doc_id][term_id] = count
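# A minimal, illustrative check of parse_in_mtx() on a hypothetical line
# (toy values, not taken from bbc.mtx); run it manually if desired.
def _demo_parse_in_mtx():
    docs = dict()
    parse_in_mtx("3 7 2\n", docs)  # term 3 appears twice in doc 7
    assert docs == {7: {3: 2.0}}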
'''
vector representation for a document using tfidf
tfidf_dict: {termid: {docid: tfidf value}}
return: list of dense tuples, one vector of length V (vocabulary size) per doc
'''
def doc_to_vec(documents, tfidf_dict):
    dim = len(tfidf_dict)
    doc_vec = list()
    doc_count = len(documents)
    # assumes doc ids run 1..doc_count and term ids run 1..dim
    for docid in range(doc_count):
        temp_list = tuple()
        for termid in range(dim):
            # termid starts from 1
            if (docid + 1) in tfidf_dict[termid + 1]:
                temp_list = temp_list + (tfidf_dict[termid + 1][docid + 1],)
            else:
                temp_list = temp_list + (0.0,)
        doc_vec.append(temp_list)
    return doc_vec
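# An illustrative sketch of doc_to_vec() on hypothetical toy counts (not BBC
# data); it relies on tfidf() defined further below, so call it only manually.
def _demo_doc_to_vec():
    toy_docs = {1: {1: 2.0}, 2: {1: 1.0, 2: 5.0}}
    tfidf_dict, _ = tfidf(toy_docs, {1: 0, 2: 0})
    vecs = doc_to_vec(toy_docs, tfidf_dict)
    # term 1 appears in every doc, so its idf (and hence tfidf) is 0 everywhere
    assert vecs == [(0.0, 0.0), (0.0, math.log(2.0))]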
def parse_in_classes(line, documents):
    """
    Reads one line of a .classes file into the documents hash.
    @param line: string - line to parse
    @param documents: dict - hash of documents (doc id -> class label)
    """
    fields = line.split(" ")
    if len(fields) != 2:
        raise Exception("Class line has incorrect number of fields: " + line)
    doc_id = int(fields[0])
    label = int(fields[1].rstrip())
    documents[doc_id] = label
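# A minimal check of parse_in_classes() on a hypothetical line (toy values):
def _demo_parse_in_classes():
    labels = dict()
    parse_in_classes("12 4\n", labels)  # doc 12 -> class 4
    assert labels == {12: 4}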
def parse_in_center(line):
    """
    Parses one line of a .centers file into a cluster center.
    @param line: string - line to parse
    @return center: tuple (x1, x2, ..., xn)
    """
    fields = line.split(" ")
    dim = len(fields)
    # bbc.centers rows are expected to have 99 coordinates
    if dim != 99:
        raise Exception("Center line has incorrect number of fields: " + line)
    center = tuple()
    for i in range(dim):
        center = center + (float(fields[i]),)
    return center
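# An illustrative check of parse_in_center() on a synthetic 99-field line
# (hypothetical coordinates, not taken from bbc.centers):
def _demo_parse_in_center():
    line = " ".join(["0.5"] * 99)
    center = parse_in_center(line)
    assert len(center) == 99 and center[0] == 0.5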
'''
read in .terms file in BBC data
@ return {termid: term}; note that termid starts from 1
ref: https://docs.python.org/3/library/codecs.html#standard-encodings
'''
def read_in_terms(filename, has_header):
    documents = dict()  # termid: term
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line = line.rstrip()  # strip '\n'
            if i < 2 and has_header:
                continue
            documents[i + 1] = line
    return documents
'''
read in .classes file in BBC data
@ return {docid: class label}
'''
def read_in_classes(filename, has_header):
    documents = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line = line.rstrip()  # strip '\n'
            if i < 2 and has_header:
                continue
            parse_in_classes(line, documents)
    return documents
'''
read in .centers file in BBC data
return the cluster centers as a list of tuples
'''
def read_in_centers(filename, has_header):
    centers = list()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            line = line.rstrip()  # strip '\n'
            if i < 2 and has_header:
                continue
            centers.append(parse_in_center(line))
    return centers
'''
ref: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
tf(t,d) = f(t,d) / max{f(w,d) : w in d}
@ documents: documents[doc_id][term_id] = count
@ return {termid: {docid: tf}}
'''
def tf(documents):
    tf_dict = dict()  # {termid: {docid: tf}}
    for docid, val in documents.items():
        # frequency of the most frequent term in this doc
        max_freq = float(max(val.values()))
        for termid, count in val.items():
            if termid not in tf_dict:
                tf_dict[termid] = dict()
            tf_dict[termid][docid] = count / max_freq
    return tf_dict
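# A toy sanity check for tf() (hypothetical counts, not BBC data):
# doc 1 contains term 1 three times and term 2 once, so tf = count / max count.
def _demo_tf():
    toy_docs = {1: {1: 3.0, 2: 1.0}}
    assert tf(toy_docs) == {1: {1: 1.0}, 2: {1: 1.0 / 3.0}}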
'''
idf(t,D) = log( N / |{d in D : t in d}| )
@ N: total number of documents in the corpus
@ |{d in D : t in d}|: number of documents in which term t appears.
If the term is not in the corpus, this leads to division by zero, so it
is common to adjust the denominator to 1 + |{d in D : t in d}|.
idf is the logarithmically scaled inverse fraction of the documents that
contain the term: divide the total number of documents by the number of
documents containing the term, then take the logarithm of that quotient.
@ documents: documents[doc_id][term_id] = count
@ return {termid: idf value}
'''
def idf(documents):
    N = len(documents)
    idf_dict = dict()   # {termid: idf value}
    doc_count = dict()  # {termid: number of docs containing the term}
    # count, for each term, the documents it appears in
    for docid, val in documents.items():
        for termid, count in val.items():
            doc_count[termid] = doc_count.get(termid, 0.0) + 1.0
    for termid, count in doc_count.items():
        if count != 0:
            idf_dict[termid] = math.log(N / count)
        else:
            # unreachable in practice: doc_count only holds terms that appear
            print("term count is 0 in a doc, termid", termid)
            idf_dict[termid] = float("inf")
    return idf_dict
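# A toy sanity check for idf() (hypothetical counts, not BBC data):
# term 1 appears in both docs (idf = log 1 = 0), term 2 in one of two.
def _demo_idf():
    toy_docs = {1: {1: 2.0}, 2: {1: 1.0, 2: 5.0}}
    assert idf(toy_docs) == {1: 0.0, 2: math.log(2.0)}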
'''
Convert the term-doc-frequency matrix into a term-doc-tfidf matrix
return @ tfidf_dict: {termid: {docid: tfidf value}}
return @ avg_tfidf_dict: {termid: {class_label: avg tfidf value}}
(avg tfidf = sum of a term's tfidf over the docs in a class,
divided by the number of docs in that class)
'''
def tfidf(documents, bbc_classes):
    idf_dict = idf(documents)
    tf_dict = tf(documents)
    tfidf_dict = dict()
    avg_tfidf_dict = dict()    # {termid: {class label: tfidf sum, then avg}}
    class_count_dict = dict()  # {class_label: number of docs in the class}
    for docid, val in documents.items():
        class_label = bbc_classes[docid]
        # count each doc toward its class exactly once
        class_count_dict[class_label] = class_count_dict.get(class_label, 0) + 1
        for termid, count in val.items():
            if termid not in tfidf_dict:
                tfidf_dict[termid] = dict()
            tfidf_dict[termid][docid] = tf_dict[termid][docid] * idf_dict[termid]
            if termid not in avg_tfidf_dict:
                avg_tfidf_dict[termid] = dict()
            # accumulate this term's tfidf over the docs in the class
            avg_tfidf_dict[termid][class_label] = (
                avg_tfidf_dict[termid].get(class_label, 0.0)
                + tfidf_dict[termid][docid])
    # divide each per-class sum by the class size to get the average
    for termid, label_sums in avg_tfidf_dict.items():
        for label in label_sums:
            label_sums[label] = label_sums[label] / class_count_dict[label]
    return tfidf_dict, avg_tfidf_dict
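# A toy sanity check for tfidf() (hypothetical counts, not BBC data);
# both docs sit in class 0, so class 0 has 2 docs.
def _demo_tfidf():
    toy_docs = {1: {1: 2.0}, 2: {1: 1.0, 2: 5.0}}
    tfidf_dict, avg_dict = tfidf(toy_docs, {1: 0, 2: 0})
    assert tfidf_dict[2][2] == math.log(2.0)      # tf 1.0 * idf log(2/1)
    assert avg_dict[2][0] == math.log(2.0) / 2.0  # summed tfidf / 2 docs in class 0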
'''
For each class Ci,
return @ sorted avg tfidf {label: [(termid, avg tfidf)]}, highest first
'''
def sort_avg_tfidf(term_freq_mtx, bbc_classes):
    tfidf_dict, avg_tfidf_dict = tfidf(term_freq_mtx, bbc_classes)
    # avg_tfidf_dict: {termid: {class_label: avg tfidf value}}
    # invert it to {label: [(termid, avg tfidf)]}
    reverse_avgtfidf_dict = dict()
    for termid, val in avg_tfidf_dict.items():
        for label, avg_tfidf in val.items():
            if label not in reverse_avgtfidf_dict:
                reverse_avgtfidf_dict[label] = list()
            reverse_avgtfidf_dict[label].append((termid, avg_tfidf))
    sorted_tfidf = dict()  # {label: [(termid, avg tfidf)]}, highest first
    for label, vals in reverse_avgtfidf_dict.items():
        # sort by the 2nd field: avg tfidf
        sorted_tfidf[label] = sorted(vals, key=operator.itemgetter(1), reverse=True)
    return sorted_tfidf
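# An illustrative check of sort_avg_tfidf() on the same hypothetical toy
# corpus used above: within class 0, term 2 outranks term 1 because its
# average tfidf is higher.
def _demo_sort_avg_tfidf():
    toy_docs = {1: {1: 2.0}, 2: {1: 1.0, 2: 5.0}}
    ranked = sort_avg_tfidf(toy_docs, {1: 0, 2: 0})
    assert [termid for termid, _ in ranked[0]] == [2, 1]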
'''
given a termid, return the term
'''
def get_term(termid, bbc_terms):
    return bbc_terms[termid]
'''
get the class name given a label:
business, entertainment, politics, sport, tech
'''
def get_class_by_label(label):
    class_dict = {0: "business", 1: "entertainment", 2: "politics",
                  3: "sport", 4: "tech"}
    return class_dict[label]
########################## main ###################################
def main():
    # filename = "2DGaussianMixture.csv"
    # GM_set = read_csv(filename)
    # print(len(GM_set))
    mtx_file = "bbc.mtx"
    term_freq_mtx = read_in_mtx(mtx_file, True)
    termfile = "bbc.terms"
    bbc_terms = read_in_terms(termfile, False)
    class_file = "bbc.classes"
    bbc_classes = read_in_classes(class_file, False)
    print("bbc_classes", bbc_classes)
    center_file = "bbc.centers"
    centers = read_in_centers(center_file, False)
    # print("centers", centers)
    # For each class Ci, report the 5 terms with the highest AvgTfidf for the class.
    # sorted_tfidf: {label: [(termid, avg_tfidf)]}
    sorted_tfidf = sort_avg_tfidf(term_freq_mtx, bbc_classes)
    for label, val in sorted_tfidf.items():
        print("label:", get_class_by_label(label))
        for termid, avg in val[:5]:
            term = bbc_terms[termid]
            print("term, avg tfidf", term, avg)

if __name__ == "__main__":
    main()