-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_similarity_VSM.py
173 lines (148 loc) · 6.59 KB
/
doc_similarity_VSM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import time
import numpy as np
from dictionary_builder import DictionaryBuilder
DOCUMENT_FILE = "data/199801_clear_1.txt"
DOCUMENT_FILE_TEST = "data/small_data_for_test.txt" # datafile for testing
class DocSimilarityVSM:
    """Vector-space-model document similarity.

    Every document becomes a sparse tf-idf vector (a dict mapping word ->
    weight) truncated to the words that carry the top 80% of the total
    weight; similarity between two documents is the cosine of the angle
    between their vectors.
    """

    def __init__(self, datafile):
        # DictionaryBuilder is a project-local helper (not visible here);
        # _doc_list is presumably a list of token lists, one per document --
        # TODO confirm against dictionary_builder.py.
        self._word_dict, self._doc_list = DictionaryBuilder(datafile).build_dictionary()
        self._num_doc = len(self._doc_list)
        # per-word document frequency, used for the idf term
        self.doc_frequency = self.cal_doc_frequency()
        # NOTE: this assignment shadows the method of the same name, so the
        # method becomes unreachable on the instance after __init__ finishes.
        self.word_frequency_list = self.word_frequency_list()
        self.all_docs_vector = self.build_doc_vector()
        # must run after all_docs_vector is assigned (it reuses that attribute)
        self.doc_vec_l2norm = self.save_doc_vec_l2norm()

    def print_doc_attr(self):
        """Debug helper: dump the basic corpus attributes."""
        print("num_doc: ", self._num_doc)
        print("doc_frequency: ", self.doc_frequency)
        print("doc_list:", self._doc_list)

    def word_frequency_list(self):
        """Return a list with one {word: count} dict per document."""
        return [self.cal_word_frequency_in_one_doc(doc) for doc in self._doc_list]

    def build_doc_vector(self):
        """Build one sparse tf-idf vector per document.

        Each vector is a dict {word: tf-idf weight}; only the
        highest-weighted words whose cumulative weight stays within 80% of
        the document's total tf-idf mass are kept.
        """
        all_docs_vector = []
        for doc_idx, doc in enumerate(self._doc_list):
            doc_vector_dict = dict()
            for word in doc:
                doc_vector_dict[word] = self.cal_tf_idf(word, self.word_frequency_list[doc_idx])
            sorted_doc_vector_dict = dict(sorted(doc_vector_dict.items(), key=lambda d: d[1], reverse=True))
            # keep words while the running sum has not yet passed 80% of the
            # total weight
            sum_weights = 0
            total_weights = sum(sorted_doc_vector_dict.values()) * 0.8
            doc_vector_dict_reserved = dict()
            for key, value in sorted_doc_vector_dict.items():
                if sum_weights <= total_weights:
                    doc_vector_dict_reserved[key] = value
                    sum_weights += value
            all_docs_vector.append(doc_vector_dict_reserved)
        return all_docs_vector

    def save_doc_vec_l2norm(self):
        """Precompute the L2 norm of every document vector.

        The norms are reused by every cosine_distance call. Bug fix: the
        original called build_doc_vector() again here, silently doubling the
        most expensive step of __init__ (and contradicting this method's own
        stated purpose); we now reuse self.all_docs_vector, which __init__
        assigns immediately before calling this method.
        """
        return [sum(w * w for w in vec.values()) ** 0.5
                for vec in self.all_docs_vector]

    def cosine_distance(self, doc1_id: int, doc2_id: int):
        """Return the cosine similarity between two documents by index.

        Iterates the shorter sparse vector when computing the inner product.
        Falls back to 0 (after printing diagnostics) when either vector has
        zero norm, e.g. for an empty document.
        """
        doc1_vec, doc2_vec = self.all_docs_vector[doc1_id], self.all_docs_vector[doc2_id]
        if len(doc1_vec) <= len(doc2_vec):
            base_vec, cmp_vec = doc1_vec, doc2_vec
        else:
            base_vec, cmp_vec = doc2_vec, doc1_vec
        inner_product = 0
        for word in base_vec:
            if word in cmp_vec:
                inner_product += base_vec[word] * cmp_vec[word]
        vec_norm_product = self.doc_vec_l2norm[doc1_id] * self.doc_vec_l2norm[doc2_id]
        distance = 0
        try:
            distance = inner_product / vec_norm_product
        except ZeroDivisionError as e:
            # a zero-norm vector means an effectively empty doc; report it
            # and return 0 similarity instead of crashing
            print("doc1_id = ", doc1_id, doc1_vec, self._doc_list[doc1_id])
            print("doc2_id = ", doc2_id, doc2_vec, self._doc_list[doc2_id])
            print(e)
        return distance

    def cal_all_docs_similarity(self):
        """Return the symmetric num_doc x num_doc cosine-similarity matrix."""
        time_start = time.time()
        doc_similarity_vec = np.zeros((self._num_doc, self._num_doc), dtype=np.float32)
        for i in range(self._num_doc):
            doc_similarity_vec[i, i] = 1
        for i in range(self._num_doc):
            if i % 100 == 0:
                print("processed %d, time %f" % (i, time.time() - time_start))
            # compute the lower triangle only; mirror into the upper triangle
            for j in range(i):
                doc_similarity = self.cosine_distance(i, j)
                doc_similarity_vec[i][j] = doc_similarity_vec[j][i] = doc_similarity
        return doc_similarity_vec

    def cal_tf_idf(self, word, word_frequency):
        """Return tf-idf = (1 + log tf) * log(N / df), or 0 for absent words."""
        if word not in word_frequency:
            return 0
        word_tf = 1 + np.log(word_frequency[word])
        word_idf = np.log(self._num_doc / self.doc_frequency[word])
        return word_tf * word_idf

    def cal_word_frequency_in_one_doc(self, doc):
        """Return {word: occurrence count} for one tokenized document."""
        word_frequency = dict()
        for word in doc:
            word_frequency[word] = word_frequency.get(word, 0) + 1
        return word_frequency

    def cal_doc_frequency(self):
        """Return {word: number of documents containing it}, sorted descending."""
        word_doc_frequency = dict()
        for doc in self._doc_list:
            # count each word at most once per document
            for word in set(doc):
                word_doc_frequency[word] = word_doc_frequency.get(word, 0) + 1
        return dict(sorted(word_doc_frequency.items(), key=lambda d: d[1], reverse=True))

    def write_vec_to_file(self, file, vec):
        """Persist a similarity matrix as plain text via numpy."""
        np.savetxt(file, vec)

    def analyze_result(self, vec_file: str, doc_id: int):
        """Load a saved matrix and print the docs most similar to doc_id."""
        doc_sim_vec = np.loadtxt(vec_file)
        sim_doc_idx = np.argsort(-doc_sim_vec[doc_id])
        print("base doc:")
        print(self._doc_list[doc_id])
        # index 0 is the doc itself (similarity 1.0), so report from index 1
        print("the most similar doc index: ", sim_doc_idx[1:4])
        print("they are: ")
        for i in sim_doc_idx[1:4]:
            print(self._doc_list[i])
if __name__ == '__main__':
    # Build the model from the full dataset (swap in DOCUMENT_FILE_TEST for a
    # quick smoke test), compute the full pairwise similarity matrix, persist
    # it, then print the documents most similar to document 29.
    # Cleanup: removed the large body of commented-out experiment code that
    # previously cluttered this block; live behavior is unchanged.
    doc_similarity = DocSimilarityVSM(DOCUMENT_FILE)
    all_docs_similarity = doc_similarity.cal_all_docs_similarity()
    doc_similarity.write_vec_to_file("doc_similarity_vec.txt", all_docs_similarity)
    doc_similarity.analyze_result("doc_similarity_vec.txt", 29)