#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File : vsm.py
@Description : Implementation of the vector space model (VSM)
@Date : 2021/10/09 21:33:47
@Author : ZelKnow
@Github : https://github.com/ZelKnow
"""
__author__ = "ZelKnow"
from collections import defaultdict
import math
import time
from util.dataloader import load_data
from multiprocessing import Pool


def df2idf(docfreq, totaldocs, log_base=2.0):
    """Compute the inverse document frequency (idf) of a term.

    Args:
        docfreq (int): number of documents in which the term appears
        totaldocs (int): total number of documents
        log_base (float, optional): base of the logarithm. Defaults to 2.0.

    Returns:
        float: idf value
    """
return math.log(float(totaldocs) / docfreq) / math.log(log_base)
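
# Illustrative example (not part of the original module): with 8 documents in
# total and a term that appears in exactly one of them,
# df2idf(1, 8) returns log2(8 / 1) = 3.0.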


def compute_dot_product(x, y):
    """Compute the sum of the products of the values that share the same key,
    i.e. the dot product of two sparse vectors represented as dicts.

    Args:
        x (dict): input dict
        y (dict): input dict

    Returns:
        float: dot product value
    """
intersection = x.keys() & y.keys()
return sum([x[k] * y[k] for k in intersection])
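
# Illustrative example (not part of the original module): only key 1 is shared,
# so compute_dot_product({0: 1.0, 1: 2.0}, {1: 3.0, 2: 4.0}) returns 2.0 * 3.0 = 6.0.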


def norm(x):
    """Compute the Euclidean (L2) norm of the values of dict x.

    Args:
        x (dict): input dict

    Returns:
        float: norm value
    """
return math.sqrt(sum([v * v for v in x.values()]))
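
# Illustrative example (not part of the original module):
# norm({0: 3.0, 1: 4.0}) returns sqrt(3.0**2 + 4.0**2) = 5.0.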


class VectorSpaceModel:
    def __init__(self, path_to_data, path_to_stopwords=None):
        """Initialize the vector space model.

        Args:
            path_to_data (str): path to the data file
            path_to_stopwords (str, optional): path to the stopwords file. Defaults to None.
        """
        self.bag_of_words = []  # one bag of words (term id -> count) per document
        self.token2id = {}
        self.idfs = {}
        self.tfs = []
        self.docs = load_data(path_to_data, path_to_stopwords)  # load the data
        self.num_docs = len(self.docs)
        self.similarity = [[0.0 for i in range(self.num_docs)]
                           for j in range(self.num_docs)]  # initialize the similarity matrix
        self.initialize_bow()  # build the bag of words
        self.compute_tfs()  # compute term frequencies (tf)
        self.compute_idfs()  # compute inverse document frequencies (idf)
        self.compute_tfidfs()  # compute tf-idf weights

    def initialize_bow(self):
        """Generate the bag of words for the given data."""
        for doc in self.docs:
            counter = defaultdict(int)
            for word in doc:
                if word not in self.token2id:
                    self.token2id[word] = len(self.token2id)  # assign an integer id to each word
                counter[self.token2id[word]] += 1
            self.bag_of_words.append(counter)

    def compute_idfs(self):
        """Compute the idf of every term in the given data."""
dfs = {}
for bow in self.bag_of_words:
for termid in bow:
dfs[termid] = dfs.get(termid, 0) + 1
self.idfs = {
termid: df2idf(df, self.num_docs)
for termid, df in dfs.items()
}

    def compute_tfs(self):
        """Compute the tf of every term in every document."""
for bow in self.bag_of_words:
total_fs = sum(bow.values())
self.tfs.append(
{termid: fs / total_fs
for termid, fs in bow.items()})

    def compute_tfidfs(self):
        """Compute the tf-idf weight of every term in every document."""
self.tfidfs = [{
termid: tf * self.idfs.get(termid)
for termid, tf in adoc_tfs.items()
} for adoc_tfs in self.tfs]
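
    # The similarity methods below compute pairwise cosine similarity over the
    # sparse tf-idf vectors:
    #     sim(d1, d2) = compute_dot_product(v1, v2) / (norm(v1) * norm(v2))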

    def compute_similarity_naive(self):
        """Naive (single-process) implementation of the cosine similarity computation."""
        for docno_1, idfs_1 in enumerate(self.tfidfs):
            for docno_2, idfs_2 in enumerate(self.tfidfs):
                dot_product = compute_dot_product(idfs_1, idfs_2)  # numerator
                self.similarity[docno_1][docno_2] = dot_product / (
                    norm(idfs_1) * norm(idfs_2))

    def compute_similarity_subprocess(self, processes=1, i=0):
        """Worker function that computes one chunk of the cosine similarity matrix.

        Args:
            processes (int, optional): number of processes. Defaults to 1.
            i (int, optional): index of this process. Defaults to 0.

        Returns:
            list: the rows of the similarity matrix computed by this process.
        """
        similarity = []
        length = math.ceil(self.num_docs / processes)
        for idfs_1 in self.tfidfs[length * i:length * (i + 1)]:  # the slice of rows this process handles
            similarity.append([])
            for idfs_2 in self.tfidfs:
                dot_product = compute_dot_product(idfs_1, idfs_2)  # numerator
                similarity[-1].append(dot_product /
                                      (norm(idfs_1) * norm(idfs_2)))
return similarity
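
    # Parallelisation note (descriptive comment added for clarity): the rows of
    # the similarity matrix are split into contiguous chunks of
    # ceil(num_docs / processes) rows, one chunk per worker. apply_async pickles
    # the bound method, and with it the instance state (including the tf-idf
    # vectors), when dispatching each task to a worker process.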

    def compute_similarity_multiprocess(self, processes):
        """Multiprocess implementation of the cosine similarity computation.

        Args:
            processes (int): number of processes.
        """
        res = []  # async result handles, one per worker process
        with Pool(processes) as pool:
            for i in range(processes):
                res.append(
                    pool.apply_async(self.compute_similarity_subprocess,
                                     (processes, i)))
            pool.close()  # no more tasks will be submitted
            pool.join()  # wait for all workers to finish
        similarity = []
        for i in range(processes):
            similarity += res[i].get()  # merge the per-process results in order
        self.similarity = similarity


if __name__ == "__main__":
    vsm = VectorSpaceModel("data/199801_clear.txt", "data/stopwords.txt")
    t = time.time()
    vsm.compute_similarity_naive()  # single-process computation
    print("Single-process similarity computation took {}s.".format(time.time() - t))
    # save the single-process result and reset the similarity matrix
    res_multiprocess = [[
        vsm.similarity[j][i] for i in range(len(vsm.similarity[j]))
    ] for j in range(len(vsm.similarity))]
    vsm.similarity = [[0.0 for i in range(len(res_multiprocess[j]))]
                      for j in range(len(res_multiprocess))]
    t = time.time()
    vsm.compute_similarity_multiprocess(4)  # multi-process computation
    print("Multi-process similarity computation took {}s.".format(time.time() - t))
    print("Single-process and multi-process results match: {}".format(
        res_multiprocess == vsm.similarity))