ArticleSimilarityLSI.py
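# Finds the most similar article pairs in the 199801 corpus (presumably the
# segmented January 1998 People's Daily corpus, given the file name below):
# build a tf-idf term-document matrix, reduce it to 300 latent dimensions
# with a truncated SVD (latent semantic indexing), and rank article pairs by
# the dot products of their reconstructed column vectors.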
import time
import re
import math
import numpy
import scipy.sparse
import scipy.sparse.linalg

start_time = time.time()
file_object = open("199801_clear_utf8.txt", encoding="utf-8")
# sentinel entry so article_num_list[article_count] is valid on the first line
article_num_list = [""]
word_count = {}
# number of articles in which each word appears
word_article_count = {}
# temporary per-article word counts
article_word_count = {}
word_bag = {}
total_article_word_list = []
# article IDs look like 19980101-01-003
article_num_pattern = re.compile(r"\d{8}-\d{2}-\d{3}")
# match "word/POS" tokens whose word part does not end in punctuation
article_word_pattern = re.compile(r"\S*[^,。:;\"、()—《》]/\S")
article_count = 0
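# Each corpus line appears to start with an article ID such as
# 19980101-01-003, followed by "word/POS" tokens; lines that do not match
# the ID pattern are skipped.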
for line in file_object:
    article_num = article_num_pattern.match(line)
    if article_num:
        paragraph_word_list = article_word_pattern.findall(line)
        # drop the article ID token in the first column
        del paragraph_word_list[0]
        # a new article ID means the previous article is complete;
        # flush its word counts before starting the new one
        if article_num.group() != article_num_list[article_count]:
            article_num_list.append(article_num.group())
            article_count += 1
            # fold the previous article's words into word_article_count
            for w in article_word_count:
                if w in word_article_count:
                    word_article_count[w] += 1
                else:
                    word_article_count[w] = 1
            total_article_word_list.append(article_word_count.copy())
            # reset the temporary per-article counter
            article_word_count.clear()
        # count word occurrences globally and within the current article
        for word in paragraph_word_list:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
            if word in article_word_count:
                article_word_count[word] += 1
            else:
                article_word_count[word] = 1
# the final article never triggers a flush inside the loop, so flush it here
# (its ID was already appended inside the loop, so article_num_list and
# article_count need no further adjustment)
for w in article_word_count:
    if w in word_article_count:
        word_article_count[w] += 1
    else:
        word_article_count[w] = 1
total_article_word_list.append(article_word_count.copy())
article_word_count.clear()
# drop the sentinel entries created before the first real article was seen
del total_article_word_list[0]
del article_num_list[0]
# build the word bag, recording each word's dimension index x
# (words occurring fewer than two times in the corpus are dropped)
x = 0
for word in word_count:
    if word_count[word] >= 2:
        word_bag[word] = x
        x += 1
finish_time = time.time()
cost_time = (finish_time - start_time)
print("Read file, build word bag and article word list time cost: %ss" % cost_time)
start_time = time.time()
# compute the idf value of each word in the word bag;
# the +1.0 is the usual idf smoothing term
word_idf = {}
for word in word_bag:
    word_idf[word] = math.log(article_count / (word_article_count[word] + 1.0))
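# The resulting weight for word w in article d is standard tf-idf:
# (count(w, d) / len(d)) * log(N / (df(w) + 1)), with N the number of
# articles and df(w) the number of articles containing w.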
term_document_lil_matrix = scipy.sparse.lil_matrix((len(word_bag), article_count), dtype='float')
# for each article, compute tf-idf for every word present in the word bag
# and store it in the term-document matrix
for i, a_w_c in enumerate(total_article_word_list):
    # a_w_c is the word-count dict of a single article
    article_length = sum(a_w_c.values())
    for w in a_w_c:
        if w in word_bag:
            w_tfidf = (a_w_c[w] / article_length) * word_idf[w]
            term_document_lil_matrix[word_bag[w], i] = w_tfidf
finish_time = time.time()
cost_time = (finish_time - start_time)
print("Build tf-idf matrix time cost: %ss" % cost_time)
start_time = time.time()
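# scipy.sparse.linalg.svds returns the k largest singular triplets (singular
# values in ascending order); U is |word_bag| x 300, VT is 300 x article_count.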
U, sigma, VT = scipy.sparse.linalg.svds(term_document_lil_matrix.tocsr(), k=300)
finish_time = time.time()
cost_time = (finish_time - start_time)
print("Run svd time cost: %ss" % cost_time)
start_time = time.time()
# reconstruct the rank-300 approximation of the tf-idf matrix
# (numpy.diag/numpy.dot: the scipy.diag and scipy.dot aliases were removed
# from modern SciPy)
sigma_diag = numpy.diag(sigma)
new_matrix = numpy.dot(numpy.dot(U, sigma_diag), VT)
# res[i, j] is the (unnormalized) dot product between articles i and j
res = numpy.dot(new_matrix.T, new_matrix)
# zero the diagonal so an article is never reported as most similar to itself
for i in range(article_count):
    res[i, i] = 0
finish_time = time.time()
cost_time = (finish_time - start_time)
print("Compute result matrix time cost: %ss" % cost_time)
start_time = time.time()
# debug output: the word bag and the first article's word counts
print(word_bag)
print(total_article_word_list[0])
# report the ten most similar article pairs
for i in range(10):
    # res is symmetric; argmax + unravel_index locates the current top pair
    # (the original numpy.where(...)[0] returned only row indices)
    row, column = numpy.unravel_index(numpy.argmax(res), res.shape)
    print("Article num", row, column)
    print(article_num_list[row], total_article_word_list[row])
    print(article_num_list[column], total_article_word_list[column])
    # zero both symmetric entries before searching for the next pair
    res[row, column] = 0
    res[column, row] = 0
# numpy.savetxt("res4.txt", res)
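# Note: res holds raw dot products, so longer articles tend to score higher.
# A possible refinement (not in the original script) is to L2-normalize the
# columns of new_matrix first, making res a matrix of cosine similarities:
#   norms = numpy.linalg.norm(new_matrix, axis=0)
#   norms[norms == 0] = 1.0
#   res = numpy.dot((new_matrix / norms).T, new_matrix / norms)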