-
Notifications
You must be signed in to change notification settings - Fork 13
/
main.py
148 lines (115 loc) · 4.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# coding=utf-8
from __future__ import unicode_literals
import gensim
import jieba
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
def find_main_charecters(num=10):
    """Plot a horizontal bar chart of the most frequently mentioned names.

    Reads the text from ``hlm.txt`` and the candidate names from
    ``names.txt`` (one name per line), counts how often each name occurs
    in the text with ``str.count`` and shows the ``num`` most frequent
    names in a matplotlib window.

    Args:
        num: Maximum number of names to plot. When fewer names are
            available, only those are plotted.
    """
    # Both files hold Chinese text; pin the encoding instead of relying on
    # the platform's locale default.
    with open('hlm.txt', encoding='utf-8') as f:
        data = f.read()
    with open('names.txt', encoding='utf-8') as f:
        characters_names = [line.strip('\n') for line in f]

    # Skip blank names: ``data.count('')`` is ``len(data) + 1`` and would
    # always rank first.
    count = [[name, data.count(name)] for name in characters_names if name]
    count.sort(key=lambda x: x[1])

    # ``count[-0:]`` would be the whole list, so guard non-positive ``num``.
    top = count[-num:] if num > 0 else []
    numbers = [x[1] for x in top]
    names = [x[0] for x in top]
    # Use the real number of bars; ``range(num)`` mismatches the data when
    # fewer than ``num`` names are available.
    positions = range(len(top))

    _, ax = plt.subplots()
    ax.barh(positions, numbers, color='red', align='center')
    ax.set_title('红楼梦')
    ax.set_yticks(positions)
    ax.set_yticklabels(names)
    plt.show()
def find_all_names():
    """Return the names listed in ``names.txt``, one per line.

    The file is read as UTF-8. Trailing/leading newline characters are
    removed from each entry and blank lines are skipped, so the result
    never contains empty names.

    Returns:
        A list of name strings in file order.
    """
    # Pin the encoding: the locale default is not UTF-8 on every platform.
    with open('names.txt', encoding='utf-8') as f:
        stripped = (line.strip('\n') for line in f)
        return [name for name in stripped if name.strip()]
def add_to_dict(characters_names):
    """Register every given name with jieba via ``jieba.add_word``.

    Args:
        characters_names: Iterable of names to add, one ``add_word`` call
            per entry.
    """
    for person in characters_names:
        jieba.add_word(person)
def tranning():
    """Train a Word2Vec model on the text in ``hlm.txt``.

    Each non-blank line of the file is stripped, segmented with
    ``jieba.cut`` and used as one training sentence.

    Returns:
        The ``gensim.models.Word2Vec`` model built from those sentences
        (100 dimensions, window 5, min_count 5, 4 workers).
    """
    # Pin the encoding: the locale default is not UTF-8 on every platform.
    with open('hlm.txt', encoding='utf-8') as f:
        sentences = [list(jieba.cut(text))
                     for text in (line.strip() for line in f)
                     if text]

    # gensim 4.0 renamed the ``size`` keyword to ``vector_size``; pick the
    # name the installed version understands.
    major_version = int(gensim.__version__.split('.')[0])
    dim_kwarg = 'vector_size' if major_version >= 4 else 'size'
    return gensim.models.Word2Vec(sentences,
                                  window=5,
                                  min_count=5,
                                  workers=4,
                                  **{dim_kwarg: 100})
def find_relationship(tranning_model, a, b, c):
    """Solve the analogy "a is to b as c is to d", print it and return d.

    Args:
        tranning_model: A trained Word2Vec model or its keyed vectors;
            anything exposing ``most_similar`` either directly or through
            a ``wv`` attribute.
        a: First word of the known pair.
        b: Second word of the known pair.
        c: First word of the pair to complete.

    Returns:
        The top-ranked word ``d`` reported by ``most_similar``.
    """
    # Newer gensim only exposes ``most_similar`` on ``model.wv``; fall back
    # to the object itself so keyed vectors / older models keep working.
    vectors = getattr(tranning_model, 'wv', tranning_model)
    d, _ = vectors.most_similar(positive=[c, b], negative=[a])[0]
    print("给定“{}”与“{}”,“{}”和“{}”有类似的关系".format(a, b, c, d))
    return d
def kmeans(model, characters_names):
    """Collect the vectors of the names that the model has a vector for.

    Despite its name this function does no clustering; it only builds the
    name array and vector matrix returned to the caller.

    Args:
        model: A trained Word2Vec model or its keyed vectors; anything
            supporting ``name in obj`` and ``obj[name]`` either directly
            or through a ``wv`` attribute.
        characters_names: Iterable of candidate names.

    Returns:
        A ``(np_names, word_vectors)`` tuple. ``np_names`` is a NumPy array
        of the names found, in input order. ``word_vectors`` is a 2-D array
        with one row per found name, or ``None`` when no name was found.
    """
    # Newer gensim only supports lookups on ``model.wv``; fall back to the
    # object itself so keyed vectors / older models keep working.
    vectors = getattr(model, 'wv', model)
    all_names = [name for name in characters_names if name in vectors]
    np_names = np.array(all_names)
    if not all_names:
        return np_names, None
    # A single vstack keeps the result 2-D even for one name and avoids
    # re-copying the matrix once per name.
    word_vectors = np.vstack([vectors[name] for name in all_names])
    return np_names, word_vectors
def aggre3(np_names, word_vectors):
    """Split the names into three K-Means clusters and print each cluster.

    Args:
        np_names: NumPy array of names, indexable with a boolean mask.
        word_vectors: Matrix passed to ``KMeans.fit``; its rows are matched
            to ``np_names`` by position.
    """
    cluster_count = 3
    assignments = KMeans(cluster_count).fit(word_vectors).labels_
    for cluster_id in range(cluster_count):
        print("类别{}:".format(cluster_id + 1))
        members = np_names[assignments == cluster_id]
        for position, member in enumerate(members, start=1):
            print(member, end=", ")
            # Break the output line after every tenth name.
            if position % 10 == 0:
                print('')
        print('')
def aggre4(np_names, word_vectors):
    """Drop the largest of four K-Means clusters, then re-cluster and print.

    The vectors are first clustered into four groups. The most populated
    group is discarded, the remaining vectors are clustered into four
    groups again, and the names in each of those groups are printed.

    Args:
        np_names: NumPy array of names, indexable with a boolean mask.
        word_vectors: 2-D NumPy array whose rows are matched to
            ``np_names`` by position.
    """
    N = 4
    label = KMeans(N).fit(word_vectors).labels_
    # Most common label (smallest one on ties, like ``scipy.stats.mode``).
    # ``np.bincount`` always yields a scalar and does not depend on
    # ``scipy.stats`` having been imported somewhere else.
    largest = np.bincount(label).argmax()
    keep = label != largest
    remain_names = np_names[keep]
    remain_vectors = word_vectors[keep]
    remain_label = KMeans(N).fit(remain_vectors).labels_
    for cluster_id in range(N):
        print("类别{}:".format(cluster_id + 1))
        for idx, name in enumerate(remain_names[remain_label == cluster_id]):
            print(name, end=", ")
            # Break the output line after every tenth name.
            if idx % 10 == 9:
                print('')
        print('')
def hierarchy(np_names, word_vectors, font_path="/Library/Fonts/华文仿宋.ttf"):
    """Show a Ward-linkage dendrogram of the vectors, labelled with names.

    Args:
        np_names: NumPy array of names, matched to the rows of
            ``word_vectors`` by position.
        word_vectors: Observation matrix passed to
            ``scipy.cluster.hierarchy.linkage``.
        font_path: Path of the font file used for the tick labels. The
            default is the path the script originally hard-coded; pass
            another font file when that path does not exist.
    """
    label_font = FontProperties(fname=font_path)
    Y = sch.linkage(word_vectors, method="ward")
    _, ax = plt.subplots(figsize=(10, 40))
    # The dendrogram is drawn on the current axes, i.e. ``ax`` created above.
    Z = sch.dendrogram(Y, orientation='right')
    # ``leaves`` holds the observation indices in plotted order; use it to
    # reorder the names so each label lines up with its leaf.
    idx = Z['leaves']
    ax.set_xticks([])
    ax.set_yticklabels(np_names[idx], fontproperties=label_font,
                       fontsize=14)
    ax.set_frame_on(False)
    plt.show()
if __name__ == "__main__":
    # Script entry point: register the names with jieba, train the model,
    # then print the nearest neighbours of one name and one analogy.
    # The commented-out calls are alternative analyses to enable by hand.
    # find_main_charecters(10)
    add_to_dict(find_all_names())
    model = tranning()
    # Newer gensim only exposes similarity queries and lookups on
    # ``model.wv``; fall back to the model itself for older versions.
    vectors = getattr(model, 'wv', model)
    for k, s in vectors.most_similar(positive=["宝钗"]):
        print('{}:{}'.format(k, s))
    find_relationship(vectors, '王夫人', '凤姐', '薛姨妈')
    # np_names, word_vectors = kmeans(vectors, find_all_names())
    # aggre4(np_names, word_vectors)
    # hierarchy(np_names, word_vectors)