# eval_func.py
from gensim.models import Word2Vec
import numpy as np
import torch


def word_analogy_evaluation(vectors_file):
    # Everything in this function must stay differentiable from the input
    # vectors through to the returned losses, so the losses can be
    # backpropagated. In practice that means:
    #   - the vectors end up in torch tensors, not numpy arrays;
    #   - index assignments and slice writes such as
    #       W[vocab[word], :] = torch.from_numpy(v)
    #       pred_vec = W[ind2[subset], :] - W[ind1[subset], :] + W[ind3[subset], :]
    #       expected_matrix[ind4[subset[setid]]][setid] = 1
    #     run while the tensors still have requires_grad=False, because
    #     in-place writes on tensors that require grad break autograd;
    #   - leaf tensors are never modified after gradients are enabled; the
    #     overall pattern is
    #       tensor (requires_grad=False) -> f(tensor) -> requires_grad=True -> return,
    #     where f contains only tensor arithmetic and no value mutation.
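    # A minimal sketch of that pattern (illustration only, not used below):
    #   x = torch.zeros(3)        # leaf tensor, requires_grad=False
    #   x[0] = 1.0                # in-place write is still safe here
    #   y = (x * x).sum()         # pure tensor arithmetic
    #   y.requires_grad_(True)    # gradients switched on only at the end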
    vectors = vectors_file
    vocab_words = list(vectors.keys())
    vocab_size = len(vocab_words)
    # Build word -> index and index -> word lookups.
    vocab = {w: idx for idx, w in enumerate(vocab_words)}
    ivocab = {idx: w for idx, w in enumerate(vocab_words)}
    # Build the embedding matrix of shape (vocab_size, vector_dim).
    vector_dim = len(vectors[ivocab[0]])
    W = torch.zeros(vocab_size, vector_dim).float()
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab[word], :] = torch.from_numpy(v)
    # Normalize each word vector to unit length: vectors are usually
    # normalized before similarity calculations, which makes cosine
    # similarity and the dot product equivalent. The small epsilon guards
    # against all-zero rows such as the skipped '<unk>' entry.
    d = (torch.sum(W ** 2, 1) ** 0.5) + 1e-8
    W_norm = (W.T / d).T
    def evaluate_vectors(W, vocab, prefix='./eval/question-data/'):
        """Evaluate the trained word vectors on the Google analogy tasks."""
        filenames = [
            'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
            'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
            'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
            'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
            'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
        ]
        # Batch size for the distance computation; can be increased or
        # decreased depending on available memory and vocab size.
        split_size = 100
        correct_sem = 0  # correctly answered semantic questions
        correct_syn = 0  # correctly answered syntactic questions
        correct_tot = 0  # correctly answered questions overall
        count_sem = 0    # all semantic questions
        count_syn = 0    # all syntactic questions
        count_tot = 0    # all questions
        full_count = 0   # all questions, including those with unknown words
        # Accumulate the two losses across all question files.
        tot_loss1 = torch.tensor(0.0)
        tot_loss2 = torch.tensor(0.0)
        for i in range(len(filenames)):
            with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
                full_data = [line.rstrip().split(' ') for line in f]
            full_count += len(full_data)
            # Keep only questions whose four words are all in the vocab.
            data = [x for x in full_data if all(word in vocab for word in x)]
            if len(data) == 0:
                # No question in this file is fully covered by the vocab.
                continue
            indices = torch.LongTensor([[vocab[word] for word in row] for row in data])
            ind1, ind2, ind3, ind4 = indices.T
            predictions = torch.zeros(len(indices), dtype=torch.float64)
            num_iter = int(np.ceil(len(indices) / float(split_size)))
            for j in range(num_iter):
                # Process split_size consecutive questions as one batch;
                # subset maps batch positions back to this file's question ids.
                subset = torch.LongTensor(np.arange(j * split_size,
                                                    min((j + 1) * split_size, len(ind1))))
                # Analogy prediction: for "a is to b as c is to ?", the
                # predicted vector is b - a + c.
                pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
                            + W[ind3[subset], :])
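                # For example, on the question line "athens greece baghdad iraq"
                # the prediction is vec(greece) - vec(athens) + vec(baghdad),
                # which should land closest to vec(iraq).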
                # Normalize the predicted vectors to unit length.
                d_pred = (torch.sum(pred_vec ** 2, 1) ** 0.5)
                W_norm_pred = (pred_vec.T / d_pred).T
                # Cosine similarity of every vocab word to each predicted
                # vector (W is passed in already row-normalized).
                dist = torch.mm(W, W_norm_pred.T)
                # Exclude the three question words from the candidates.
                for k in range(len(subset)):
                    dist[ind1[subset[k]], k] = 0
                    dist[ind2[subset[k]], k] = 0
                    dist[ind3[subset[k]], k] = 0
                # Predicted word index: the most similar remaining word.
                predictions[subset] = torch.argmax(dist, 0).flatten().double()
                # Enable gradients only after all in-place writes are done.
                dist.requires_grad_(True)
                # Expected matrix: column k is 0.01 everywhere except a 1 in
                # the row of the true answer ind4[subset[k]].
                expected_matrix = 0.01 * torch.ones(W.shape[0], len(subset))
                for setid in range(len(subset)):
                    expected_matrix[ind4[subset[setid]]][setid] = 1
                expected_matrix.requires_grad_(True)
                # Loss 2: alignment between the similarity matrix and the
                # expected matrix.
                tot_loss2 += torch.sum(dist * expected_matrix)
            val = (ind4 == predictions)  # correct predictions
            # Loss 1: for every wrong prediction, add the similarity between
            # the true word and the predicted word.
            for sid in range(len(ind4)):
                if ind4[sid] != predictions[sid]:
                    tot_loss1 += torch.sum(W[ind4[sid]] * W[int(predictions[sid])])
            count_tot = count_tot + len(ind1)
            correct_tot = correct_tot + int(val.sum())
            # The first five files are semantic questions, the rest syntactic.
            if i < 5:
                count_sem = count_sem + len(ind1)
                correct_sem = correct_sem + int(val.sum())
            else:
                count_syn = count_syn + len(ind1)
                correct_syn = correct_syn + int(val.sum())
        # tot_loss1 is built from the no-grad W, so gradients are switched on
        # here; tot_loss2 already requires grad through dist/expected_matrix
        # (enabling it again on a non-leaf tensor would raise an error).
        tot_loss1.requires_grad_(True)
        return correct_sem, correct_syn, correct_tot, count_sem, count_syn, \
            count_tot, full_count, tot_loss1, tot_loss2
    # Pass the row-normalized embeddings so the dot products inside
    # evaluate_vectors are cosine similarities.
    correct_sem, correct_syn, correct_tot, count_sem, count_syn, count_tot, \
        full_count, tot_loss1, tot_loss2 \
        = evaluate_vectors(W_norm, vocab, prefix='GloVe/eval/question-data')
    semantic_acc = 100 * correct_sem / float(count_sem)
    syntactic_acc = 100 * correct_syn / float(count_syn)
    total_acc = 100 * correct_tot / float(count_tot)
    tot_loss1 = 100 * tot_loss1 / float(vocab_size)  # 100: embedding vector length
    tot_loss2 = tot_loss2 / float(vocab_size)        # average similarity
    return semantic_acc, syntactic_acc, total_acc, tot_loss1, tot_loss2
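# A minimal usage sketch, assuming `vectors` is a dict mapping words to 1-D
# numpy float arrays of equal length (e.g. loaded from a GloVe text file) and
# the analogy question files exist under GloVe/eval/question-data:
#
#   sem_acc, syn_acc, tot_acc, loss1, loss2 = word_analogy_evaluation(vectors)
#   print("total analogy accuracy: %.2f%%" % tot_acc)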
import math
import numpy
from operator import itemgetter
from numpy.linalg import norm
import sys
import os
import gzip
EPSILON = 1e-6
def euclidean(vec1, vec2):
    diff = vec1 - vec2
    return math.sqrt(diff.dot(diff))
def cosine_sim(vec1, vec2):
    # Add EPSILON out of place so the caller's vectors are not mutated;
    # the +1 in the denominator guards against division by zero at the
    # cost of shrinking the similarity values.
    vec1 = vec1 + EPSILON * numpy.ones(len(vec1))
    vec2 = vec2 + EPSILON * numpy.ones(len(vec1))
    return vec1.dot(vec2) / (norm(vec1) * norm(vec2) + 1)
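# Because of the +1 guard, even identical unit vectors do not reach a
# similarity of 1: for unit-length v, cosine_sim(v, v) is roughly
# (v . v) / (1 * 1 + 1) = 0.5 rather than 1.0.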
def assign_ranks(item_dict):
    # Rank items by value (descending); tied items share the average of
    # the rank positions they occupy.
    ranked_dict = {}
    sorted_list = [(key, val) for (key, val) in sorted(item_dict.items(),
                                                       key=itemgetter(1),
                                                       reverse=True)]
    for i, (key, val) in enumerate(sorted_list):
        same_val_indices = []
        for j, (key2, val2) in enumerate(sorted_list):
            if val2 == val:
                same_val_indices.append(j + 1)
        if len(same_val_indices) == 1:
            ranked_dict[key] = i + 1
        else:
            ranked_dict[key] = 1. * sum(same_val_indices) / len(same_val_indices)
    return ranked_dict
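# For example, assign_ranks({'a': 0.9, 'b': 0.5, 'c': 0.5}) gives
# {'a': 1, 'b': 2.5, 'c': 2.5}: 'b' and 'c' tie over positions 2 and 3,
# so each receives (2 + 3) / 2 = 2.5.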
def correlation(dict1, dict2):
    # Pearson correlation over the two dicts' values; assumes both dicts
    # iterate in the same key order.
    avg1 = 1. * sum(dict1.values()) / len(dict1)
    avg2 = 1. * sum(dict2.values()) / len(dict2)
    numr, den1, den2 = (0., 0., 0.)
    for val1, val2 in zip(dict1.values(), dict2.values()):
        numr += (val1 - avg1) * (val2 - avg2)
        den1 += (val1 - avg1) ** 2
        den2 += (val2 - avg2) ** 2
    return numr / math.sqrt(den1 * den2)
def spearmans_rho(ranked_dict1, ranked_dict2):
    assert len(ranked_dict1) == len(ranked_dict2)
    if len(ranked_dict1) == 0 or len(ranked_dict2) == 0:
        return 0.
    x_avg = 1. * sum(ranked_dict1.values()) / len(ranked_dict1)
    y_avg = 1. * sum(ranked_dict2.values()) / len(ranked_dict2)
    num, d_x, d_y = (0., 0., 0.)
    for key in ranked_dict1.keys():
        xi = ranked_dict1[key]
        yi = ranked_dict2[key]
        num += (xi - x_avg) * (yi - y_avg)
        d_x += (xi - x_avg) ** 2
        d_y += (yi - y_avg) ** 2
    # The +1 guards against a zero denominator when all ranks are equal,
    # at the cost of biasing rho towards zero.
    return num / (math.sqrt(d_x * d_y) + 1)
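# With the +1 guard, even perfectly agreeing rankings fall short of 1:
# for ranks {1, 2, 3} against themselves, num = d_x = d_y = 2, so
# rho = 2 / (sqrt(2 * 2) + 1) = 2 / 3, roughly 0.667 instead of 1.0.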
def read_word_vectors(filename):
    """Read all the word vectors from a text file and normalize them."""
    word_vecs = {}
    # Gzipped input could be handled by switching to gzip.open for
    # filenames ending in '.gz'.
    print(filename)
    with open(str(filename), 'r') as file_object:
        for line_num, line in enumerate(file_object):
            line = line.strip().lower()
            word = line.split()[0]
            word_vecs[word] = numpy.zeros(len(line.split()) - 1, dtype=float)
            for index, vec_val in enumerate(line.split()[1:]):
                word_vecs[word][index] = float(vec_val)
            # Normalize each vector to unit length (the 1e-6 guards
            # against zero vectors).
            word_vecs[word] /= math.sqrt((word_vecs[word] ** 2).sum() + 1e-6)
    sys.stderr.write("Vectors read from: " + filename + " \n")
    return word_vecs
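# Expected input format (one word per line, whitespace-separated values),
# e.g. the GloVe/word2vec text format:
#   the 0.418 0.24968 -0.41242 ...
#   of 0.70853 0.57088 -0.4716 ...
# Usage: word_vecs = read_word_vectors("./embed/filtered.txt")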
word_sim_dir = "./en"
word_vec_file = "./embed/filtered.txt"
def sim_loss_cal(word_sim_dir, word_vec_file):
    """Report Spearman's rho on every word-similarity dataset in word_sim_dir."""
    word_vecs = read_word_vectors(word_vec_file)
    print('=================================================================================')
    print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho")
    print('=================================================================================')
    avg_spearmans_rho = 0
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        with open(os.path.join(word_sim_dir, filename), 'r') as fh:
            for line in fh:
                line = line.strip().lower()
                word1, word2, val = line.split()
                if word1 in word_vecs and word2 in word_vecs:
                    manual_dict[(word1, word2)] = float(val)
                    auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
                else:
                    not_found += 1
                total_size += 1
        # Spearman's rho between human judgements and embedding similarities.
        rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
        print("%6s" % str(i + 1), "%20s" % filename, "%15s" % str(total_size), end="")
        print("%15s" % str(not_found), end="")
        print("%15.4f" % rho)
        avg_spearmans_rho += rho
    avg_spearmans_rho /= len(os.listdir(word_sim_dir))
    return avg_spearmans_rho
def sim_loss_cal2(word_sim_dir, word_vec_file):
    # Same as sim_loss_cal but without the per-dataset report; here
    # word_vec_file is an already-loaded {word: vector} dict rather than
    # a file path.
    word_vecs = word_vec_file
    avg_spearmans_rho = 0
    for i, filename in enumerate(os.listdir(word_sim_dir)):
        manual_dict, auto_dict = ({}, {})
        not_found, total_size = (0, 0)
        with open(os.path.join(word_sim_dir, filename), 'r') as fh:
            for line in fh:
                line = line.strip().lower()
                word1, word2, val = line.split()
                if word1 in word_vecs and word2 in word_vecs:
                    manual_dict[(word1, word2)] = float(val)
                    auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
                else:
                    not_found += 1
                total_size += 1
        avg_spearmans_rho += spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
    avg_spearmans_rho /= len(os.listdir(word_sim_dir))
    return avg_spearmans_rho
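# A minimal sketch of how these entry points might be driven, assuming the
# word-similarity datasets live under ./en (three whitespace-separated
# columns: word1 word2 score) and the embeddings under ./embed/filtered.txt,
# the module-level paths defined above:
if __name__ == "__main__":
    avg_rho = sim_loss_cal(word_sim_dir, word_vec_file)
    print("Average Spearman's rho: %.4f" % avg_rho)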