# infobox2vec_size.py
# Forked from lvyilin/BaikeNRE.
# Also records the actual size of each infobox, for use with variable-size convolution kernels.
import json
import os
import pickle
import numpy as np
from gensim.models import KeyedVectors
# --- Layout of the generated matrices ---------------------------------------
DIMENSION = 100            # width of one embedding vector
INFOBOX_VALUE_LENGTH = 10  # max rows per attribute (name row + value rows)
INFOBOX_LENGTH = 20        # max attributes kept per infobox

# The embedding file is resolved against the current working directory.
CWD = os.getcwd()
WORDVEC = os.path.join(CWD, "wordvectors.kv")
wordvec = KeyedVectors.load(WORDVEC, mmap="r")

# Zero vector used as a stand-in for tokens that have no embedding.
PLACEHOLDER = np.zeros((DIMENSION,))
# One attribute's worth of placeholder rows; every entry is the same array object.
PLACEHOLDER_INFOBOX = [PLACEHOLDER for _ in range(INFOBOX_VALUE_LENGTH)]

# Outputs of the script: entity -> padded matrix, entity -> per-attribute row counts.
infoboxvec_dict = dict()
infobox_size_dict = dict()

# Raw infobox corpus; iterated as {entity: {attribute: values}} by the code below.
with open("corpus_infobox.json", mode="r", encoding="utf8") as corpus_file:
    infobox_dict = json.loads(corpus_file.read())
def _embed(token):
    """Return the embedding of ``token``, or ``PLACEHOLDER`` if it is unknown.

    Indexing the embeddings with an out-of-vocabulary token raises
    ``KeyError``, so membership is tested first and unknown tokens fall back
    to the zero placeholder (the same treatment attribute names get).
    """
    return wordvec[token] if token in wordvec else PLACEHOLDER


def _encode_infobox(attributes):
    """Turn one entity's infobox into embedding rows plus per-attribute sizes.

    Args:
        attributes: Mapping of attribute name to an iterable whose items are
            looked up in the embeddings.

    Returns:
        A ``(rows, sizes)`` tuple. ``rows`` is the flat list of vectors: for
        each attribute, the attribute-name vector followed by its value
        vectors. ``sizes[i]`` is the number of rows the i-th attribute
        contributed (at most ``INFOBOX_VALUE_LENGTH``).
    """
    rows = []
    sizes = []
    # zip with a range caps the number of attributes at INFOBOX_LENGTH.
    for _, (attr_name, attr_values) in zip(range(INFOBOX_LENGTH), attributes.items()):
        attr_rows = [_embed(attr_name)]
        # The name already occupies one row, so keep at most
        # INFOBOX_VALUE_LENGTH - 1 value tokens.
        attr_rows.extend(
            _embed(token)
            for _, token in zip(range(INFOBOX_VALUE_LENGTH - 1), attr_values)
        )
        sizes.append(len(attr_rows))
        rows.extend(attr_rows)
    return rows, sizes


for key, val in infobox_dict.items():
    # Entities without an embedding of their own are left out entirely.
    if key not in wordvec:
        continue

    infobox, val_size = _encode_infobox(val)
    infobox_size_dict[key] = val_size

    # Fixed-size, zero-padded matrix; attribute rows are packed contiguously
    # from the top and val_size records where each attribute ends.
    result = np.zeros((INFOBOX_VALUE_LENGTH * INFOBOX_LENGTH, DIMENSION))
    if infobox:  # an empty infobox keeps the all-zero matrix
        infobox_vec = np.array(infobox)
        if infobox_vec.ndim != 2 or infobox_vec.shape[1] != DIMENSION:
            raise ValueError(
                f"infobox vectors for {key!r} have shape {infobox_vec.shape}, "
                f"expected (n, {DIMENSION})"
            )
        result[:infobox_vec.shape[0], :] = infobox_vec
    infoboxvec_dict[key] = result
# Entity names, one per line, in the same order as the rows of the saved array.
with open("infobox2vec_size_key.txt", "w", encoding="utf8") as key_file:
    key_file.writelines(name + "\n" for name in infoboxvec_dict)

# Per-entity list of attribute row counts, stored as a pickle despite the .txt suffix.
with open("infobox2vec_size.txt", "wb") as size_file:
    pickle.dump(infobox_size_dict, size_file)

# Stack all padded matrices into a single float array and save it.
stacked_matrices = list(infoboxvec_dict.values())
values = np.array(stacked_matrices, dtype=float)
np.save("infobox2vec_size_value.npy", values)