prepare_vocab.py
"""
Prepare vocabulary and initial word vectors.
"""
import torch
import numpy as np
import json
import pickle
from collections import Counter
from pytorch_pretrained_bert import BertTokenizer, BertModel
def main():
    print("# Load pre-trained model tokenizer (vocabulary)")
    tokenizer = BertTokenizer.from_pretrained('./dataset/bert/')

    print("# Construct vocab")
    vocabulary = [token for token in tokenizer.vocab]
    vocab = set(vocabulary)
    print("Vocabulary Size: {}".format(len(vocabulary)))

    print("# Load pre-trained model")
    model = BertModel.from_pretrained('./dataset/bert/')

    print("# Load word embeddings")
    emb = model.embeddings.word_embeddings.weight.data
    emb = emb.numpy()
    print("# Embedding size: {} x {}".format(*emb.shape))

    # The same BERT vocabulary and embedding matrix are dumped for every dataset;
    # per-dataset vocabs are built only to report OOV statistics.
    for name, data_dir in [('TrP', 'trp'), ('TeP', 'tep'), ('PiP', 'pp')]:
        train_file = "./dataset/{}/data/train.json".format(data_dir)
        dev_file = "./dataset/{}/data/dev.json".format(data_dir)
        test_file = "./dataset/{}/data/test.json".format(data_dir)
        vocab_file = "./dataset/{}/vocab/vocab.pkl".format(data_dir)
        emb_file = "./dataset/{}/vocab/embedding.npy".format(data_dir)

        print("# Loading {} files...".format(name))
        train_tokens = load_tokens(train_file)
        dev_tokens = load_tokens(dev_file)
        test_tokens = load_tokens(test_file)
        v = build_vocab(train_tokens, vocab, 0)

        print("# Calculating {} oov...".format(name))
        datasets = {'train': train_tokens, 'dev': dev_tokens, 'test': test_tokens}
        for dname, d in datasets.items():
            total, oov = count_oov(d, v)
            print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov * 100.0 / total))

        print("# Dumping to files...")
        with open(vocab_file, 'wb') as outfile:
            pickle.dump(vocabulary, outfile)
        np.save(emb_file, emb)
        print("# {} all done.".format(name))

def load_tokens(filename):
    """ Load tokens from a JSON data file, masking out subject/object entity spans. """
    with open(filename) as infile:
        data = json.load(infile)
    tokens = []
    for d in data:
        ts = d['token']
        ss, se, os, oe = d['subj_start'], d['subj_end'], d['obj_start'], d['obj_end']
        # do not create vocab entries for entity words
        ts[ss:se+1] = ['<PAD>'] * (se-ss+1)
        ts[os:oe+1] = ['<PAD>'] * (oe-os+1)
        tokens += list(filter(lambda t: t != '<PAD>', ts))
    print("{} tokens from {} examples loaded from {}.".format(len(tokens), len(data), filename))
    return tokens

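# Each data file is expected to be a JSON list of examples; a minimal sketch of
# one record (the field names are the ones read above, the values are purely
# illustrative):
#
#   {
#       "token": ["He", "was", "given", "aspirin", "for", "a", "headache", "."],
#       "subj_start": 3, "subj_end": 3,
#       "obj_start": 6, "obj_end": 6
#   }
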
def build_vocab(tokens, vocab, min_freq):
    """ Build vocab from dataset tokens, restricted to the pre-trained BERT vocab. """
    counter = Counter(t for t in tokens)
    # if min_freq > 0, filter by frequency; otherwise keep all tokens covered by the BERT vocab
    if min_freq > 0:
        v = sorted([t for t in counter if counter.get(t) >= min_freq], key=counter.get, reverse=True)
    else:
        v = sorted([t for t in counter if t in vocab], key=counter.get, reverse=True)
    # add special tokens and entity mask tokens
    PAD_TOKEN = '<PAD>'
    PAD_ID = 0
    UNK_TOKEN = '<UNK>'
    UNK_ID = 1
    VOCAB_PREFIX = [PAD_TOKEN, UNK_TOKEN]
    v = VOCAB_PREFIX + entity_masks() + v
    print("vocab built with {}/{} words.".format(len(v), len(counter)))
    return v

def count_oov(tokens, vocab):
    """ Count total tokens and how many fall outside the given vocab. """
    c = Counter(t for t in tokens)
    total = sum(c.values())
    matched = sum(c[t] for t in vocab)
    return total, total - matched

def entity_masks():
    """ Get all entity mask tokens as a list. """
    PAD_TOKEN = '<PAD>'
    PAD_ID = 0
    UNK_TOKEN = '<UNK>'
    UNK_ID = 1
    masks = []
    SUBJ_NER_TO_ID = {
        PAD_TOKEN: 0,
        UNK_TOKEN: 1,
        'treatment': 2,
        'problem': 3,
        'test': 4
    }
    OBJ_NER_TO_ID = {
        PAD_TOKEN: 0,
        UNK_TOKEN: 1,
        'treatment': 2,
        'problem': 3,
        'test': 4
    }
    # skip the PAD/UNK entries and keep only the NER types
    subj_entities = list(SUBJ_NER_TO_ID.keys())[2:]
    obj_entities = list(OBJ_NER_TO_ID.keys())[2:]
    masks += ["SUBJ-" + e for e in subj_entities]
    masks += ["OBJ-" + e for e in obj_entities]
    return masks

if __name__ == '__main__':
    main()
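
# A minimal sketch (not part of the original pipeline) of how the dumped files
# could be consumed downstream; the TrP paths are used as an example:
#
#   with open('./dataset/trp/vocab/vocab.pkl', 'rb') as f:
#       vocab = pickle.load(f)
#   emb = np.load('./dataset/trp/vocab/embedding.npy')
#   assert len(vocab) == emb.shape[0]  # one embedding row per vocab entry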