-
Notifications
You must be signed in to change notification settings - Fork 0
/
haem.py
226 lines (190 loc) · 10.3 KB
/
haem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from __future__ import division
import dynet as dy
import numpy as np
import datasets
from defaults import (COPY, DELETE, BEGIN_WORD, END_WORD, UNK, MAX_ACTION_SEQ_LEN)
from stack_lstms import Encoder, StackRNN, StackBiRNN
from transducer import Transducer
class EditTransducer(Transducer):
def __init__(self, model, vocab, **kwargs):
#self.INSERT = COPY + 1
# Actually, action types are: COPY, DELETE, INSERT
#self.NUM_ACT_TYPES = self.INSERT + 1
#kwargs['action_dim'] = 10
super(EditTransducer, self).__init__(model, vocab, **kwargs)
#def _classifier(self, model):
# Here, we add a copy logistic classifier.
# input + decoder output for good measure
#op_dim = self.CHAR_DIM + self.ACTION_DIM + self.FEAT_INPUT_DIM
#self.pW_op = model.add_parameters((1, gen_dim))
#self.pb_op = model.add_parameters(1)
#print ' * COPY LOGISTIC: IN-DIM: {}, OUT-DIM: {}'.format(gen_dim, 1)
#super(MinimalTransducer, self)._classifier(model)
def _features(self, model):
# trainable embeddings for characters and actions
self.CHAR_LOOKUP = model.add_lookup_parameters((self.NUM_CHARS, self.CHAR_DIM))
self.ACT_LOOKUP = model.add_lookup_parameters((self.NUM_ACTS, self.ACTION_DIM))
self.p_empty_string = model.add_parameters(self.CHAR_DIM)
# embed features or bag-of-word them?
if not self.FEAT_DIM:
print 'Using an n-hot representation for features.'
# n-hot POS features are simply concatenated to feature vector
self.FEAT_INPUT_DIM = self.NUM_FEATS + self.NUM_POS
else:
self.FEAT_LOOKUP = model.add_lookup_parameters((self.NUM_FEATS, self.FEAT_DIM))
self.POS_LOOKUP = model.add_lookup_parameters((self.NUM_POS, self.FEAT_DIM))
# POS feature is the only feature with many values (=`self.NUM_POS`), hence + 1.
# All other features are binary (e.g. SG and PL are disjoint binary features).
# This is the implementation of Aharoni & Goldberg
self.FEAT_INPUT_DIM = (self.NUM_FEATS + 1)*self.FEAT_DIM
# BiLSTM encoding lemma
self.fbuffRNN = self.LSTM(self.ENC_LAYERS, self.CHAR_DIM, self.ENC_HIDDEN_DIM, model)
self.bbuffRNN = self.LSTM(self.ENC_LAYERS, self.CHAR_DIM, self.ENC_HIDDEN_DIM, model)
# LSTM representing generated word
self.WORD_REPR_DIM = self.CHAR_DIM + self.ACTION_DIM + self.FEAT_INPUT_DIM
self.wordRNN = self.LSTM(self.DEC_LAYERS, self.WORD_REPR_DIM, self.DEC_HIDDEN_DIM, model)
self.CLASSIFIER_IMPUT_DIM = self.DEC_HIDDEN_DIM + self.ENC_HIDDEN_DIM*2 + self.FEAT_INPUT_DIM
print ' * LEMMA biLSTM: IN-DIM: {}, OUT-DIM: {}'.format(2*self.CHAR_DIM, 2*self.ENC_HIDDEN_DIM)
print ' * WORD LSTM: IN-DIM: {}, OUT-DIM: {}'.format(self.WORD_REPR_DIM, self.DEC_HIDDEN_DIM)
print ' LEMMA LSTMs have {} layer(s)'.format(self.ENC_LAYERS)
print ' WORD LSTM has {} layer(s)'.format(self.DEC_LAYERS)
print
print ' * CHAR EMBEDDINGS: IN-DIM: {}, OUT-DIM: {}'.format(self.NUM_CHARS, self.CHAR_DIM)
print ' * ACTION EMBEDDINGS: IN-DIM: {}, OUT-DIM: {}'.format(self.NUM_ACTS, self.ACTION_DIM)
if self.FEAT_DIM:
print ' * FEAT. EMBEDDINGS: IN-DIM: {}, OUT-DIM: {}'.format(self.NUM_FEATS, self.FEAT_DIM)
def transduce(self, lemma, feats, oracle_actions=None, external_cg=True, sampling=False, unk_avg=True):
# Returns an expression of the loss for the sequence of actions.
# (that is, the oracle_actions if present or the predicted sequence otherwise)
def _valid_actions(encoder):
valid_actions = []
if len(encoder) > 1:
valid_actions += [COPY, DELETE]
else:
valid_actions += [END_WORD]
valid_actions += self.INSERTS
return valid_actions
show_oracle_actions = True
if not external_cg:
dy.renew_cg()
if unk_avg:
# Dealing with unseen data, what UNK embedding for characters
# and actions do we use? => Average trained embeddings.
UNK_CHAR_EMB = dy.average([self.CHAR_LOOKUP[i] for i in xrange(1, self.CHAR_TRAIN)])
#UNK_ACT_EMB = dy.average([self.ACT_LOOKUP[i] for i in xrange(1, self.ACT_TRAIN)])
else:
# @TODO Pretrain it with "word dropout", otherwise
# these are randomly initialized embeddings.
UNK_CHAR_EMB = self.CHAR_LOOKUP[UNK]
#UNK_ACT_EMB = self.ACT_LOOKUP[UNK]
if oracle_actions:
# reverse to enable simple popping
oracle_actions = oracle_actions[::-1]
oracle_actions.pop() # COPY of BEGIN_WORD_CHAR
# vectorize lemma without UNK
lemma_enc = [self.CHAR_LOOKUP[c] for c in lemma]
else:
# vectorize lemma with UNK
lemma_enc = [self.CHAR_LOOKUP[c] if c < self.CHAR_TRAIN else UNK_CHAR_EMB for c in lemma]
# vectorize features
features = self._build_features(*feats)
# add encoder and decoder to computation graph
encoder = Encoder(self.fbuffRNN, self.bbuffRNN)
decoder = StackRNN(self.wordRNN)
# encoder is a stack which pops lemma characters and their
# representations from the top. Thus, to get lemma characters
# in the right order, we reverse the lemma.
encoder.transduce(lemma_enc, zip(lemma, lemma_enc))
# add classifier to computation graph
if self.MLP_DIM:
# decoder output to hidden
W_s2h = dy.parameter(self.pW_s2h)
b_s2h = dy.parameter(self.pb_s2h)
# hidden to action
W_act = dy.parameter(self.pW_act)
b_act = dy.parameter(self.pb_act)
# ACTION TYPE EMBEDDING
#COPY_EMB = self.ACT_LOOKUP[COPY]
#DELETE_EMB = self.ACT_LOOKUP[DELETE]
#INSERT_EMB = self.ACT_LOOKUP[self.INSERT]
#print 'Action embs: ', COPY_EMB.npvalue()
# DETERMINISTICALLY COPY BEGIN_WORD_CHAR
#print 'sizes: ', len(encoder.s[-1]), len(encoder.s[-1][-1])
empty_string = dy.parameter(self.p_empty_string)
_, char_emb = encoder.pop()
action_history = [COPY]
decoder.push(dy.concatenate([char_emb, self.ACT_LOOKUP[COPY], features]))
word = []
losses = []
count = 0
if show_oracle_actions and oracle_actions:
print
print u''.join([self.vocab.act.i2w[a] for a in oracle_actions])
print u''.join([self.vocab.char.i2w[a] for a in lemma])
while len(action_history) <= MAX_ACTION_SEQ_LEN:
if show_oracle_actions:
print 'Action: ', count, self.vocab.act.i2w[action_history[-1]]
print 'Encoder length, char: ', lemma, len(encoder), self.vocab.char.i2w[encoder.s[-1][-1]]
print 'word: ', u''.join(word)
#print 'Remaining actions: ', oracle_actions, u''.join([self.vocab.act.i2w[a] for a in oracle_actions])
count += 1
#elif action_history[-1] >= self.ACT_TRAIN:
# print 'Will be adding unseen act embedding: ', self.vocab.act.i2w[action_history[-1]]
# compute probability of each of the actions and choose an action
# either from the oracle or if there is no oracle, based on the model
valid_actions = _valid_actions(encoder)
# classifier
classifier_input = dy.concatenate([encoder.embedding(), decoder.embedding(), features])
if self.MLP_DIM:
h = self.NONLIN(W_s2h * classifier_input + b_s2h)
else:
h = classifier_input
logits = W_act * h + b_act
log_probs = dy.log_softmax(logits, valid_actions)
# get action (argmax, sampling, or use oracle actions)
if oracle_actions is None:
if sampling:
dist = np.exp(log_probs.npvalue()) #**0.9
dist = dist / np.sum(dist)
# sample according to softmax
rand = np.random.rand()
for action, p in enumerate(dist):
rand -= p
if rand <= 0: break
else:
try:
action = np.argmax(log_probs.npvalue())
except Exception, e:
print 'Encoder length, char: ', ''.join([self.vocab.char.i2w[c] for c in lemma]), len(encoder), self.vocab.char.i2w[encoder.s[-1][-1]]
print 'word: ', u''.join(word)
print 'valid action: ', len(valid_actions), valid_actions
print 'Actions: ', ''.join([self.vocab.act.i2w[a] for a in action_history])
raise e
else:
action = oracle_actions.pop()
losses.append(dy.pick(log_probs, action))
action_history.append(action)
# execute the action to update the transducer state
if action == COPY:
# 1. Increment attention index
char_, char_emb = encoder.pop()
# 3. Append copied character to the output word
word.append(self.vocab.char.i2w[char_])
elif action == DELETE:
# 1. Increment attention index
char_, char_emb = encoder.pop()
# 2. Add popped character and action to decoder
elif action == END_WORD:
# 1. Finish transduction
break
else:
# one of the INSERT actions
assert action in self.INSERTS
# 1. Append inserted character to the output word
char_str = self.vocab.act.i2w[action]
word.append(char_str)
char_emb = empty_string # empty string embedding
# Add character popped from encoder and action to decoder
decoder.push(dy.concatenate([char_emb, self.ACT_LOOKUP[action], features]))
word = u''.join(word)
return losses, word, action_history