import numpy as np
import utils
from RNTNModel import RNTNModel


class ComputeCostAndGradMiniBatch:
    """Cost and gradient of an RNTN over a minibatch of labeled trees.

    compute(theta, dictionary, trees_train) forward-propagates each tree to
    accumulate the cross-entropy loss over all nodes, then backpropagates
    through the tree structure to accumulate the parameter gradients, and
    returns (cost, grad) with grad flattened into a single vector.
    """
    def __init__(self):
        self.dictionary = None
        self.trees_train = None
        self.trees_dev = None
        # Accumulators, reset at the start of every compute() call
        self.loss = 0.0
        self.dJ_dWs = None  # gradient of the classification (softmax) matrix
        self.dJ_dL = None   # gradient of the word embedding matrix
        self.dJ_dW = None   # gradient of the composition matrix
        self.dJ_dV = None   # gradient of the composition tensor
    def compute(self, theta, dictionary, trees_train, trees_dev=None):
        # Set variables
        self.dictionary = dictionary
        self.trees_train = trees_train
        self.trees_dev = trees_dev

        # Create a fresh model; if a parameter vector theta is given,
        # overwrite the model's parameters with it
        model = RNTNModel(self.dictionary)
        if theta is not None:
            model.updateParamsGivenTheta(theta)

        # Reset the loss and gradient accumulators
        self.loss = 0.0
        self.dJ_dWs = np.zeros(model.Ws.shape)
        self.dJ_dL = np.zeros(model.L.shape)
        self.dJ_dW = np.zeros(model.W.shape)
        self.dJ_dV = np.zeros(model.V.shape)

        # Clone each tree and forward-propagate to populate node vectors and
        # predictions; forwardPass also accumulates self.loss
        tree_train_clone = []
        for tree in self.trees_train:
            cloned_tree = tree.clone()
            self.forwardPass(model, cloned_tree)
            tree_train_clone.append(cloned_tree)

        # Scale factor to average over minibatch elements
        scaler = 1.0 / len(self.trees_train)

        # Cost: average prediction loss plus the regularization terms
        cost = self.loss*scaler + self.calculateRegularizationCost(model)

        # Backprop on the cloned trees to accumulate parameter gradients
        for tree in tree_train_clone:
            dJ_dz_prop = np.zeros(model.dim)
            self.backwardPass(model, tree, dJ_dz_prop)

        # Full gradient: averaged gradient matrices plus regularizer gradients
        grad = self.calculateTotalGradient(model, scaler)
        return cost, grad
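
    # The quantity compute() minimizes is the node-level cross-entropy
    # averaged over the minibatch plus L2 penalties (as read off the code):
    #
    #   J(theta) = -(1/N) * sum_{trees} sum_{nodes} t . log(y)
    #              + sum_{P in {Ws, L, W, V}} lambda_P/2 * ||P||_F^2
    #
    # where t is the one-hot node label and y the node's softmax prediction.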
    def forwardPass(self, model, tree):
        # Traverse the tree bottom-up, populating node vectors and
        # softmax predictions
        if tree.is_leaf():
            # Leaf: look up the word vector in the embedding matrix L
            word_index = self.getWordIndex(model, tree.word)
            tree.word_vector = model.L[:, word_index]
        else:
            # Internal node: recursively process children, then compose
            left_child = tree.subtrees[0]
            right_child = tree.subtrees[1]
            self.forwardPass(model, left_child)
            self.forwardPass(model, right_child)
            tree.word_vector = self.composition(
                model, left_child.word_vector, right_child.word_vector)
        # Apply the elementwise tanh nonlinearity
        tree.word_vector = np.tanh(tree.word_vector)
        # Make the softmax prediction for this node (with bias term)
        tree.prediction = utils.softmax(
            model.Ws.dot(np.append(tree.word_vector, [1])))
        # Accumulate the cross-entropy loss against the one-hot label
        label_vector = self.getLabelVector(model, tree.label)
        self.loss += -1*label_vector.dot(np.log(tree.prediction))
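
    # Backprop preamble: for a softmax output y = softmax(Ws.[h; 1]) with
    # one-hot target t and cross-entropy loss, the error at the softmax
    # input is simply (y - t); prediction_diff below is exactly this
    # quantity, and it feeds both dJ_dWs and the node's delta.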
    def backwardPass(self, model, tree, dJ_dz_prop):
        # Update the classification matrix derivative dJ_dWs
        word_vector_with_bias = np.append(tree.word_vector, [1])
        prediction_diff = tree.prediction - self.getLabelVector(model, tree.label)
        self.dJ_dWs += np.outer(prediction_diff, word_vector_with_bias)
        assert self.dJ_dWs.shape == model.Ws.shape, \
            "classification matrix dim is incorrect"
        # Error at this node's pre-activation from the softmax prediction
        # (drop the bias column of Ws; tanh' = 1 - tanh^2)
        dJ_dz_pred = model.Ws[:, :-1].T.dot(prediction_diff)*(1 - tree.word_vector**2)
        # Add the error propagated down from the parent
        dJ_dz_full = dJ_dz_pred + dJ_dz_prop
        # Branch based on leaf vs non-leaf nodes
        if tree.is_leaf():
            # Leaf node updates the embedding matrix L
            word_index = self.getWordIndex(model, tree.word)
            self.dJ_dL[:, word_index] += dJ_dz_full
        else:
            # Non-leaf node updates the W, V matrices
            c_vector = np.hstack([tree.subtrees[0].word_vector, tree.subtrees[1].word_vector])
            self.dJ_dW += np.outer(dJ_dz_full, np.append(c_vector, [1]))
            assert self.dJ_dW.shape == model.W.shape, \
                "composition W dim is incorrect"
            if model.use_tensor:
                self.dJ_dV += np.tensordot(dJ_dz_full, np.outer(c_vector, c_vector), axes=0).T
            # Propagate the error one layer down to the children
            dJ_dz_down = model.W[:, :-1].T.dot(dJ_dz_full)
            if model.use_tensor:
                dJ_dz_down += (model.V + np.transpose(model.V, axes=[1, 0, 2])).T.dot(c_vector).T.dot(dJ_dz_full)
            assert dJ_dz_down.size == model.dim*2, \
                "down gradient dim is incorrect"
            dJ_dz_down = dJ_dz_down * (1 - c_vector**2)
            dJ_dz_down_left = dJ_dz_down[:model.dim]
            dJ_dz_down_right = dJ_dz_down[model.dim:]
            assert dJ_dz_down_left.size == dJ_dz_down_right.size, \
                "down gradient left&right dim mismatch"
            self.backwardPass(model, tree.subtrees[0], dJ_dz_down_left)
            self.backwardPass(model, tree.subtrees[1], dJ_dz_down_right)
# Helper functions
    def calculateRegularizationCost(self, model):
        # L2 penalty: lambda_P/2 * ||P||_F^2 for each parameter group P
        reg = model.lambda_Ws/2 * np.linalg.norm(model.Ws)**2
        reg += model.lambda_L/2 * np.linalg.norm(model.L)**2
        reg += model.lambda_W/2 * np.linalg.norm(model.W)**2
        if model.use_tensor:
            reg += model.lambda_V/2 * np.linalg.norm(model.V)**2
        return reg
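
    # For each parameter group P, the total gradient combines the averaged
    # data term with the regularizer derivative:
    #
    #   dJ/dP = (1/N) * sum dloss/dP + lambda_P * P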
    def calculateTotalGradient(self, model, scaler):
        # Average the accumulated gradients over the minibatch
        self.dJ_dWs *= scaler
        self.dJ_dL *= scaler
        self.dJ_dW *= scaler
        # Add the regularizer gradients: d/dP (lambda_P/2 ||P||^2) = lambda_P*P
        self.dJ_dWs += model.lambda_Ws * model.Ws
        self.dJ_dL += model.lambda_L * model.L
        self.dJ_dW += model.lambda_W * model.W
        if model.use_tensor:
            self.dJ_dV *= scaler
            self.dJ_dV += model.lambda_V * model.V
            grad = utils.vectorizeParams(
                self.dJ_dWs, self.dJ_dL, self.dJ_dW, self.dJ_dV)
        else:
            grad = utils.vectorizeParams(
                self.dJ_dWs, self.dJ_dL, self.dJ_dW)
        return grad
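
    # RNTN composition (as in Socher et al., 2013): for children a, b with
    # c = [a; b], the parent pre-activation is
    #
    #   z = W.[c; 1] + c^T V c
    #
    # where the tensor term contributes c^T V[:, :, k] c to component k;
    # the tanh nonlinearity is applied afterwards by forwardPass.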
    def composition(self, model, child1, child2):
        # Pre-activation for the parent node (the caller applies tanh)
        c_vector = np.hstack([child1, child2])
        word_vector = model.W.dot(np.append(c_vector, [1]))
        if model.use_tensor:
            # Tensor term: component k is c^T V[:, :, k] c
            word_vector += c_vector.T.dot(model.V.T).dot(c_vector)
        return word_vector
    def getLabelVector(self, model, label):
        # One-hot vector over the K sentiment classes
        label_vector = np.zeros(model.K)
        label_vector[int(label)] = 1
        return label_vector
    def getWordIndex(self, model, word):
        # Map previously unseen words to the UNKNOWN_WORD token's index
        if word in model.word_lookup:
            return model.word_lookup[word]
        return model.word_lookup[model.UNKNOWN_WORD]
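

# ---------------------------------------------------------------------------
# Finite-difference gradient check: a minimal debugging sketch, not part of
# the original module. It assumes `theta` is a full parameter vector of the
# size RNTNModel expects and that `dictionary` and `trees` come from the
# surrounding repo's loading utilities; the function name is illustrative.
def numericalGradientCheck(theta, dictionary, trees, num_checks=5, eps=1e-6):
    # Analytic gradient at theta
    _, grad = ComputeCostAndGradMiniBatch().compute(theta, dictionary, trees)
    # Compare a few random coordinates against central differences:
    #   dJ/dtheta_i ~= (J(theta + eps*e_i) - J(theta - eps*e_i)) / (2*eps)
    for i in np.random.choice(theta.size, num_checks, replace=False):
        theta_plus = theta.copy()
        theta_plus[i] += eps
        theta_minus = theta.copy()
        theta_minus[i] -= eps
        cost_plus, _ = ComputeCostAndGradMiniBatch().compute(
            theta_plus, dictionary, trees)
        cost_minus, _ = ComputeCostAndGradMiniBatch().compute(
            theta_minus, dictionary, trees)
        numeric = (cost_plus - cost_minus) / (2*eps)
        print("theta[%d]: analytic=%+.8e  numeric=%+.8e" % (i, grad[i], numeric))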