import torch
import torch.nn as nn
import torch.nn.functional as F

torch.set_num_threads(1)


class Attn(nn.Module):
    def __init__(self, hidden_size_input, hidden_size):
        super(Attn, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_size_input = hidden_size_input
        # MLP to run over encoder_outputs
        self.attn1 = nn.Linear(self.hidden_size_input + self.hidden_size,
                               self.hidden_size)
        self.attn2 = nn.Linear(self.hidden_size, 1)

    def forward(self, hidden, encoder_outputs, mask):
        """
        Compute the attention distribution from the encoder outputs
        at all timesteps and the previous hidden state of the GRU.
        """
        # Make them both B x seq_len x H
        H = hidden.repeat(encoder_outputs.size(0), 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)
        # Get the scores of each time step in the output
        attn_scores = self.score(H, encoder_outputs)
        # Mask the scores with -inf at each padded character
        # so that softmax assigns zero probability to that cell
        attn_scores = attn_scores.masked_fill(mask, -float('inf'))
        # Return the attention distribution, B x 1 x seq_len
        return F.softmax(attn_scores, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """
        Compute the scores by concatenating each encoder output with the
        previous hidden state of the GRU and running the result through
        a two-layer MLP.
        """
        tanh = nn.Tanh()
        # Concatenate and run through the first Linear layer,
        # applying a non-linearity: tanh(X * W + b), giving B x seq_len x H
        attn1_output = tanh(self.attn1(torch.cat([encoder_outputs, hidden], 2)))
        # Run through the second Linear layer: W * attn1_output + b
        scores = self.attn2(attn1_output)
        # Return B x seq_len
        return scores.squeeze(2)
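

# The sketch below is not part of the original module; it is a minimal shape
# check for Attn with assumed toy sizes (seq_len=7, batch=4, etc.), useful for
# verifying that the attention weights come out as B x 1 x seq_len. It is not
# called anywhere; invoke it manually if desired.
def _attn_shape_check():
    seq_len, batch, h_in, h = 7, 4, 16, 8
    attn = Attn(h_in, h)
    # Previous GRU hidden state, 1 x B x H
    hidden = torch.zeros(1, batch, h)
    # Encoder outputs, seq_len x B x hidden_size_input
    encoder_outputs = torch.randn(seq_len, batch, h_in)
    # No padded positions in this toy example
    mask = torch.zeros(batch, seq_len, dtype=torch.bool)
    weights = attn(hidden, encoder_outputs, mask)
    assert weights.shape == (batch, 1, seq_len)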


class Decoder(nn.Module):
    def __init__(self, hidden_size, embedding_size,
                 output_size, bidirectional_input=False):
        super(Decoder, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size_input = hidden_size * 2 if bidirectional_input \
            else hidden_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.bidirectional_input = bidirectional_input
        # Initial hidden state whose parameters are shared across all examples
        self.h0 = nn.Parameter(torch.rand(self.hidden_size))
        self.embedding = nn.Embedding(self.output_size, self.embedding_size)
        self.gru = nn.GRU(self.hidden_size_input + self.embedding_size,
                          self.hidden_size)
        self.attn = Attn(self.hidden_size_input, self.hidden_size)
        # MLP for mapping the GRU output to a
        # distribution over characters
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, last_hidden, encoder_out,
                batch_size, use_cuda, mask):
        # Embed the input tokens and reshape to 1 x B x embedding_size
        embedded = self.embedding(input).view(1, batch_size, self.embedding_size)
        # Get the attention distribution over the encoder outputs,
        # B x 1 x seq_len
        attn_weights = self.attn(last_hidden, encoder_out, mask)
        # Apply attention to the encoder outputs
        context = torch.bmm(attn_weights, encoder_out.transpose(0, 1))
        # B x 1 x H --> 1 x B x H, to match the embedded dims
        context = context.transpose(0, 1)
        # Concatenate the embedding and the attention context
        # to pass to the GRU
        gru_in = torch.cat((embedded, context), 2)
        # Run the GRU, also passing it the hidden state
        # from the previous GRU step
        output, hidden = self.gru(gru_in, last_hidden)
        # Run it through the MLP
        output = self.out(output)
        # Compute log_softmax scores for NLLLoss
        scores = self.softmax(output)
        return scores, hidden

    def init_hidden(self, batch_size):
        # Copy the h0 tensor B times to get a 1 x B x H tensor
        return self.h0.repeat(1, batch_size, 1)
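

# Illustrative sketch, not part of the original training code: a single
# decoding step for Decoder against random encoder outputs. The vocabulary
# size, sequence length, and batch size are arbitrary assumptions, and the
# SOS index of 0 is hypothetical. It is not called anywhere by the module.
def _decoder_step_example():
    seq_len, batch, h, emb, vocab = 7, 4, 8, 8, 30
    decoder = Decoder(h, emb, vocab, bidirectional_input=False)
    # Encoder outputs, seq_len x B x hidden_size_input (unidirectional here)
    encoder_out = torch.randn(seq_len, batch, h)
    # No padded positions in this toy example
    mask = torch.zeros(batch, seq_len, dtype=torch.bool)
    # Shared learned initial hidden state, 1 x B x H
    last_hidden = decoder.init_hidden(batch)
    # One input token per batch element (assumed SOS index 0)
    input_tokens = torch.zeros(batch, dtype=torch.long)
    scores, hidden = decoder(input_tokens, last_hidden, encoder_out,
                             batch, use_cuda=False, mask=mask)
    # Log-probabilities over the vocabulary, 1 x B x output_size
    assert scores.shape == (1, batch, vocab)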


class PhoneDecoder(nn.Module):
    def __init__(self, hidden_size, embedding_size,
                 output_size, bidirectional_input=False,
                 concat_phone=False):
        super(PhoneDecoder, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size_input = hidden_size * 2 if bidirectional_input \
            else hidden_size
        # Potentially double again to account for concatenated feats
        self.hidden_size_input = self.hidden_size_input * 2 if concat_phone \
            else self.hidden_size_input
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.concat_phone = concat_phone
        self.bidirectional_input = bidirectional_input
        # Initial hidden state whose parameters are shared across all examples
        self.h0 = nn.Parameter(torch.rand(self.hidden_size))
        self.embedding = nn.Embedding(self.output_size, self.embedding_size)
        self.gru = nn.GRU(self.hidden_size_input + self.embedding_size,
                          self.hidden_size)
        self.attn = PhoneAttn(self.hidden_size_input, self.hidden_size)
        # MLP for mapping the GRU output to a
        # distribution over characters
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, last_hidden, encoder_out, use_cuda):
        # Embed the single input token and reshape to 1 x 1 x embedding_size
        embedded = self.embedding(input).view(1, 1, -1)
        if self.bidirectional_input:
            last_encoder_out = encoder_out[0, :, :].unsqueeze(0)
        else:
            last_encoder_out = encoder_out[-1, :, :].unsqueeze(0)
        # Get the attention distribution over the encoder outputs
        attn_weights = self.attn(last_hidden, encoder_out)
        # Apply attention to the encoder outputs
        context = torch.bmm(attn_weights, encoder_out.transpose(0, 1))
        # Concatenate the embedding and the attention context
        # to pass to the GRU
        gru_in = torch.cat((embedded, context), 2)
        # Run the GRU, also passing it the hidden state
        # from the previous GRU step
        output, hidden = self.gru(gru_in, last_hidden)
        # Run it through the MLP
        output = self.out(output)
        # Compute log_softmax scores for NLLLoss
        scores = self.softmax(output)
        return scores, hidden

    def init_hidden(self):
        # Reshape the shared h0 tensor to 1 x 1 x H
        return self.h0.view(1, 1, -1)


class PhoneAttn(nn.Module):
    def __init__(self, hidden_size_input, hidden_size):
        super(PhoneAttn, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_size_input = hidden_size_input
        # MLP to run over encoder_outputs
        self.attn1 = nn.Linear(self.hidden_size_input + self.hidden_size,
                               self.hidden_size)
        self.attn2 = nn.Linear(self.hidden_size, 1)

    def forward(self, hidden, encoder_outputs):
        """
        Compute the attention distribution from the encoder outputs
        at all timesteps and the previous hidden state of the GRU.
        """
        # Make them both B x seq_len x H
        H = hidden.repeat(encoder_outputs.size(0), 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)
        # Get the scores of each time step in the output
        attn_scores = self.score(H, encoder_outputs)
        # Return the attention distribution, B x 1 x seq_len
        return F.softmax(attn_scores, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """
        Compute the scores by concatenating each encoder output with the
        previous hidden state of the GRU and running the result through
        a two-layer MLP.
        """
        tanh = nn.Tanh()
        # Concatenate and run through the first Linear layer,
        # applying a non-linearity: tanh(X * W + b), giving B x seq_len x H
        attn1_output = tanh(self.attn1(torch.cat([encoder_outputs, hidden], 2)))
        # Run through the second Linear layer: W * attn1_output + b
        scores = self.attn2(attn1_output)
        # Return B x seq_len
        return scores.squeeze(2)
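

# Illustrative smoke test, not part of the original repo: run one PhoneDecoder
# step on random encoder outputs with batch size 1 (the only batch size this
# class supports as written). All sizes and the SOS index are assumptions.
if __name__ == "__main__":
    seq_len, h, emb, vocab = 5, 8, 8, 40
    phone_decoder = PhoneDecoder(h, emb, vocab, bidirectional_input=False,
                                 concat_phone=False)
    # Encoder outputs, seq_len x 1 x hidden_size_input
    encoder_out = torch.randn(seq_len, 1, h)
    # Shared learned initial hidden state, 1 x 1 x H
    last_hidden = phone_decoder.init_hidden()
    # A single input token (assumed SOS index 0)
    input_token = torch.tensor([0])
    scores, hidden = phone_decoder(input_token, last_hidden,
                                   encoder_out, use_cuda=False)
    print("PhoneDecoder step log-prob shape:", tuple(scores.shape))  # (1, 1, vocab)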