# -*- coding: utf-8 -*-
'''
A TensorFlow implementation of character-level machine translation,
following the paper 'Neural Machine Translation in Linear Time'
(version updated in 2017): https://arxiv.org/abs/1610.10099.

Note that I've changed one line in
`tensorflow/contrib/layers/python/layers/layers.py` so that layer
normalization computes its moments over the last axis:

  line 1532
    Before: mean, variance = nn.moments(inputs, axis, keep_dims=True)
    After:  mean, variance = nn.moments(inputs, [-1], keep_dims=True)

By Kyubyong Park. kbpark.linguist@gmail.com. https://www.github.com/kyubyong/bytenet
'''
from __future__ import print_function

import os

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from hyperparams import Hyperparams as hp
from prepro import *

def get_batch_data():
    # Load data
    X, Y = load_train_data()

    # Calculate the total number of batches
    num_batch = len(X) // hp.batch_size

    # Convert to tensors
    X = tf.convert_to_tensor(X, tf.int32)
    Y = tf.convert_to_tensor(Y, tf.int32)

    # Create input queues
    input_queues = tf.train.slice_input_producer([X, Y])

    # Create batch queues
    x, y = tf.train.shuffle_batch(input_queues,
                                  num_threads=8,
                                  batch_size=hp.batch_size,
                                  capacity=hp.batch_size * 64,
                                  min_after_dequeue=hp.batch_size * 32,
                                  allow_smaller_final_batch=False)

    return x, y, num_batch  # (64, 100), (64, 100), ()
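
# Note: slice_input_producer / shuffle_batch are TF 1.x queue runners. On
# recent TF 1.x releases the same pipeline can be written with tf.data.
# A minimal sketch (an assumption of this note, not what the repo ships):
#
#   ds = tf.data.Dataset.from_tensor_slices((X, Y))
#   ds = ds.shuffle(buffer_size=hp.batch_size * 32).repeat()
#   ds = ds.batch(hp.batch_size, drop_remainder=True)
#   x, y = ds.make_one_shot_iterator().get_next()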

def embed(inputs, vocab_size, embed_size, scope="embed"):
    '''
    Args:
      inputs: A 2-D tensor of [batch, time].
      vocab_size: An int. Size of the vocabulary.
      embed_size: An int. Number of embedding units.

    Returns:
      A 3-D embedded tensor of [batch, time, embed_size] in which
      index zero is mapped to a constant zero vector.
    '''
    with tf.variable_scope(scope):
        lookup_table_for_zero = tf.zeros(shape=[1, embed_size], dtype=tf.float32)
        lookup_table_for_others = tf.get_variable('lookup_table',
                                                  dtype=tf.float32,
                                                  shape=[vocab_size - 1, embed_size],
                                                  initializer=tf.contrib.layers.xavier_initializer())
        lookup_table = tf.concat((lookup_table_for_zero, lookup_table_for_others), 0)
        return tf.nn.embedding_lookup(lookup_table, inputs)
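
# A quick check (hedged sketch): with vocab_size=4 and embed_size=2, index 0
# always embeds to [0., 0.] because row 0 of the lookup table is a constant
# zero row, while indices 1..3 get learned vectors:
#
#   emb = embed(tf.constant([[0, 3]]), vocab_size=4, embed_size=2)
#   # emb[0, 0] -> [0., 0.] (padding); emb[0, 1] -> a trainable vector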

def normalize_activate(inputs, scope="norm1"):
    '''
    Args:
      inputs: A 3-D or 4-D tensor.

    Returns:
      A tensor of the same shape as `inputs`, layer-normalized and then
      activated by ReLU.
    '''
    return tf.contrib.layers.layer_norm(inputs=inputs, center=True, scale=True,
                                        activation_fn=tf.nn.relu, scope=scope)

def conv1d(inputs,
           filters,
           size=1,
           rate=1,
           padding="SAME",
           causal=False,
           use_bias=False,
           scope="conv1d"):
    '''
    Args:
      inputs: A 3-D tensor of [batch, time, depth].
      filters: An int. Number of outputs (=activation maps).
      size: An int. Filter size.
      rate: An int. Dilation rate.
      padding: Either `SAME` or `VALID`.
      causal: A boolean. If True, (size - 1) * rate zeros are padded on the
        left for causality.
      use_bias: A boolean.

    Returns:
      A 3-D tensor of [batch, time, filters].
    '''
    with tf.variable_scope(scope):
        if causal:
            # Pre-padding for causality
            pad_len = (size - 1) * rate  # padding size
            inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
            padding = "VALID"

        params = {"inputs": inputs, "filters": filters, "kernel_size": size,
                  "dilation_rate": rate, "padding": padding, "activation": None,
                  "use_bias": use_bias}

        out = tf.layers.conv1d(**params)
    return out
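
# Causality check (a worked example, not from the repo): with size=3 and
# rate=2, pad_len = (3 - 1) * 2 = 4 zeros are prepended, so after the VALID
# convolution the output at step t only reads inputs at steps t, t-2 and t-4,
# never anything to the right of t.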

def block(tensor,
          size=3,
          rate=1,
          initial=False,
          causal=False,
          scope="block1"):
    '''
    Refer to Figure 3 on page 4 of the original paper.

    Args:
      tensor: A 3-D tensor of [batch, time, depth].
      size: An int. Filter size.
      rate: An int. Dilation rate.
      initial: A boolean. If True, `tensor` is not normalized and activated
        at first.
      causal: A boolean. If True, (size - 1) * rate zeros are prepadded
        for causality.

    Returns:
      A tensor of the same shape as `tensor`.
    '''
    with tf.variable_scope(scope):
        out = tensor

        # Input dimension
        in_dim = out.get_shape().as_list()[-1]

        if not initial:
            out = normalize_activate(out, scope="norm_1")

        # 1 x 1 convolution -> dimensionality reduction
        out = conv1d(out, filters=in_dim // 2, size=1, causal=causal, scope="conv1d_1")

        # Normalize and activate
        out = normalize_activate(out, scope="norm_2")

        # 1 x k dilated convolution
        out = conv1d(out, filters=in_dim // 2, size=size, rate=rate, causal=causal, scope="conv1d_2")

        # Normalize and activate
        out = normalize_activate(out, scope="norm_3")

        # 1 x 1 convolution -> dimension recovery
        out = conv1d(out, filters=in_dim, size=1, causal=causal, scope="conv1d_3")

        # Residual connection
        out += tensor
    return out
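
# Receptive-field arithmetic (derived here, not stated in the repo): the two
# 1x1 convolutions add nothing, and the dilated 1xk convolution adds
# (size - 1) * rate steps. One pass over rates (1, 2, 4, 8, 16) with size=3
# therefore adds 2 * (1+2+4+8+16) = 62 steps, i.e. a receptive field of 63
# per pass, and hp.num_blocks passes widen it to 1 + 62 * hp.num_blocks.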

class Graph():
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
                self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2: BOS
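                # e.g. a target row [5, 6, 7, 0] becomes the decoder input
                # [2, 5, 6, 7]: at step t the decoder is fed the gold token
                # at t-1 (teacher forcing).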
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.decoder_inputs = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # Load vocabulary
            char2idx, idx2char = load_vocab()

            # Embedding
            self.enc = embed(self.x, len(char2idx), hp.hidden_units, scope="embed_enc")
            self.dec = embed(self.decoder_inputs, len(char2idx), hp.hidden_units, scope="embed_dec")

            # Encoding
            for i in range(hp.num_blocks):
                for rate in (1, 2, 4, 8, 16):
                    self.enc = block(self.enc,
                                     size=5,
                                     rate=rate,
                                     causal=False,
                                     initial=True if (i == 0 and rate == 1) else False,
                                     scope="enc_block_{}_{}".format(i, rate))  # (N, T, C)
            # Decoding
            self.dec = tf.concat((self.enc, self.dec), -1)
            for i in range(hp.num_blocks):
                for rate in (1, 2, 4, 8, 16):
                    self.dec = block(self.dec,
                                     size=3,
                                     rate=rate,
                                     causal=True,
                                     scope="dec_block_{}_{}".format(i, rate))

            # Final 1 x 1 convolutional layer for softmax
            self.logits = conv1d(self.dec, filters=len(char2idx), use_bias=True)  # (N, T, V)
            if is_training:
                # Loss: cross-entropy averaged over non-padding positions
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)  # (N, T)
                istarget = tf.to_float(tf.not_equal(self.y, 0))  # 0 where y is padding (id 0), 1 elsewhere (N, T)
                self.loss = tf.reduce_sum(ce * istarget) / (tf.reduce_sum(istarget) + 1e-8)
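                # e.g. for a row y = [5, 6, 7, 0] the mask is [1., 1., 1., 0.],
                # so the padded position contributes nothing to the mean loss.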
                # Training
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.train_op = tf.train.AdamOptimizer(learning_rate=hp.lr)\
                                        .minimize(self.loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('loss', self.loss)
                self.merged = tf.summary.merge_all()

            # Predictions
            self.preds = tf.argmax(self.logits, axis=-1)
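
# At inference time (is_training=False) decoding must run autoregressively.
# A minimal greedy sketch, assuming a session with restored weights and a
# source batch `x_test` (the repo's own eval code may differ):
#
#   g = Graph(is_training=False)
#   dec_in = np.zeros((len(x_test), hp.maxlen), np.int32)
#   dec_in[:, 0] = 2  # BOS
#   for t in range(1, hp.maxlen):
#       preds = sess.run(g.preds, {g.x: x_test, g.decoder_inputs: dec_in})
#       dec_in[:, t] = preds[:, t - 1]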

def main():
    g = Graph(is_training=True); print("Graph loaded")
    char2idx, idx2char = load_vocab()

    sv = tf.train.Supervisor(graph=g.graph,
                             logdir=hp.logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        # Training
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)

            gs = sess.run(g.global_step)
            sv.saver.save(sess, hp.logdir + '/model_epoch_%02d_gs_%d' % (epoch, gs))

if __name__ == '__main__':
    main()
    print("Done")