From 33d47f0ac2edf0a276ba7efc3bd7dfd893bdb9d1 Mon Sep 17 00:00:00 2001
From: Niki Parmar
Date: Thu, 27 Jul 2017 23:46:12 -0700
Subject: [PATCH 01/17] Add tests for dot product and local unmasked attention

PiperOrigin-RevId: 163436731
---
 .gitignore                                    |   1 -
 README.md                                     |   4 +-
 tensor2tensor/data_generators/wmt.py          |   2 +-
 tensor2tensor/models/common_attention.py      | 244 ++++++++++--------
 tensor2tensor/models/common_attention_test.py |  84 +++---
 tensor2tensor/models/common_layers.py         | 146 ++++-------
 .../models/transformer_alternative.py         |  90 +++----
 7 files changed, 265 insertions(+), 306 deletions(-)

diff --git a/.gitignore b/.gitignore
index fbd98dca5..c9dd3db88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,6 @@ _pycache__/
 # PyPI distribution artifacts.
 build/
 dist/
-data/
 
 # Sublime project files
 *.sublime-project
diff --git a/README.md b/README.md
index bb0f6f534..edd6460d0 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ t2t-datagen \
   --problem=$PROBLEM
 
 # Train
-# * If you run out of memory, add --hparams='batch_size=1024'.
+# * If you run out of memory, add --hparams='batch_size=2048' or even 1024.
 t2t-trainer \
   --data_dir=$DATA_DIR \
   --problems=$PROBLEM \
@@ -166,7 +166,7 @@ python -c "from tensor2tensor.models.transformer import Transformer"
   with `Modality` objects, which are specified per-feature in the dataset/task
   specification.
 * Support for multi-GPU machines and synchronous (1 master, many workers) and
-  asynchrounous (independent workers synchronizing through a parameter server)
+  asynchronous (independent workers synchronizing through a parameter server)
   [distributed training](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md).
 * Easily swap amongst datasets and models by command-line flag with the data
   generation script `t2t-datagen` and the training script `t2t-trainer`.
diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py
index bcd29e1d4..456f36321 100644
--- a/tensor2tensor/data_generators/wmt.py
+++ b/tensor2tensor/data_generators/wmt.py
@@ -574,7 +574,7 @@ class WMTEnCsTokens32k(WMTProblem):
   """Problem spec for WMT English-Czech translation."""
 
   @property
-  def targeted_vocab_size(self):
+  def target_vocab_size(self):
     return 2**15  # 32768
 
   @property
diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py
index 94d75b48d..95e982790 100644
--- a/tensor2tensor/models/common_attention.py
+++ b/tensor2tensor/models/common_attention.py
@@ -344,33 +344,23 @@ def dot_product_attention(q,
     return tf.matmul(weights, v)
 
 
-def local_attention_1d(q, k, v, bias=None,
-                       block_length=128, look_right=True, use_whole_block=False,
-                       truncate_bias=True, name=None):
-  """Attention to the source position and a neigborhood around it.
+def masked_local_attention_1d(
+    q, k, v, block_length=128, name=None):
+  """Attention to the source position and a neighborhood to the left of it.
 
-  The sequence is divided into blocks of length block_size. Attention for a
-  given query position can only see memory positions within a certain number
-  of positions before and behind it.
+  The sequence is divided into blocks of length block_size.
+  Attention for a given query position can only see memory positions
+  less than or equal to the query position, in the corresponding block
+  and the previous block.
- If look_right is True then each query will attend to block_length//2 - positions either side, otherwise it will attend to block_length previous + If mask_right is True, then a target position cannot see greater source positions. - If use_whole_block is True then no mask will be applied to the local blocks - meaning the full blocks are used (if look_right is True then the elements to - the right of the current position are still masked out). This allows use to - attend to more elements without additional overhead, but means we have - inconsistent window positions and sizes. - Args: - q: a Tensor with shape [batch, heads, length_q, depth_k] - k: a Tensor with shape [batch, heads, length_kv, depth_k] - v: a Tensor with shape [batch, heads, length_kv, depth_v] - bias: Not currently used [batch, heads, length_q, length_k] + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] block_length: an integer - look_right: a bool - use_whole_block: a bool name: an optional string Returns: @@ -382,110 +372,146 @@ def local_attention_1d(q, k, v, bias=None, batch = tf.shape(q)[0] heads = tf.shape(q)[1] length = tf.shape(q)[2] + # If (length < 2 * block_length), then we use only one block. + block_length = tf.where(tf.less(length, block_length * 2), + length, block_length) depth_k = tf.shape(q)[3] depth_v = tf.shape(v)[3] - original_length = length - - #Pad to desired length - #If (length < block_length), then we use only one block. - block_length = tf.where(tf.less(length, block_length), - length, block_length) padding_size = tf.mod(-length, block_length) length += padding_size - num_blocks = tf.div(length, block_length) - padding = [[0, 0], [0, 0], [0, padding_size], [0, 0]] q = tf.pad(q, padding) + k = tf.pad(k, padding) + v = tf.pad(v, padding) + num_blocks = tf.div(length, block_length) - if not look_right: - #Add extra padding so we son't have to do an initial query - extra_padding = [[0, 0], [0, 0], [block_length, padding_size], [0, 0]] - bp = [[0, 0], [0, 0], [0, padding_size], [block_length, padding_size]] - else: - #We shift everything over by half a block so query is in centre - pad_right = block_length // 2 - pad_left = block_length - pad_right - extra_padding = [[0, 0], [0, 0], - [pad_left, padding_size+pad_right], [0, 0]] - bp = [[0, 0], [0, 0], - [0, padding_size], [pad_left, padding_size+pad_right]] - k = tf.pad(k, extra_padding) - v = tf.pad(v, extra_padding) - - # Reshape into blocks + # compute attention for the first query block. + first_q = tf.slice(q, [0, 0, 0, 0], [-1, -1, block_length, -1]) + first_k = tf.slice(k, [0, 0, 0, 0], [-1, -1, block_length, -1]) + first_v = tf.slice(v, [0, 0, 0, 0], [-1, -1, block_length, -1]) + first_output = dot_product_attention( + first_q, first_k, first_v, attention_bias_lower_triangle(block_length), + name="fist_block") + + # compute attention for all subsequent query blocks. 
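+  # The remaining blocks are handled together: q, k and v are reshaped to
+  # [batch, heads, num_blocks, block_length, depth], and each query block
+  # attends only to its own block plus the previous one, so the cost is
+  # O(length * block_length) rather than O(length**2) for full attention.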
q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k]) - k = tf.reshape(k, [batch, heads, num_blocks+1, block_length, depth_k]) - v = tf.reshape(v, [batch, heads, num_blocks+1, block_length, depth_v]) + k = tf.reshape(k, [batch, heads, num_blocks, block_length, depth_k]) + v = tf.reshape(v, [batch, heads, num_blocks, block_length, depth_v]) - # Get local blocks by slicing def local(x): """Create a local version of the keys or values.""" prev_block = tf.slice( - x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1]) + x, [0, 0, 0, 0, 0], [-1, -1, num_blocks - 1, -1, -1]) cur_block = tf.slice( x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) return tf.concat([prev_block, cur_block], 3) local_k = local(k) local_v = local(v) - local_length = tf.shape(local_k)[3] + tail_q = tf.slice(q, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1]) - # [batch, heads, num_blocks, block_length, local_length] - attention = tf.matmul(q, local_k, transpose_b=True) - - # Apply bias (N.B: This is not currently working) - if bias is not None: - with tf.name_scope('bias'): - b_batch = tf.shape(bias)[0] - b_heads = tf.shape(bias)[1] - bias_ = bias - #bias = 1.0 + tf.clip_by_value(bias, -1.0, 1.0) - if truncate_bias: - # Use only the query dimension - bias = tf.expand_dims(bias[:,:,:,0], 2) - bias = tf.pad(bias, extra_padding, name='bias_pad_b')# 17, 5, 3 - bias = tf.reshape(bias, - [b_batch, b_heads, 1, num_blocks+1, block_length], - name='divide_blocks') - local_b = tf.reshape(local(bias), - [b_batch, b_heads, num_blocks, 1, -1], name='reshape_local') - else: - bias = tf.pad(bias, bp, name='pad') - bias = tf.reshape(bias, - [b_batch, b_heads, num_blocks, block_length, - num_blocks+1, block_length], name='divide_blocks') - bias = tf.transpose(bias, [4,2,0,1,3,5]) - bias = tf.reshape(bias, - [num_blocks*(num_blocks+1), b_batch, b_heads, - block_length, block_length], name='combine') - indices = (num_blocks+1)*tf.range(num_blocks) - prev_block = tf.gather(bias, indices) - cur_block = tf.gather(bias, indices+num_blocks) - local_b = tf.concat([prev_block, cur_block], 4) - local_b = tf.transpose(local_b, [1,2,0,3,4]) - return l-local_b - attention += local_b - - attention = tf.nn.softmax(attention) - - # Get local mask - if not use_whole_block: - good_part = tf.matrix_band_part( - tf.ones([block_length, local_length]), 0, tf.to_int64(block_length)) - elif not look_right: - good_part = tf.matrix_band_part( - tf.ones([block_length, local_length]), -1, tf.to_int64(block_length)) - else: - good_part = tf.ones([block_length, local_length]) + local_length = tf.shape(local_k)[3] - #good_part = tf.cast(good_part, tf.float64) - attention *= tf.reshape(good_part, [1, 1, 1, block_length, local_length]) + # [batch, heads, num_blocks - 1, block_length, local_length] + attention = tf.matmul(tail_q, local_k, transpose_b=True) - + # make sure source_pos <= target_pos + good_part = tf.matrix_band_part( + tf.ones([block_length, local_length]), -1, tf.to_int64(block_length)) + mask = (1.0 - good_part) * -1e9 + attention += tf.reshape(mask, [1, 1, 1, block_length, local_length]) + attention = tf.nn.softmax(attention) + # TODO(noam): figure out how to show a summary for the remaining blocks. + # The naive way currently causes errors due to empty tensors. 
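+  # Reassemble the result: concatenate the first block's output with the
+  # outputs of the remaining blocks, then slice off the padding added above.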
+ # output: [batch, heads, num_blocks-1, block_length, depth_v] output = tf.matmul(attention, local_v) output = tf.reshape(output, [batch, heads, -1, depth_v]) + output = tf.concat([first_output, output], axis=2) + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + +def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + filter_width: an integer indicating how much to look left. + name: an optional string - # Remove added padding + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope(name, default_name="local_self_attention_1d", + values=[q, k, v]): + v_shape = v.get_shape() + depth_v = tf.shape(v)[3] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] + # making sure q is a multiple of d + def pad_to_multiple(x, pad_length): + x_length = tf.shape(x)[2] + return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): + return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) + k = pad_to_multiple(k, block_length) + v = pad_to_multiple(v, block_length) + + # Setting up q blocks + new_q_shape = tf.shape(q) + # Setting up q blocks + q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], + new_q_shape[2]//block_length, + block_length, new_q_shape[3]]) + + # Setting up k and v values + k = pad_l_and_r(k, filter_width) + v = pad_l_and_r(v, filter_width) + + length = tf.shape(k)[2] + full_filter_width = block_length + 2*filter_width + # getting gather indices + indices = tf.range(0, length, delta=1, name="index_range") + # making indices [1, length, 1] to appy convs + indices = tf.reshape(indices, [1, -1, 1]) + kernel = tf.expand_dims(tf.eye(full_filter_width), axis=1) + gather_indices = tf.nn.conv1d( + tf.cast(indices, tf.float32), + kernel, + block_length, + padding="VALID", + name="gather_conv") + + gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0) + + # [length, batch, heads, dim] + k_t = tf.transpose(k, [2, 0, 1, 3]) + k_new = tf.gather(k_t, gather_indices) + + # [batch, heads, blocks, block_length, dim] + k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) + + attention_bias = tf.expand_dims( + tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + + v_t = tf.transpose(v, [2, 0, 1, 3]) + v_new = tf.gather(v_t, gather_indices) + v_new = tf.transpose(v_new, [2, 3, 0, 1, 4]) + + logits = tf.matmul(q, k_new, transpose_b=True) + + attention = tf.nn.softmax(logits+attention_bias) + output = tf.matmul(attention, v_new) + + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) output.set_shape(v_shape) return output @@ -502,6 +528,7 @@ def multihead_attention(query_antecedent, image_shapes=None, attention_type="dot_product", block_length=128, + block_width=128, name=None): """Multihead scaled-dot-product attention with input/output transformations. @@ -516,9 +543,10 @@ def multihead_attention(query_antecedent, dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. 
see comments for attention_image_summary() - attention_type: a string, either "dot_product" or "local" or - "local_mask_right" + attention_type: a string, either "dot_product" or "local_mask_right" or + "local_unmasked" block_length: an integer - relevant for "local_mask_right" + block_width: an integer - relevant for "local_unmasked" name: an optional string Returns: @@ -566,12 +594,12 @@ def multihead_attention(query_antecedent, if attention_type == "dot_product": x = dot_product_attention( q, k, v, bias, dropout_rate, image_shapes) - elif attention_type == "local": - x = local_attention_1d(q, k, v, block_length=block_length) + elif attention_type == "local_mask_right": + x = masked_local_attention_1d(q, k, v, block_length=block_length) else: - assert attention_type == "local_mask_right" - x = local_attention_1d( - q, k, v, block_length=block_length, look_right=False) + assert attention_type == "local_unmasked" + x = unmasked_local_attention_1d(q, k, v, block_length=block_length, + filter_width=block_width) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x diff --git a/tensor2tensor/models/common_attention_test.py b/tensor2tensor/models/common_attention_test.py index 2e534ba1a..78be4b645 100644 --- a/tensor2tensor/models/common_attention_test.py +++ b/tensor2tensor/models/common_attention_test.py @@ -1,4 +1,5 @@ -# Copyright 2017 Google Inc. +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for common layers.""" +"""Tests for common attention.""" from __future__ import absolute_import from __future__ import division @@ -28,54 +29,43 @@ class CommonAttentionTest(tf.test.TestCase): - def testLocalAttention(self): - q = np.array([[[ [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0] ]]]) - - k = np.array([[[ [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0] ]]]) - - b = np.array([[[ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] ]]]) - - #b = np.ones((1,1,8,8)) - #b = (1-b) * (-1e9) - v = np.ones((1, 1, 8, 1)) + def testDotProductAttention(self): + x = np.random.rand(5, 7, 12, 32) + y = np.random.rand(5, 7, 12, 32) + with self.test_session() as session: + a = common_attention.dot_product_attention( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), None) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 7, 12, 32)) - #q = np.random.rand(5, 7, 13, 3) - #k = np.random.rand(5, 7, 13, 3) - #v = np.random.rand(5, 7, 13, 11) - #b = np.random.rand(5, 1, 13, 1) + def testLocalUnmaskedAttention(self): + x = np.random.rand(5, 4, 25, 16) + y = np.random.rand(5, 4, 25, 16) + with self.test_session() as 
session: + a = common_attention.unmasked_local_attention_1d( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + block_length=4, filter_width=3) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 4, 25, 16)) + def testLocalUnmaskedAttentionMatchingBlockLength(self): + x = np.random.rand(5, 4, 25, 16) + y = np.random.rand(5, 4, 25, 16) with self.test_session() as session: - q_ = tf.constant(q) - k_ = tf.constant(k) - v_ = tf.constant(v) - b_ = tf.constant(b) - y = common_attention.local_attention_1d(q_, k_, v_, b_, block_length=tf.constant(2)) - res = session.run(y) - #print(q) - #rint(k) - print(res) - #self.assertEqual(res.shape, (5, 7, 13, 11)) + a = common_attention.unmasked_local_attention_1d( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + block_length=5, filter_width=3) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 4, 25, 16)) if __name__ == "__main__": diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index ae6d0cede..e98531d88 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -1420,22 +1420,22 @@ def smoothing_cross_entropy(logits, labels, vocab_size, confidence): return xentropy - normalizing +def global_pool_1d(inputs, pooling_type="MAX", mask=None): + """Pool elements across the last dimension. -def global_pool_1d(inputs, pooling_type='MAX', mask=None): - """ - Pools elements across the last dimension. Useful to a list of vectors into a - single vector to get a representation of a set. - Concatenating - - Args - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - pooling_type: the pooling type to use, MAX or AVR - mask: A tensor of dimensions batch_size x sequence_length containing a - mask for the inputs with 1's for existing elements, and 0's elsewhere. - Returns - output: A tensor of dimensions batch_size x input_dims - dimension containing the sequences of transformed vectors. + Useful to convert a list of vectors into a single vector so as + to get a representation of a set. + + Args: + inputs: A tensor of dimensions batch_size x sequence_length x input_dims + containing the sequences of input vectors. + pooling_type: the pooling type to use, MAX or AVR + mask: A tensor of dimensions batch_size x sequence_length containing a + mask for the inputs with 1's for existing elements, and 0's elsewhere. + + Returns: + output: A tensor of dimensions batch_size x input_dims + dimension containing the sequences of transformed vectors. """ with tf.name_scope("global_pool", [inputs]): if mask is not None: @@ -1457,38 +1457,6 @@ def global_pool_1d(inputs, pooling_type='MAX', mask=None): return output - -def running_global_pool_1d(inputs, pooling_type='MAX'): - """ - Same global pool, but only for the elements up to the current element. Useful - for outputs where the state of future elements is not known. - Takes no mask as all elements up to the current element are assumed to exist. - Currently only supports maximum. Equivalent to using a lower triangle bias. - - Args - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - pooling_type: Pooling type to use. Currently only supports 'MAX'. 
- Returns - output: A tensor of dimensions batch_size x sequence_length x input_dims - dimension containing the running 'totals'. - """ - - with tf.name_scope("running_global_pool", [inputs]): - scan_fct = tf.maximum - - # Permute inputs so seq_length is first - elems = tf.transpose(inputs, [1, 0, 2]) - - # Perform scan - cumulatives = tf.scan(scan_fct, elems, swap_memory=True) - - # Permute output to get back to original order - output = tf.transpose(cumulatives, [1, 0, 2]) - - return output - - def linear_set_layer(layer_size, inputs, context=None, @@ -1502,19 +1470,21 @@ def linear_set_layer(layer_size, e.g. One can use global_pool_1d to get a representation of the set which can then be used as the context for the next layer. - Args - layer_size: Dimension to transform the input vectors to - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - context: A tensor of dimensions batch_size x context_dims or batch_size x - sequence_length x context_dims containing a global statistic about the - set. - dropout: Dropout probability. - activation_fn: The activation function to use. - Returns - output: A tensor of dimensions batch_size x sequence_length x output_dims - dimension containing the sequences of transformed vectors. + TODO: Add bias add (or control the biases used). + Args: + layer_size: Dimension to transform the input vectors to. + inputs: A tensor of dimensions batch_size x sequence_length x input_dims + containing the sequences of input vectors. + context: A tensor of dimensions batch_size x context_dims + containing a global statistic about the set. + activation_fn: The activation function to use. + dropout: Dropout probability. + name: name. + + Returns: + output: A tensor of dimensions batch_size x sequence_length x output_dims + dimension containing the sequences of transformed vectors. """ with tf.variable_scope(name, "linear_set_layer", [inputs]): # Apply 1D convolution to apply linear filter to each element @@ -1524,12 +1494,10 @@ def linear_set_layer(layer_size, # Apply the context if it exists. if context is not None: # Unfortunately tf doesn't support broadcasting via concat, but we can - # simply add the transformed context to get the same effect - if len(context.get_shape().as_list())==2: - context = tf.expand_dims(context, axis=1) - #context_size = context.get_shape().as_list()[-1] - cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + # simply add the transformed context to get the same effect. + context = tf.expand_dims(context, axis=1) + cont_tfm = conv1d( + context, layer_size, 1, activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: @@ -1544,7 +1512,6 @@ def linear_set_layer(layer_size, def ravanbakhsh_set_layer(layer_size, inputs, mask=None, - sequential=False, activation_fn=tf.nn.tanh, dropout=0.0, name=None): @@ -1552,35 +1519,26 @@ def ravanbakhsh_set_layer(layer_size, More parameter-efficient verstion of a linear-set-layer with context. - Args - layer_size: Dimension to transform the input vectors to. - inputs: A tensor of dimensions batch_size x sequence_length x vector - containing the sequences of input vectors. - mask: A tensor of dimensions batch_size x sequence_length containing a - mask for the inputs with 1's for existing elements, and 0's elsewhere. - sequential: If true, will use a running global pool so each element will - only depend on those before it. Set true if this layer is being used in - an ouput sequence. 
- Returns - output: A tensor of dimensions batch_size x sequence_length x vector - dimension containing the sequences of transformed vectors. + Args: + layer_size: Dimension to transform the input vectors to. + inputs: A tensor of dimensions batch_size x sequence_length x vector + containing the sequences of input vectors. + mask: A tensor of dimensions batch_size x sequence_length containing a + mask for the inputs with 1's for existing elements, and 0's elsewhere. + activation_fn: The activation function to use. + dropout: dropout. + name: name. + + Returns: + output: A tensor of dimensions batch_size x sequence_length x vector + dimension containing the sequences of transformed vectors. """ with tf.variable_scope(name, "ravanbakhsh_set_layer", [inputs]): - - if sequential: - output = linear_set_layer( - layer_size, - inputs - running_global_pool_1d(inputs), - activation_fn=activation_fn, - name=name) - else: - output = linear_set_layer( - layer_size, - inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1), - activation_fn=activation_fn, - name=name) - - return output - + output = linear_set_layer( + layer_size, + inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1), + activation_fn=activation_fn, + dropout=dropout, + name=name) return output diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 78398471a..62413c325 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -50,13 +50,17 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) + (encoder_input, encoder_attention_bias, + _) = transformer.transformer_prepare_encoder(inputs, target_space, hparams) + (decoder_input, + decoder_self_attention_bias) = transformer.transformer_prepare_decoder( + targets, hparams) - (encoder_input, encoder_attention_bias, _) = (transformer.\ - transformer_prepare_encoder(inputs, target_space, hparams) ) - (decoder_input, decoder_self_attention_bias) = transformer.\ - transformer_prepare_decoder(targets, hparams) - - encoder_mask = bias_to_mask(encoder_attention_bias) + # We need masks of the form batch size x input sequences + # Biases seem to be of the form batch_size x 1 x input sequences x vec dim + # Squeeze out dim one, and get the first element of each vector. + encoder_mask = tf.squeeze(encoder_attention_bias, [1])[:, :, 0] + decoder_mask = tf.squeeze(decoder_self_attention_bias, [1])[:, :, 0] def residual_fn(x, y): return common_layers.layer_norm(x + tf.nn.dropout( @@ -64,20 +68,20 @@ def residual_fn(x, y): encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - encoder_output = alt_transformer_encoder( encoder_input, residual_fn, encoder_mask, hparams) decoder_output = alt_transformer_decoder( - decoder_input, encoder_output, residual_fn, + decoder_input, encoder_output, residual_fn, decoder_mask, encoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output - -def composite_layer(inputs, mask, hparams, for_output=False): + +def composite_layer(inputs, mask, hparams): + """Composite layer.""" x = inputs # Applies ravanbakhsh on top of each other. 
@@ -85,32 +89,28 @@ def composite_layer(inputs, mask, hparams, for_output=False): for layer in xrange(hparams.layers_per_layer): with tf.variable_scope(".%d" % layer): x = common_layers.ravanbakhsh_set_layer( - hparams.hidden_size, - x, - mask=mask, - sequential=for_output, - dropout=hparams.relu_dropout) - - # Transforms elements to get a context, and then uses this in a final layer + hparams.hidden_size, + x, + mask=mask, + dropout=0.0) + + # Transforms elements to get a context, and then uses this in a final layer. elif hparams.composite_layer_type == "reembedding": # Transform elements n times and then pool. for layer in xrange(hparams.layers_per_layer): - with tf.variable_scope("sub_layer_%d" % layer): + with tf.variable_scope(".%d" % layer): x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - dropout=hparams.relu_dropout) - if for_output: - context = common_layers.running_global_pool_1d(x) - else: - context = common_layers.global_pool_1d(x, mask=mask) - - #Final layer - x = common_layers.linear_set_layer( hparams.hidden_size, x, - context=context, - dropout=hparams.relu_dropout) + dropout=0.0) + context = common_layers.global_pool_1d(x, mask=mask) + + # Final layer. + x = common_layers.linear_set_layer( + hparams.hidden_size, + x, + context=context, + dropout=0.0) return x @@ -120,12 +120,10 @@ def alt_transformer_encoder(encoder_input, mask, hparams, name="encoder"): - """Alternative encoder.""" x = encoder_input with tf.variable_scope(name): - x = encoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = residual_fn(x, composite_layer(x, mask, hparams)) @@ -136,12 +134,14 @@ def alt_transformer_encoder(encoder_input, def alt_transformer_decoder(decoder_input, encoder_output, residual_fn, + mask, encoder_decoder_attention_bias, hparams, name="decoder"): + """Alternative decoder.""" + x = decoder_input with tf.variable_scope(name): - x = decoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): @@ -156,33 +156,17 @@ def alt_transformer_decoder(decoder_input, hparams.attention_dropout, name="encdec_attention") - x_ = residual_fn(x_, composite_layer(x_, None, hparams, for_output=True)) + x_ = residual_fn(x_, composite_layer(x_, mask, hparams)) x = residual_fn(x, x_) - - return x - - -def bias_to_mask(bias): - # We need masks of the form batch size x input sequences - # Biases are of the form batch_size x num_heads x input sequences x - # output sequences. Squeeze out dim one, and get the first element of - # each vector. - - bias = tf.squeeze(bias, [1])[:,:,0] - bias = - tf.clip_by_value(bias, -1.0, 1.0) - mask = 1 - bias - return mask + return x @registry.register_hparams def transformer_alt(): """Set of hyperparameters.""" hparams = transformer.transformer_base() - hparams.batch_size = 2048 - hparams.num_hidden_layers = 10 + hparams.batch_size = 64 hparams.add_hparam("layers_per_layer", 4) - hparams.add_hparam("composite_layer_type", "ravanbakhsh") #ravanbakhsh or reembedding - #hparams.add_hparam("composite_layer_type", "reembedding") - + hparams.add_hparam("composite_layer_type", "reembedding") return hparams From 7db5ee881c6e9b961ff83fd316c6e983951fdf76 Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Fri, 28 Jul 2017 14:12:04 -0700 Subject: [PATCH 02/17] Forgot to change trainer_utils. Added an option for "label" input type and printing out image decodes without decoding with the vocabulary. 
PiperOrigin-RevId: 163516796 --- tensor2tensor/utils/trainer_utils.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 260ec6a00..91a638529 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -130,6 +130,7 @@ "\t..\t") flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") +flags.DEFINE_bool("identity_output", False, "To print the output as identity") def _save_until_eos(hyp): @@ -766,8 +767,11 @@ def decode_interactively(estimator): else: tf.logging.info(beam_string) else: - tf.logging.info( - targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info(targets_vocab.decode(_save_until_eos( + result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -843,7 +847,7 @@ def _interactive_input_fn(hparams): const_array_size = 10000 while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image')\n" + " it= ('text' or 'image' or 'label')\n" " pr= (set the problem number)\n" " in= (set the input problem number)\n" " ou= (set the output problem number)\n" @@ -894,6 +898,13 @@ def _interactive_input_fn(hparams): "inputs": img, "problem_choice": np.array(problem_id) } + elif input_type == "label": + input_ids = [int(input_string)] + x = [num_samples, decode_length, len(input_ids)] + input_ids + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } else: raise Exception("Unsupported input type.") From 0416d957cc04f6778cb18d11e4426522e1c69b1d Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 28 Jul 2017 14:54:37 -0700 Subject: [PATCH 03/17] Prevent extremely unlikely bug around escaping characters. PiperOrigin-RevId: 163522446 --- tensor2tensor/data_generators/text_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index ff284bcc6..21215472d 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -54,7 +54,7 @@ # '\\' is converted to '\' # '\213;' is converted to unichr(213) _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") -_ESCAPE_CHARS = set(u"\\_;0123456789") +_ESCAPE_CHARS = set(u"\\_u;0123456789") def native_to_unicode_py2(s): From d79d67ac04df89a2a161cf082083a56eb0aa33dd Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Sun, 30 Jul 2017 22:37:10 -0700 Subject: [PATCH 04/17] Correct generator, refactor T2TModel.model_fn to return a dict of losses. 
PiperOrigin-RevId: 163653918 --- tensor2tensor/data_generators/text_encoder.py | 2 +- tensor2tensor/data_generators/wmt.py | 5 +- tensor2tensor/models/bluenet_test.py | 2 +- tensor2tensor/models/bytenet_test.py | 2 +- tensor2tensor/models/common_hparams.py | 4 +- tensor2tensor/models/gene_expression_test.py | 4 +- tensor2tensor/models/lstm_test.py | 4 +- tensor2tensor/models/multimodel_test.py | 2 +- tensor2tensor/models/neural_gpu_test.py | 2 +- tensor2tensor/models/slicenet_test.py | 2 +- tensor2tensor/models/transformer.py | 16 +++--- tensor2tensor/models/transformer_test.py | 2 +- tensor2tensor/models/xception_test.py | 2 +- tensor2tensor/utils/t2t_model.py | 21 +++---- tensor2tensor/utils/trainer_utils.py | 55 +++++++++++-------- 15 files changed, 66 insertions(+), 59 deletions(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 21215472d..cd6ca0eea 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -427,7 +427,7 @@ def bisect(min_val, max_val): token_counts, present_count, num_iterations) # If min_val == max_val, we can't do any better than this. - if subtokenizer.vocab_size == target_size or min_val == max_val: + if subtokenizer.vocab_size == target_size or min_val >= max_val: return subtokenizer if subtokenizer.vocab_size > target_size: diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 456f36321..4975971c6 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -271,7 +271,7 @@ def bi_vocabs_token_generator(source_path, # English-Czech datasets _ENCS_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long + "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") ], @@ -336,8 +336,7 @@ def _compile_data(tmp_dir, datasets, filename): lang1_filepath = os.path.join(tmp_dir, lang1_filename) lang2_filepath = os.path.join(tmp_dir, lang2_filename) - if not os.path.exists(compressed_filepath): - generator_utils.maybe_download(tmp_dir, compressed_filename, url) + generator_utils.maybe_download(tmp_dir, compressed_filename, url) if not (os.path.exists(lang1_filepath) and os.path.exists(lang2_filepath)): # For .tar.gz and .tgz files, we read compressed. 
diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index d4ce85b1a..70b8defe9 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -46,7 +46,7 @@ def testBlueNet(self): } model = bluenet.BlueNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 738b84251..536d348e7 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -45,7 +45,7 @@ def testByteNet(self): } model = bytenet.ByteNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index e36b2e4e1..cf58b33e8 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -88,10 +88,10 @@ def basic_params1(): # modality, add an entry to this semicolon-separated string. Entries are # formatted "feature_name:modality_type:modality_name", e.g. # "inputs:image:small_image_modality;other_inputs:audio:identity". - input_modalities="", + input_modalities="default", # We don't use empty string in params. # To override the default target modality, specify # "modality_type:modality_name", e.g. "image:small_image_modality". - target_modality="") + target_modality="default") class RangedHParams(object): diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index bec5268fd..a43eda97a 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -55,8 +55,8 @@ def _testModel(self, hparams, model_cls): "targets": tf.constant(targets, dtype=tf.float32), } p_hparams, = hparams.problems - sharded_logits, _, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, - p_hparams).model_fn(features) + sharded_logits, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams).model_fn(features) logits = tf.concat(sharded_logits, 0) with self.test_session() as sess: diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 1e542a666..31380afa5 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -46,7 +46,7 @@ def testLSTMSeq2Seq(self): } model = lstm.LSTMSeq2Seq( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) @@ -70,7 +70,7 @@ def testLSTMSeq2SeqAttention(self): } model = lstm.LSTMSeq2SeqAttention( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 03990594b..cf109beb4 100644 --- a/tensor2tensor/models/multimodel_test.py 
+++ b/tensor2tensor/models/multimodel_test.py @@ -45,7 +45,7 @@ def testMultiModel(self): } model = multimodel.MultiModel( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 3d1cc0562..46c01f403 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -52,7 +52,7 @@ def testNeuralGPU(self): } model = neural_gpu.NeuralGPU( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - shadred_logits, _, _ = model.model_fn(features) + shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 692799571..54b57a9f2 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -45,7 +45,7 @@ def testSliceNet(self): } model = slicenet.SliceNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c45e88577..1d4ee04ed 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -23,8 +23,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,8 +41,7 @@ class Transformer(t2t_model.T2TModel): """Attention net. See file docstring.""" def model_fn_body(self, features): - # Remove dropout if not training - hparams = copy.copy(self._hparams) + hparams = self._hparams targets = features["targets"] inputs = features["inputs"] target_space = features["target_space_id"] @@ -541,13 +538,16 @@ def transformer_parameter_attention_b(): return hparams -@registry.register_ranged_hparams("transformer_big_single_gpu") -def transformer_range1(rhp): +@registry.register_ranged_hparams("transformer_base") +def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_big_single_gpu() + hparams = transformer_base() common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - + # After starting from base, set intervals for some parameters. 
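+  # (Each automated tuning trial then samples one value per parameter from
+  # the ranges below.)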
rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) + rhp.set_discrete("learning_rate_warmup_steps", + [1000, 2000, 4000, 8000, 16000]) rhp.set_float("initializer_gain", 0.5, 2.0) + rhp.set_float("optimizer_adam_beta2", 0.85, 0.95) rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) rhp.set_float("weight_decay", 0.0, 2.0) diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index a7f1fc9ae..8f4d26339 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -51,7 +51,7 @@ def _testTransformer(self, net): "target_space_id": tf.constant(1, dtype=tf.int32), } model = net(hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - shadred_logits, _, _ = model.model_fn(features) + shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index bf434aeac..776d1306a 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -45,7 +45,7 @@ def testXception(self): } model = xception.Xception( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index f67cc9540..835a60259 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -104,13 +104,14 @@ def _create_modalities(self, problem_hparams, hparams): input_modality_overrides = {} for override_str in hparams.input_modalities.split(";"): - parts = override_str.split(":") - feature_name = parts[0] - modality_name = ":".join(parts[1:]) - input_modality_overrides[feature_name] = modality_name + if override_str != "default": + parts = override_str.split(":") + feature_name = parts[0] + modality_name = ":".join(parts[1:]) + input_modality_overrides[feature_name] = modality_name target_modality_name = None - if hparams.target_modality: + if hparams.target_modality and hparams.target_modality != "default": target_modality_name = hparams.target_modality input_modality = {} @@ -206,7 +207,7 @@ def symbols_to_logits_fn(ids): features["targets"] = ids self._coverage = None - sharded_logits, _, _ = self.model_fn( + sharded_logits, _ = self.model_fn( features, False, last_position_only=last_position_only) # now self._coverage is a coverage tensor for the first datashard. # it has shape [batch_size] and contains floats between 0 and @@ -330,7 +331,7 @@ def sample(self, features, last_position_only=False): Returns: samples: an integer `Tensor`. """ - sharded_logits, _, _ = self.model_fn( + sharded_logits, _ = self.model_fn( features, False, last_position_only=last_position_only) if self._hparams.sampling_method == "argmax": sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) @@ -362,7 +363,7 @@ def _shard_features(self, features): # pylint: disable=missing-docstring return sharded_features def model_fn(self, features, skip=False, last_position_only=False): - """Computes the entire model and produces sharded logits and training loss. + """Computes the entire model and produces sharded logits and losses. Args: features: A dictionary of feature name to tensor. 
@@ -372,7 +373,7 @@ def model_fn(self, features, skip=False, last_position_only=False): Returns: sharded_logits: a list of `Tensor`s, one per datashard. - training_loss: a floating point `Scalar`. + losses: a dictionary: {loss-name (string): floating point `Scalar`}. """ start_time = time.time() dp = self._data_parallelism @@ -447,7 +448,7 @@ def model_fn(self, features, skip=False, last_position_only=False): training_loss = None tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) - return sharded_logits, training_loss, extra_loss + return sharded_logits, {"training": training_loss, "extra": extra_loss} def model_fn_body_sharded(self, sharded_features): """Mixture-of-experts models will override this function. diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 91a638529..9900da459 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -228,6 +228,24 @@ def log_registry(): sys.exit(0) +def add_problem_hparams(hparams, problems): + """Add problem hparams for the problems.""" + hparams.problems = [] + hparams.problem_instances = [] + for problem_name in problems.split("-"): + try: + problem = registry.problem(problem_name) + p_hparams = problem.internal_hparams(hparams) + except ValueError: + problem = None + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + + hparams.problem_instances.append(problem) + hparams.problems.append(p_hparams) + + return hparams + + def create_hparams(params_id, data_dir): """Returns hyperparameters, including any flag value overrides. @@ -248,21 +266,7 @@ def create_hparams(params_id, data_dir): if FLAGS.hparams: hparams = hparams.parse(FLAGS.hparams) - # Add hparams for the problems - hparams.problems = [] - hparams.problem_instances = [] - for problem_name in FLAGS.problems.split("-"): - try: - problem = registry.problem(problem_name) - p_hparams = problem.internal_hparams(hparams) - except ValueError: - problem = None - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) - - hparams.problem_instances.append(problem) - hparams.problems.append(p_hparams) - - return hparams + return add_problem_hparams(hparams, FLAGS.problems) def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): @@ -469,21 +473,24 @@ def nth_model(n): # On worker 0 also build graph for problems <= 1. # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. 
skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, training_loss, extra_loss = model_class.model_fn( + sharded_logits, losses_dict = model_class.model_fn( features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): - loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) - o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n) - o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1) + total_loss, ops = 0.0, [] + for loss_key, loss_value in losses_dict.iteritems(): + loss_moving_avg = tf.get_variable("problem_%d/%s_loss" + % (n, loss_key)) + ops.append(loss_moving_avg.assign( + loss_moving_avg * 0.9 + loss_value * 0.1)) + total_loss += loss_value loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - total_loss = training_loss + extra_loss - o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1) + ops.append(loss_moving_avg.assign( + loss_moving_avg * 0.9 + total_loss * 0.1)) with tf.variable_scope("train_stats"): # Count steps for this problem. problem_steps = tf.get_variable( "problem_%d_steps" % n, initializer=0, trainable=False) - o4 = problem_steps.assign_add(1) - with tf.control_dependencies([o1, o2, o3, o4]): # Make sure the ops run. + ops.append(problem_steps.assign_add(1)) + with tf.control_dependencies(ops): # Make sure the ops run. # Ensure the loss is a scalar here. total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") return [total_loss] + sharded_logits # Need to flatten for cond later. From 401903019f896d4f8cb31f39d2f492e09cb5d0dd Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 31 Jul 2017 11:03:43 -0700 Subject: [PATCH 05/17] More corrections to WMT data-sets. 
PiperOrigin-RevId: 163717751
---
 tensor2tensor/data_generators/wmt.py | 46 ++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py
index 4975971c6..c9b43d507 100644
--- a/tensor2tensor/data_generators/wmt.py
+++ b/tensor2tensor/data_generators/wmt.py
@@ -193,9 +193,9 @@ def bi_vocabs_token_generator(source_path,
 
 _ENDE_TRAIN_DATASETS = [
     [
-        "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz",  # pylint: disable=line-too-long
-        ("training-parallel-nc-v11/news-commentary-v11.de-en.en",
-         "training-parallel-nc-v11/news-commentary-v11.de-en.de")
+        "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz",  # pylint: disable=line-too-long
+        ("training/news-commentary-v12.de-en.en",
+         "training/news-commentary-v12.de-en.de")
     ],
     [
         "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
@@ -250,7 +250,7 @@
 
 _ZHEN_TEST_DATASETS = [[
     "http://data.statmt.org/wmt17/translation-task/dev.tgz",
-    ("dev/newsdev2017-zhen-src.zh", "dev/newsdev2017-zhen-ref.en")
+    ("dev/newsdev2017-zhen-src.zh.sgm", "dev/newsdev2017-zhen-ref.en.sgm")
 ]]
 
 # For Macedonian-English the SETimes corpus
@@ -271,9 +271,9 @@
 # English-Czech datasets
 _ENCS_TRAIN_DATASETS = [
     [
-        "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz",  # pylint: disable=line-too-long
-        ("training-parallel-nc-v11/news-commentary-v11.cs-en.en",
-         "training-parallel-nc-v11/news-commentary-v11.cs-en.cs")
+        "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz",  # pylint: disable=line-too-long
+        ("training/news-commentary-v12.cs-en.en",
+         "training/news-commentary-v12.cs-en.cs")
     ],
     [
         "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz",
@@ -322,6 +322,23 @@ def ende_bpe_token_generator(data_dir, tmp_dir, train):
       EOS)
 
 
+def _preprocess_sgm(line, is_sgm):
+  """Preprocessing to strip tags in SGM files."""
+  if not is_sgm:
+    return line
+  # In SGM files, remove <srcset ...>, <p>, <doc ...> lines.
+  if line.startswith("<srcset") or line.startswith("</srcset"):
+    return ""
+  if line.startswith("<doc") or line.startswith("</doc"):
+    return ""
+  if line.startswith("<p>") or line.startswith("</p>"):
+    return ""
+  # Strip <seg> tags.
+  if line.startswith("<seg") and line.endswith("</seg>"):
+    i = line.index(">")
+    return line[i+1:-6]  # Strip first <seg ...> and last </seg>.
+
+
 def _compile_data(tmp_dir, datasets, filename):
   """Concatenate all `datasets` and save to `filename`."""
   filename = os.path.join(tmp_dir, filename)
@@ -335,6 +352,8 @@ def _compile_data(tmp_dir, datasets, filename):
     lang1_filename, lang2_filename = dataset[1]
     lang1_filepath = os.path.join(tmp_dir, lang1_filename)
     lang2_filepath = os.path.join(tmp_dir, lang2_filename)
+    is_sgm = (lang1_filename.endswith("sgm") and
+              lang2_filename.endswith("sgm"))
 
     generator_utils.maybe_download(tmp_dir, compressed_filename, url)
     if not (os.path.exists(lang1_filepath) and
@@ -355,8 +374,11 @@ def _compile_data(tmp_dir, datasets, filename):
       with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
         line1, line2 = lang1_file.readline(), lang2_file.readline()
         while line1 or line2:
-          lang1_resfile.write(line1.strip() + "\n")
-          lang2_resfile.write(line2.strip() + "\n")
+          line1res = _preprocess_sgm(line1, is_sgm)
+          line2res = _preprocess_sgm(line2, is_sgm)
+          if line1res or line2res:
+            lang1_resfile.write(line1res.strip() + "\n")
+            lang2_resfile.write(line2res.strip() + "\n")
           line1, line2 = lang1_file.readline(), lang2_file.readline()
 
   return filename
@@ -433,8 +455,8 @@ def train_generator(self, data_dir, tmp_dir, train):
     source_vocab_size = self.targeted_vocab_size
     target_vocab_size = self.targeted_vocab_size
     datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
-    source_datasets = [[item[0], [item[1][0]]] for item in datasets]
-    target_datasets = [[item[0], [item[1][1]]] for item in datasets]
+    source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS]
+    target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size,
         source_vocab_size, source_datasets)
@@ -573,7 +595,7 @@ class WMTEnCsTokens32k(WMTProblem):
   """Problem spec for WMT English-Czech translation."""
 
   @property
-  def target_vocab_size(self):
+  def targeted_vocab_size(self):
     return 2**15  # 32768
 
   @property

From cd006ff6a7b5688e2c2a527d28b2d62afe5d97e9 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Mon, 31 Jul 2017 13:28:44 -0700
Subject: [PATCH 06/17] Deep-copy the model hparams between modes to eliminate
 side-effects

PiperOrigin-RevId: 163739337
---
 tensor2tensor/utils/t2t_model.py     |  8 -----
 tensor2tensor/utils/trainer_utils.py | 46 ++++++++++++++++++----------
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 835a60259..95774dabc 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -28,7 +28,6 @@
 
 from tensor2tensor.utils import beam_search
 from tensor2tensor.utils import expert_utils as eu
-from tensor2tensor.utils import modality
 from tensor2tensor.utils import registry
 
 import tensorflow as tf
@@ -116,11 +115,6 @@ def _create_modalities(self, problem_hparams, hparams):
 
     input_modality = {}
     for f, modality_spec in six.iteritems(problem_hparams.input_modality):
-      if isinstance(modality_spec, modality.Modality):
-        # This function has been previously run (e.g. for training and now is
-        # being called for eval) and the modalities have already been
-        # constructed. Return.
- return if f in input_modality_overrides: _warn_changed_modality_type(input_modality_overrides[f], modality_spec[0], f) @@ -129,8 +123,6 @@ def _create_modalities(self, problem_hparams, hparams): problem_hparams.input_modality = input_modality target_modality_spec = problem_hparams.target_modality - if isinstance(target_modality_spec, modality.Modality): - return if target_modality_name: _warn_changed_modality_type(target_modality_name, target_modality_spec[0], "target") diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 9900da459..ef1d1dcb3 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function +import copy import math import operator import os @@ -415,11 +416,22 @@ def model_fn(features, targets, mode): Returns: A tuple consisting of the prediction, loss, and train_op. """ + # Deep-copy the model hparams between modes to eliminate + # side-effects caused by abuse of the linked problem_hparams + # objects which are used to share modality objects between + # problems. We do not want to share the modality objects between + # modes, since the modality objects may decide to do something + # mode-specific. A better fix would be to stop abusing the + # hparams in this way and instead use a separate dictionary to + # share the modality objects between problems. This dictionary + # could be created once per mode and passed to the constructor of + # t2t_model. + my_hp = copy.deepcopy(hparams) if mode == tf.contrib.learn.ModeKeys.INFER: if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, hparams) + features = _interactive_input_tensor_to_features_dict(features, my_hp) elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, hparams) + features = _decode_input_tensor_to_features_dict(features, my_hp) # A dictionary containing: # - problem_choice: A Tensor containing an integer indicating which problem # was selected for this run. @@ -451,9 +463,9 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, + my_hp, mode, - hparams.problems[n], + my_hp.problems[n], n, dp, _ps_devices(all_workers=True)) @@ -467,8 +479,8 @@ def nth_model(n): alpha=FLAGS.decode_alpha, decode_length=FLAGS.decode_extra_length) # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = hparams.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(hparams.problems) + skipping_is_on = my_hp.problem_choice == "distributed" and train + problem_worker_id = FLAGS.worker_id % len(my_hp.problems) skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id # On worker 0 also build graph for problems <= 1. # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. @@ -496,7 +508,7 @@ def nth_model(n): return [total_loss] + sharded_logits # Need to flatten for cond later. result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(hparams.problems) - 1) + len(my_hp.problems) - 1) if mode == tf.contrib.learn.ModeKeys.INFER: # Beam search in sequence model returns both decodes withe key "outputs" @@ -532,11 +544,11 @@ def nth_model(n): # Some training statistics. 
with tf.name_scope("training_stats"): - learning_rate = hparams.learning_rate * learning_rate_decay() + learning_rate = my_hp.learning_rate * learning_rate_decay() learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) tf.summary.scalar("learning_rate", learning_rate) global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(hparams.problems)): + for n in xrange(len(my_hp.problems)): with tf.variable_scope("losses_avg", reuse=True): total_loss_var = tf.get_variable("problem_%d/total_loss" % n) training_loss_var = tf.get_variable("problem_%d/training_loss" % n) @@ -558,27 +570,27 @@ def nth_model(n): tf.logging.info("Weight %s\tshape %s\tsize %d", v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) total_size += v_size - if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: # Add weight regularization if set and the weight is not a bias (dim>1). with tf.device(v._ref().device): # pylint: disable=protected-access v_loss = tf.nn.l2_loss(v) / v_size weight_decay_loss += v_loss is_body = len(v_name) > 5 and v_name[:5] == "body/" - if hparams.weight_noise > 0.0 and is_body: - # Add weight noise if set in hparams. + if my_hp.weight_noise > 0.0 and is_body: + # Add weight noise if set in my_hp. with tf.device(v._ref().device): # pylint: disable=protected-access scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale + noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale noise_op = v.assign_add(noise) with tf.control_dependencies([noise_op]): total_loss = tf.identity(total_loss) tf.logging.info("Total trainable variables size: %d", total_size) - if hparams.weight_decay > 0.0: - total_loss += weight_decay_loss * hparams.weight_decay + if my_hp.weight_decay > 0.0: + total_loss += weight_decay_loss * my_hp.weight_decay total_loss = tf.identity(total_loss, name="total_loss") # Define the train_op for the TRAIN mode. - opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) + opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) tf.logging.info("Computing gradients for global model_fn.") opt_summaries = ["learning_rate", "loss"] if hparams.summarize_grads: @@ -588,7 +600,7 @@ def nth_model(n): loss=total_loss, global_step=tf.contrib.framework.get_global_step(), learning_rate=learning_rate, - clip_gradients=hparams.clip_grad_norm or None, + clip_gradients=my_hp.clip_grad_norm or None, gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=opt, summaries=opt_summaries, From 474843d7d2f2c2a080e49bd163343698a4dbd69e Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 08:45:13 -0700 Subject: [PATCH 07/17] Small typo correction, "handles" -> "handled". PiperOrigin-RevId: 163833578 --- tensor2tensor/data_generators/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md index 310bc39df..0e6d64dd2 100644 --- a/tensor2tensor/data_generators/README.md +++ b/tensor2tensor/data_generators/README.md @@ -28,7 +28,7 @@ for an example. `Problem`s support data generation, training, and decoding. -Data generation is handles by `Problem.generate_data` which should produce 2 +Data generation is handled by `Problem.generate_data` which should produce 2 datasets, training and dev, which should be named according to `Problem.training_filepaths` and `Problem.dev_filepaths`. 
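As a rough sketch of the expected wiring (mirroring the `generate_dataset_and_shuffle` pattern used by the `Problem` subclasses elsewhere in this patch series; the shard counts here are arbitrary):

    def generate_data(self, data_dir, tmp_dir, task_id=-1):
      # Write shuffled train and dev shards under the canonical file names.
      generator_utils.generate_dataset_and_shuffle(
          self.generator(data_dir, tmp_dir, True),
          self.training_filepaths(data_dir, 10, shuffled=False),
          self.generator(data_dir, tmp_dir, False),
          self.dev_filepaths(data_dir, 1, shuffled=False))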
`Problem.generate_data` should also produce any other files that may be required From 9b1740227e3e9fe53ad69aab2530875875aed190 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 10:15:33 -0700 Subject: [PATCH 08/17] Freeing character level WMTProblems from needing a targeted_vocab_size. PiperOrigin-RevId: 163846429 --- tensor2tensor/data_generators/problem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 9623791f5..339703676 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -386,12 +386,13 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): - vocab_filename = os.path.join(data_dir, self.vocab_file) if self.is_character_level: encoder = text_encoder.ByteTextEncoder(), elif self.use_subword_tokenizer: + vocab_filename = os.path.join(data_dir, self.vocab_file) encoder = text_encoder.SubwordTextEncoder(vocab_filename) else: + vocab_filename = os.path.join(data_dir, self.vocab_file) encoder = text_encoder.TokenTextEncoder(vocab_filename) if self.has_inputs: return {"inputs": encoder, "targets": encoder} From 2ba6a56c9f3514bea851b8d1b3cec4131e1f113e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 1 Aug 2017 13:03:54 -0700 Subject: [PATCH 09/17] Get vocab size from encoder for Text2Text chr Problems PiperOrigin-RevId: 163873675 --- tensor2tensor/data_generators/problem.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 339703676..37d3ed4da 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -400,17 +400,12 @@ def feature_encoders(self, data_dir): def hparams(self, defaults, unused_model_hparams): p = defaults - if self.is_character_level: - source_vocab_size = 256 - target_vocab_size = 256 - else: - target_vocab_size = self._encoders["targets"].vocab_size - if self.has_inputs: - source_vocab_size = self._encoders["inputs"].vocab_size if self.has_inputs: + source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + target_vocab_size = self._encoders["targets"].vocab_size p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) if self.has_inputs: p.input_space_id = self.input_space_id From 1e8ed5a1267040882a361888bfd911f7fa835617 Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 1 Aug 2017 14:03:55 -0700 Subject: [PATCH 10/17] Add option for proximity bias. Hypothesis: this should help with training of PiperOrigin-RevId: 163882942 --- tensor2tensor/models/common_attention.py | 14 +++++++++ tensor2tensor/models/transformer.py | 36 +++++++++++++++--------- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 95e982790..2c7e8afc9 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -205,6 +205,20 @@ def attention_bias_ignore_padding(memory_padding): return tf.expand_dims(tf.expand_dims(ret, 1), 1) +def attention_bias_proximal(length): + """Bias for self-attention to encourage attention to close positions. + + Args: + length: an integer scalar. 
+ + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = tf.to_float(tf.range(length)) + diff = tf.expand_dims(r, 0) - tf.expand_dims(r, 1) + return tf.expand_dims(tf.expand_dims(-tf.log(1 + tf.abs(diff)), 0), 0) + + def split_last_dimension(x, n): """Reshape x so that the last dimension becomes two dimensions. diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 1d4ee04ed..c9d0a2db2 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -49,8 +49,10 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, encoder_attention_bias, _) = (transformer_prepare_encoder( - inputs, target_space, hparams)) + (encoder_input, + encoder_self_attention_bias, + encoder_decoder_attention_bias) = ( + transformer_prepare_encoder(inputs, target_space, hparams)) (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) @@ -61,11 +63,11 @@ def residual_fn(x, y): encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) encoder_output = transformer_encoder(encoder_input, residual_fn, - encoder_attention_bias, hparams) + encoder_self_attention_bias, hparams) decoder_output = transformer_decoder( decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, - encoder_attention_bias, hparams) + encoder_decoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output @@ -81,17 +83,20 @@ def transformer_prepare_encoder(inputs, target_space, hparams): Returns: encoder_input: a Tensor, bottom of encoder stack - encoder_self_attention_bias: a Tensor, containing large negative values - to implement masked attention and possibly baises for diagonal - alignments - encoder_padding: a Tensor + encoder_self_attention_bias: a bias tensor for use in encoder self-attention + encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder + attention """ - # Flatten inputs. ishape_static = inputs.shape.as_list() encoder_input = inputs encoder_padding = common_attention.embedding_to_padding(encoder_input) - encoder_self_attention_bias = common_attention.attention_bias_ignore_padding( + ignore_padding = common_attention.attention_bias_ignore_padding( encoder_padding) + encoder_self_attention_bias = ignore_padding + encoder_decoder_attention_bias = ignore_padding + if hparams.proximity_bias: + encoder_self_attention_bias += common_attention.attention_bias_proximal( + tf.shape(inputs)[1]) # Append target_space_id embedding to inputs. 
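As a quick numeric check of `attention_bias_proximal` (NumPy is used here to mirror the TF ops; values rounded):

    import numpy as np

    r = np.arange(3, dtype=np.float32)
    diff = r[None, :] - r[:, None]
    bias = -np.log(1 + np.abs(diff))
    # bias ~ [[ 0.   , -0.693, -1.099],
    #         [-0.693,  0.   , -0.693],
    #         [-1.099, -0.693,  0.   ]]
    # Zero on the diagonal and increasingly negative with distance; added to
    # the pre-softmax logits, it nudges attention toward nearby positions.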
 emb_target_space = common_layers.embedding(
       target_space, 32, ishape_static[-1], name="target_space_embedding")
@@ -99,7 +104,9 @@ def transformer_prepare_encoder(inputs, target_space, hparams):
   encoder_input += emb_target_space
   if hparams.pos == "timing":
     encoder_input = common_attention.add_timing_signal_1d(encoder_input)
-  return (encoder_input, encoder_self_attention_bias, encoder_padding)
+  return (encoder_input,
+          encoder_self_attention_bias,
+          encoder_decoder_attention_bias)


 def transformer_prepare_decoder(targets, hparams):
@@ -111,11 +118,13 @@ def transformer_prepare_decoder(targets, hparams):

   Returns:
     decoder_input: a Tensor, bottom of decoder stack
-    decoder_self_attention_bias: a Tensor, containing large negative values
-    to implement masked attention and possibly baises for diagonal alignments
+    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
   """
   decoder_self_attention_bias = (
       common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  if hparams.proximity_bias:
+    decoder_self_attention_bias += common_attention.attention_bias_proximal(
+        tf.shape(targets)[1])
   decoder_input = common_layers.shift_left_3d(targets)
   if hparams.pos == "timing":
     decoder_input = common_attention.add_timing_signal_1d(decoder_input)
@@ -292,6 +301,7 @@ def transformer_base():
   hparams.add_hparam("residual_dropout", 0.1)
   hparams.add_hparam("pos", "timing")  # timing, none
   hparams.add_hparam("nbr_decoder_problems", 1)
+  hparams.add_hparam("proximity_bias", int(False))
   return hparams


From 36b1c59ae9ce8ce563d8bf9d8486c82ab874d6a6 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Tue, 1 Aug 2017 14:21:19 -0700
Subject: [PATCH 11/17] Make tokenizer just log a warning on malformed vocab file lines and fix wrong flags in text_encoder_build_subword.py.
PiperOrigin-RevId: 163885555 --- tensor2tensor/data_generators/test_data/vocab-2.txt | 1 + .../data_generators/text_encoder_build_subword.py | 4 ++-- tensor2tensor/data_generators/tokenizer.py | 7 ++++++- tensor2tensor/data_generators/tokenizer_test.py | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/data_generators/test_data/vocab-2.txt b/tensor2tensor/data_generators/test_data/vocab-2.txt index 7793af4f6..1ad6d20b9 100644 --- a/tensor2tensor/data_generators/test_data/vocab-2.txt +++ b/tensor2tensor/data_generators/test_data/vocab-2.txt @@ -1,3 +1,4 @@ kattywampus,11 +kaput balderdash,10 jiggery-pokery,14 diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index 47e82a176..0c366c896 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -24,7 +24,7 @@ python data_generators/text_encoder_build_subword.py \ --corpus_filepattern=$DATA_DIR/my_problem-train-* \ --corpus_max_lines=12345 \ - --output_fn=$DATA_DIR/my_problem.subword_text_encoder \ + --output_filename=$DATA_DIR/my_problem.subword_text_encoder \ --logtostderr """ @@ -75,7 +75,7 @@ def main(unused_argv): encoder = text_encoder.SubwordTextEncoder() encoder.build_from_token_counts(token_counts, FLAGS.min_count, FLAGS.num_iterations) - encoder.store_to_file(FLAGS.output_fn) + encoder.store_to_file(FLAGS.output_filename) if __name__ == '__main__': diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 0e8daa75f..5cfd7c42e 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -185,7 +185,12 @@ def vocab_token_counts(text_filepattern, max_lines): a dictionary mapping token to count. 
""" ret = {} - for line in _read_filepattern(text_filepattern, max_lines=max_lines): + for i, line in enumerate( + _read_filepattern(text_filepattern, max_lines=max_lines)): + if "," not in line: + tf.logging.warning("Malformed vocab line #%d '%s'", i, line) + continue + token, count = line.rsplit(",", 1) ret[_native_to_unicode(token)] = int(count) diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 0c299bd0b..065a32e91 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -130,7 +130,7 @@ def test_vocab_token_counts(self): def test_vocab_token_counts_with_max_lines(self): # vocab-1 has 2 lines, vocab-2 has 3 - token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4) + token_counts = tokenizer.vocab_token_counts(self.vocab_path, 5) expected = { u"lollipop": 8, From 23f53cb3aeaf13320b58f2c67717b8a48e3977ef Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 1 Aug 2017 17:07:05 -0700 Subject: [PATCH 12/17] Simplify input_pipeline PiperOrigin-RevId: 163910099 --- tensor2tensor/utils/data_reader.py | 60 ++++++++++-------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index ba5139433..567f186d5 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -179,30 +179,6 @@ def preprocess(img): return examples -def problem_input_pipeline(problem, data_file_pattern, capacity, mode): - """Input pipeline for Problems.""" - data_fields, data_items_to_decoders = problem.example_reading_spec() - - # Create placeholders for input, rather than reading data from disk. - if data_file_pattern is None: - return feature_placeholders(data_fields) - - # Now the non-trivial case construction. - examples = examples_reader( - [data_file_pattern], - data_fields, - training=(mode == tf.contrib.learn.ModeKeys.TRAIN), - capacity=capacity, - data_items_to_decoders=data_items_to_decoders) - - examples = problem.preprocess_examples(examples, mode) - - # We do not want int64s as they are not supported on GPUs. - examples = cast_int64_to_int32(examples) - - return examples - - def cast_int64_to_int32(features): f = {} for k, v in six.iteritems(features): @@ -221,19 +197,10 @@ def feature_placeholders(data_fields): return feature_map -def input_pipeline(problem, data_file_pattern, capacity, mode): - """Input pipeline, returns a dictionary of tensors from queues.""" - - if problem is not None: - # problem is not None when the problem is specified with the Problem API, - # which handles Example decoding and preprocessing. - # Otherwise the problem is specified in problem_hparams and is dealt with - # below. - # As problems are ported to the Problem API, the special handling here will - # need to be moved to Problem.example_reading_spec and - # Problem.preprocessing. - return problem_input_pipeline(problem, data_file_pattern, capacity, mode) - +def default_example_reading_spec(data_file_pattern): + """Example reading spec for problem_hparams problems.""" + # This function is for problems that have yet to be ported to the new Problem + # API. Do not add here. data_items_to_decoders = None # Read from image TFRecords if the file has "image" in its name. 
if data_file_pattern and "image" in data_file_pattern: @@ -267,12 +234,21 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) } + return data_fields, data_items_to_decoders + + +def input_pipeline(problem, data_file_pattern, capacity, mode): + """Input pipeline, returns a dictionary of tensors from queues.""" + if problem is None: + data_fields, data_items_to_decoders = default_example_reading_spec( + data_file_pattern) + else: + data_fields, data_items_to_decoders = problem.example_reading_spec() - # Create placeholders for input, rather than reading data from disk. if data_file_pattern is None: + # Create placeholders for input, rather than reading data from disk. return feature_placeholders(data_fields) - # Now the non-trivial case construction. examples = examples_reader( [data_file_pattern], data_fields, @@ -280,10 +256,14 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): capacity=capacity, data_items_to_decoders=data_items_to_decoders) - examples = preprocessing(examples, data_file_pattern, mode) + if problem is None: + examples = preprocessing(examples, data_file_pattern, mode) + else: + examples = problem.preprocess_examples(examples, mode) # We do not want int64s as they are not supported on GPUs. examples = cast_int64_to_int32(examples) + return examples From 768ef743e6198ad1a6226da816ff8f35bda4df7e Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 17:20:53 -0700 Subject: [PATCH 13/17] Removed bogus comma from feature_encoders. PiperOrigin-RevId: 163911814 --- tensor2tensor/data_generators/problem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 37d3ed4da..577e051d3 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -387,7 +387,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): def feature_encoders(self, data_dir): if self.is_character_level: - encoder = text_encoder.ByteTextEncoder(), + encoder = text_encoder.ByteTextEncoder() elif self.use_subword_tokenizer: vocab_filename = os.path.join(data_dir, self.vocab_file) encoder = text_encoder.SubwordTextEncoder(vocab_filename) From 6b54dfbddec8f2408748fc72344a0d6ea0b63954 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Tue, 1 Aug 2017 17:57:53 -0700 Subject: [PATCH 14/17] Add functionality to download from google drive. Enable that option for celeba. PiperOrigin-RevId: 163915517 --- .../data_generators/generator_utils.py | 51 +++++++++++++++++++ .../data_generators/generator_utils_test.py | 14 +++++ tensor2tensor/data_generators/image.py | 14 ++--- 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 5c7f9f2a1..b38531c1a 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -28,6 +28,7 @@ # Dependency imports +import requests import six from six.moves import xrange # pylint: disable=redefined-builtin import six.moves.urllib_request as urllib # Imports urllib on Python2, urllib.request on Python3 @@ -196,6 +197,56 @@ def maybe_download(directory, filename, url): return filepath +def maybe_download_from_drive(directory, filename, url): + """Download filename from google drive unless it's already in directory. 
+
+  Args:
+    directory: path to the directory that will be used.
+    filename: name of the file to download to (do nothing if it already exists).
+    url: URL to download from.
+
+  Returns:
+    The path to the downloaded file.
+  """
+  if not tf.gfile.Exists(directory):
+    tf.logging.info("Creating directory %s" % directory)
+    os.mkdir(directory)
+  filepath = os.path.join(directory, filename)
+  if tf.gfile.Exists(filepath):
+    tf.logging.info("Not downloading, file already found: %s" % filepath)
+    return filepath
+
+  # Since the file is big, drive will scan it for virus and take it to a
+  # warning page. We find the confirm token on this page and append it to the
+  # URL to start the download process.
+  confirm_token = None
+  session = requests.Session()
+  response = session.get(url, stream=True)
+  for k, v in response.cookies.items():
+    if k.startswith("download_warning"):
+      confirm_token = v
+
+  if confirm_token:
+    url = url + "&confirm=" + confirm_token
+  tf.logging.info("Downloading %s to %s" % (url, filepath))
+
+  response = session.get(url, stream=True)
+  # Now begin the download.
+  chunk_size = 16 * 1024
+  with open(filepath, "wb") as f:
+    for chunk in response.iter_content(chunk_size):
+      if chunk:
+        f.write(chunk)
+
+  # Print newline to clear the carriage return from the download progress.
+  print()
+  statinfo = os.stat(filepath)
+  tf.logging.info("Successfully downloaded %s, %s bytes." % (filename,
+                                                             statinfo.st_size))
+  return filepath
+
+
 def gunzip_file(gz_path, new_path):
   """Unzips from gz_path into new_path.

diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py
index fd6e15ca3..144507e6b 100644
--- a/tensor2tensor/data_generators/generator_utils_test.py
+++ b/tensor2tensor/data_generators/generator_utils_test.py
@@ -64,6 +64,20 @@ def testMaybeDownload(self):
     os.remove(tmp_file_path + ".http")
     os.remove(tmp_file_path)

+  def testMaybeDownloadFromDrive(self):
+    tmp_dir = self.get_temp_dir()
+    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
+    tmp_file_name = os.path.basename(tmp_file_path)
+
+    # Download Google index to the temporary file.http.
+    res_path = generator_utils.maybe_download_from_drive(
+        tmp_dir, tmp_file_name + ".http", "http://drive.google.com")
+    self.assertEqual(res_path, tmp_file_path + ".http")
+
+    # Clean up.
+    os.remove(tmp_file_path + ".http")
+    os.remove(tmp_file_path)
+
   def testGunzipFile(self):
     tmp_dir = self.get_temp_dir()
     (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py
index fdad8d432..44e2fda15 100644
--- a/tensor2tensor/data_generators/image.py
+++ b/tensor2tensor/data_generators/image.py
@@ -380,18 +380,18 @@ def example_reading_spec(self):
     return super(ImageFSNS, self).example_reading_spec(self,
                                                        label_key=label_key)

-# Filename for CELEBA data.
+# URL and filename for CELEBA data.
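A hedged usage sketch of the new helper (the URL is the CELEBA link wired up just below, and `tmp_dir` is assumed to exist):

    path = generator_utils.maybe_download_from_drive(
        tmp_dir, "img_align_celeba.zip",
        "https://drive.google.com/uc?export=download&id=0B7EVK8r0v71pZjFTYXZWM3FlRnM")
    # If Drive serves its virus-scan interstitial, the helper picks up the
    # "download_warning" cookie and retries with "&confirm=<token>" appended.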
_CELEBA_NAME = "img_align_celeba" +_CELEBA_URL = "https://drive.google.com/uc?export=download&id=0B7EVK8r0v71pZjFTYXZWM3FlRnM" def _get_celeba(directory): """Download and extract CELEBA to directory unless it is there.""" - path = os.path.join(directory, _CELEBA_NAME) + # path = os.path.join(directory, _CELEBA_NAME) + path = generator_utils.maybe_download_from_drive(directory, + _CELEBA_NAME, _CELEBA_URL) if not tf.gfile.Exists(path): - # We expect that this file has been downloaded from: - # https://drive.google.com/uc?export=download&id=0B7EVK8r0v71pZjFTYXZWM3FlRnM - # and placed in `directory`. - zipfile.ZipFile(path+".zip", "r").extractall(directory) + zipfile.ZipFile(path + ".zip", "r").extractall(directory) def celeba_generator(tmp_dir, how_many, start_from=0): @@ -408,7 +408,7 @@ def celeba_generator(tmp_dir, how_many, start_from=0): * image/format: the string "jpeg" representing image format, """ _get_celeba(tmp_dir) - image_files = tf.gfile.Glob(tmp_dir + "/*.jpg") + image_files = tf.gfile.Glob(os.path.join(tmp_dir, _CELEBA_NAME) + "/*.jpg") for filename in image_files[start_from:start_from+how_many]: with tf.gfile.Open(filename, "r") as f: encoded_image_data = f.read() From 465ae188ea9205f27c92c0bb532a2d8f1b336009 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Tue, 1 Aug 2017 18:01:10 -0700 Subject: [PATCH 15/17] Add an option to truncate long input and target sequences. PiperOrigin-RevId: 163915865 --- tensor2tensor/data_generators/gene_expression.py | 3 ++- tensor2tensor/data_generators/problem.py | 6 +++++- tensor2tensor/models/common_hparams.py | 14 +++++++++++++- tensor2tensor/utils/data_reader.py | 4 ++-- tensor2tensor/utils/trainer_utils.py | 5 +++-- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 60e38a90f..82c15414a 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -163,8 +163,9 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode): + def preprocess_examples(self, examples, mode, hparams): del mode + del hparams # Reshape targets examples["targets"] = tf.reshape(examples["targets"], diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 577e051d3..6f49a8d97 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -162,8 +162,12 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode): + def preprocess_examples(self, examples, mode, hparams): del mode + if hparams.max_input_seq_length > 0: + examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] + if hparams.max_target_seq_length > 0: + examples["targets"] = examples["targets"][:hparams.max_target_seq_length] return examples def eval_metrics(self): diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index cf58b33e8..353586393 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -91,7 +91,19 @@ def basic_params1(): input_modalities="default", # We don't use empty string in params. # To override the default target modality, specify # "modality_type:modality_name", e.g. "image:small_image_modality". 
- target_modality="default") + target_modality="default", + # The maximum length of "input" sequence. + # Sequences longer than this value will be truncated. 0 or negative values + # mean there is no maximum or truncation. + # You can change this behavior by overridding preprocess_examples() method + # in your problem class. + max_input_seq_length=0, + # The maximum length of "target" sequence. + # Sequences longer than this value will be truncated. 0 or negative values + # mean there is no maximum or truncation. + # You can change this behavior by overridding preprocess_examples() method + # in your problem class. + max_target_seq_length=0) class RangedHParams(object): diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 567f186d5..81dcb52a5 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -237,7 +237,7 @@ def default_example_reading_spec(data_file_pattern): return data_fields, data_items_to_decoders -def input_pipeline(problem, data_file_pattern, capacity, mode): +def input_pipeline(problem, data_file_pattern, capacity, mode, hparams): """Input pipeline, returns a dictionary of tensors from queues.""" if problem is None: data_fields, data_items_to_decoders = default_example_reading_spec( @@ -259,7 +259,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): if problem is None: examples = preprocessing(examples, data_file_pattern, mode) else: - examples = problem.preprocess_examples(examples, mode) + examples = problem.preprocess_examples(examples, mode, hparams) # We do not want int64s as they are not supported on GPUs. examples = cast_int64_to_int32(examples) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index ef1d1dcb3..33053806d 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -1115,8 +1115,9 @@ def input_fn(): with tf.device("/cpu:0"): # Input reading on CPU capacity = p_hparams.max_expected_batch_size_per_shard capacity *= num_datashards - examples = data_reader.input_pipeline( - problem_instance, data_file_patterns[n], capacity, mode) + examples = data_reader.input_pipeline(problem_instance, + data_file_patterns[n], + capacity, mode, hparams) feature_map = data_reader.batch_examples( examples, data_reader.hparams_to_batching_scheme( From 75270907dbba3c7c224a08515d33167c29e26aed Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Tue, 1 Aug 2017 18:03:10 -0700 Subject: [PATCH 16/17] More WMT corrections, port image problems to Problem. 
PiperOrigin-RevId: 163916109 --- README.md | 4 +- tensor2tensor/bin/t2t-datagen | 31 -- tensor2tensor/data_generators/image.py | 416 +++++++++++++++--- tensor2tensor/data_generators/problem.py | 5 +- .../data_generators/problem_hparams.py | 91 ---- tensor2tensor/data_generators/wmt.py | 2 +- tensor2tensor/models/bluenet.py | 1 - tensor2tensor/models/common_attention.py | 102 +++-- tensor2tensor/models/common_attention_test.py | 28 ++ tensor2tensor/models/common_layers.py | 71 ++- tensor2tensor/models/lstm.py | 8 +- tensor2tensor/models/lstm_test.py | 4 +- tensor2tensor/models/modalities.py | 8 +- tensor2tensor/models/multimodel.py | 1 - tensor2tensor/models/multimodel_test.py | 9 +- tensor2tensor/models/slicenet.py | 1 - tensor2tensor/models/slicenet_test.py | 10 +- tensor2tensor/models/transformer.py | 8 +- .../models/transformer_alternative.py | 67 +-- tensor2tensor/models/xception.py | 1 - tensor2tensor/utils/data_reader.py | 34 +- tensor2tensor/utils/registry.py | 2 +- tensor2tensor/utils/registry_test.py | 5 +- tensor2tensor/utils/t2t_model.py | 14 +- tensor2tensor/utils/trainer_utils_test.py | 2 +- 25 files changed, 594 insertions(+), 331 deletions(-) diff --git a/README.md b/README.md index edd6460d0..bb0f6f534 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ t2t-datagen \ --problem=$PROBLEM # Train -# * If you run out of memory, add --hparams='batch_size=2048' or even 1024. +# * If you run out of memory, add --hparams='batch_size=1024'. t2t-trainer \ --data_dir=$DATA_DIR \ --problems=$PROBLEM \ @@ -166,7 +166,7 @@ python -c "from tensor2tensor.models.transformer import Transformer" with `Modality` objects, which are specified per-feature in the dataset/task specification. * Support for multi-GPU machines and synchronous (1 master, many workers) and - asynchrounous (independent workers synchronizing through a parameter server) + asynchronous (independent workers synchronizing through a parameter server) [distributed training](https://github.com/tensorflow/tensor2tensor/tree/master/docs/distributed_training.md). * Easily swap amongst datasets and models by command-line flag with the data generation script `t2t-datagen` and the training script `t2t-trainer`. 
diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 1f876c981..837d6d203 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -118,40 +118,9 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: wiki.generator(FLAGS.tmp_dir, True), 1000 ), - "image_mnist_tune": ( - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 55000), - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 5000, 55000)), - "image_mnist_test": ( - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 60000), - lambda: image.mnist_generator(FLAGS.tmp_dir, False, 10000)), - "image_cifar10_tune": ( - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 48000), - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 2000, 48000)), - "image_cifar10_test": ( - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 50000), - lambda: image.cifar10_generator(FLAGS.tmp_dir, False, 10000)), - "image_mscoco_characters_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000)), "image_celeba_tune": ( lambda: image.celeba_generator(FLAGS.tmp_dir, 162770), lambda: image.celeba_generator(FLAGS.tmp_dir, 19867, 162770)), - "image_mscoco_tokens_8k_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13)), - "image_mscoco_tokens_32k_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)), "snli_32k": ( lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index 44e2fda15..a2e328f00 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -36,11 +36,189 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.models import common_layers from tensor2tensor.utils import registry import tensorflow as tf +class ImageProblem(problem.Problem): + + def example_reading_spec(self, label_key=None): + if label_key is None: + label_key = "image/class/label" + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + label_key: tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor(label_key), + } + + return data_fields, data_items_to_decoders + + +# French street names dataset. 
+ + +@registry.register_problem +class ImageFSNS(ImageProblem): + """Problem spec for French Street Name recognition.""" + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" + "street/python/fsns_urls.txt") + fsns_urls = generator_utils.maybe_download( + tmp_dir, "fsns_urls.txt", list_url) + fsns_files = [f.strip() for f in open(fsns_urls, "r") + if f.startswith("http://")] + for url in fsns_files: + if "/train/train" in url: + generator_utils.maybe_download( + data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url) + elif "/validation/validation" in url: + generator_utils.maybe_download( + data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) + elif "charset" in url: + generator_utils.maybe_download( + data_dir, "charset_size134.txt", url) + + def feature_encoders(self, data_dir): + # This vocab file must be present within the data directory. + vocab_filename = os.path.join(data_dir, "charset_size134.txt") + return { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.SubwordTextEncoder(vocab_filename) + } + + def hparams(self, defaults, model_hparams): + p = defaults + p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} + vocab_size = self._encoders["targets"].vocab_size + p.target_modality = (registry.Modalities.SYMBOL, vocab_size) + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = problem.SpaceID.EN_TOK + + def example_reading_spec(self): + label_key = "image/unpadded_label" + return super(ImageFSNS, self).example_reading_spec(self, + label_key=label_key) + + +class Image2ClassProblem(ImageProblem): + """Base class for image classification problems.""" + + @property + def is_small(self): + raise NotImplementedError() + + @property + def num_classes(self): + raise NotImplementedError() + + @property + def train_shards(self): + raise NotImplementedError() + + @property + def dev_shards(self): + return 1 + + def generator(self, data_dir, tmp_dir, is_training): + raise NotImplementedError() + + def hparams(self, defaults, model_hparams): + p = defaults + small_modality = "%s:small_image_modality" % registry.Modalities.IMAGE + modality = small_modality if self.is_small else registry.Modalities.IMAGE + p.input_modality = {"inputs": (modality, None)} + p.target_modality = (registry.Modalities.CLASS_LABEL, self.num_classes) + p.batch_size_multiplier = 4 if self.is_small else 256 + p.max_expected_batch_size_per_shard = 8 if self.is_small else 2 + p.loss_multiplier = 3.0 if self.is_small else 1.0 + if self._was_reversed: + p.loss_multiplier = 1.0 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = problem.SpaceID.IMAGE_LABEL + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.train_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)) + + +def imagenet_preprocess_examples(examples, mode): + """Preprocessing used for Imagenet and similar problems.""" + def preprocess(img): + img = tf.image.resize_images(img, [360, 360]) + img = common_layers.image_augmentation(tf.to_float(img) / 255.) + return tf.to_int64(img * 255.) 
+
+  def resize(img):
+    return tf.to_int64(tf.image.resize_images(img, [299, 299]))
+
+  inputs = tf.cast(examples["inputs"], tf.int64)
+  if mode == tf.contrib.learn.ModeKeys.TRAIN:
+    examples["inputs"] = tf.cond(  # Preprocess 90% of the time.
+        tf.less(tf.random_uniform([]), 0.9),
+        lambda img=inputs: preprocess(img),
+        lambda img=inputs: resize(img))
+  else:
+    examples["inputs"] = resize(inputs)
+  return examples
+
+
+@registry.register_problem
+class ImageImagenet(Image2ClassProblem):
+  """Imagenet."""
+
+  @property
+  def is_small(self):
+    return False
+
+  @property
+  def num_classes(self):
+    return 1000
+
+  def generate_data(self, data_dir, tmp_dir, task_id=-1):
+    # TODO(lukaszkaiser): find a better way than printing this.
+    print("To generate the ImageNet dataset in the proper format, follow "
+          "instructions at https://github.com/tensorflow/models/blob/master"
+          "/inception/README.md#getting-started")
+
+  def preprocess_examples(self, examples, mode):
+    return imagenet_preprocess_examples(examples, mode)
+
+
+@registry.register_problem
+class ImageImagenet32(Image2ClassProblem):
+  """Imagenet rescaled to 32x32."""
+
+  def dataset_filename(self):
+    return "image_imagenet"  # Reuse Imagenet data.
+
+  @property
+  def is_small(self):
+    return True  # Modalities like for CIFAR.
+
+  def preprocess_examples(self, examples, mode):
+    examples = imagenet_preprocess_examples(examples, mode)
+    examples["inputs"] = tf.to_int64(tf.image.resize_images(
+        examples["inputs"], [32, 32]))
+    return examples
+
+
 def image_generator(images, labels):
   """Generator for images that takes image and labels lists and creates pngs.
@@ -158,6 +336,39 @@ def mnist_generator(tmp_dir, training, how_many, start_from=0):
                          labels[start_from:start_from + how_many])


+@registry.register_problem
+class ImageMnistTune(Image2ClassProblem):
+  """MNIST, tuning data."""
+
+  @property
+  def is_small(self):
+    return True
+
+  @property
+  def num_classes(self):
+    return 10
+
+  @property
+  def train_shards(self):
+    return 10
+
+  def generator(self, data_dir, tmp_dir, is_training):
+    if is_training:
+      return mnist_generator(tmp_dir, True, 55000)
+    else:
+      return mnist_generator(tmp_dir, True, 5000, 55000)
+
+
+@registry.register_problem
+class ImageMnist(ImageMnistTune):
+
+  def generator(self, data_dir, tmp_dir, is_training):
+    if is_training:
+      return mnist_generator(tmp_dir, True, 60000)
+    else:
+      return mnist_generator(tmp_dir, False, 10000)
+
+
 # URLs and filenames for CIFAR data.
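With the `Image2ClassProblem` base above, adding a classification dataset comes down to a handful of overrides; everything below except the base class, the registry decorator, and `image_generator` is hypothetical:

    @registry.register_problem
    class ImageMyLabels37(Image2ClassProblem):  # hypothetical example problem
      @property
      def is_small(self):
        return True

      @property
      def num_classes(self):
        return 37

      @property
      def train_shards(self):
        return 10

      def generator(self, data_dir, tmp_dir, is_training):
        images, labels = my_load_fn(tmp_dir, is_training)  # hypothetical loader
        return image_generator(images, labels)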
_CIFAR10_URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" _CIFAR10_PREFIX = "cifar-10-batches-py/" @@ -208,6 +419,39 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): all_labels[start_from:start_from + how_many]) +@registry.register_problem +class ImageCifar10Tune(ImageMnistTune): + + def preprocess_examples(self, examples, mode): + if mode == tf.contrib.learn.ModeKeys.TRAIN: + examples["inputs"] = common_layers.cifar_image_augmentation( + examples["inputs"]) + return examples + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar10_generator(tmp_dir, True, 48000) + else: + return cifar10_generator(tmp_dir, True, 2000, 48000) + + +@registry.register_problem +class ImageCifar10(ImageCifar10Tune): + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar10_generator(tmp_dir, True, 50000) + else: + return cifar10_generator(tmp_dir, False, 10000) + + +@registry.register_problem +class ImageCifar10Plain(ImageCifar10): + + def preprocess_examples(self, examples, mode): + return examples + + # URLs and filenames for MSCOCO data. _MSCOCO_ROOT_URL = "http://msvocds.blob.core.windows.net/" _MSCOCO_URLS = [ @@ -308,77 +552,135 @@ def mscoco_generator(data_dir, } -class ImageProblem(problem.Problem): +class Image2TextProblem(ImageProblem): + """Base class for image-to-text problems.""" - def example_reading_spec(self, label_key=None): - if label_key is None: - label_key = "image/class/label" + @property + def is_character_level(self): + raise NotImplementedError() - data_fields = { - "image/encoded": tf.FixedLenFeature((), tf.string), - "image/format": tf.FixedLenFeature((), tf.string), - label_key: tf.VarLenFeature(tf.int64) - } - data_items_to_decoders = { - "inputs": - tf.contrib.slim.tfexample_decoder.Image( - image_key="image/encoded", - format_key="image/format", - channels=3), - "targets": - tf.contrib.slim.tfexample_decoder.Tensor(label_key), - } + @property + def targeted_vocab_size(self): + raise NotImplementedError() # Not needed if self.is_character_level. - return data_fields, data_items_to_decoders + @property + def target_space_id(self): + raise NotImplementedError() -# French street names dataset. 
+ @property + def train_shards(self): + raise NotImplementedError() + @property + def dev_shards(self): + raise NotImplementedError() -@registry.register_problem -class ImageFSNS(ImageProblem): - """Problem spec for French Street Name recognition.""" + def generator(self, data_dir, tmp_dir, is_training): + raise NotImplementedError() - def generate_data(self, data_dir, tmp_dir, task_id=-1): - list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" - "street/python/fsns_urls.txt") - fsns_urls = generator_utils.maybe_download( - tmp_dir, "fsns_urls.txt", list_url) - fsns_files = [f.strip() for f in open(fsns_urls, "r") - if f.startswith("http://")] - for url in fsns_files: - if "/train/train" in url: - generator_utils.maybe_download( - data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url) - elif "/validation/validation" in url: - generator_utils.maybe_download( - data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) - elif "charset" in url: - generator_utils.maybe_download( - data_dir, "charset_size134.txt", url) + def feature_encoders(self, data_dir): + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder() + else: + vocab_filename = os.path.join( + data_dir, "vocab.endefr.%d" % self.targeted_vocab_size) + encoder = text_encoder.SubwordTextEncoder(vocab_filename) + return {"targets": encoder} def hparams(self, defaults, model_hparams): p = defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - # This vocab file must be present within the data directory. - vocab_filename = os.path.join(model_hparams.data_dir, "charset_size134.txt") - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - p.target_modality = (registry.Modalities.SYMBOL, subtokenizer.vocab_size) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": subtokenizer, - } + encoder = self._encoders["targets"] + p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) p.batch_size_multiplier = 256 p.max_expected_batch_size_per_shard = 2 - vocab_size = 144 - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, vocab_size) - p.input_space_id = problem.SpaceID.DIGIT_0 - p.target_space_id = problem.SpaceID.DIGIT_1 + p.loss_multiplier = 1.0 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = self.target_space_id + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.train_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)) + + +@registry.register_problem +class ImageMsCocoCharacters(Image2TextProblem): + """MSCOCO, character level.""" + + @property + def is_character_level(self): + return True + + @property + def target_space_id(self): + return problem.SpaceID.EN_CHR + + @property + def train_shards(self): + return 100 + + @property + def dev_shards(self): + return 10 + + def preprocess_examples(self, examples, mode): + return imagenet_preprocess_examples(examples, mode) + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return mscoco_generator(data_dir, tmp_dir, True, 80000) + else: + return mscoco_generator(data_dir, tmp_dir, False, 40000) + raise NotImplementedError() + + +@registry.register_problem +class ImageMsCocoTokens8k(ImageMsCocoCharacters): + """MSCOCO, 8k tokens vocab.""" + + @property + 
def is_character_level(self): + return False + + @property + def targeted_vocab_size(self): + return 2**13 # 8192 + + @property + def target_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def train_shards(self): + return 100 + + @property + def dev_shards(self): + return 10 + + def generator(self, data_dir, tmp_dir, is_training): + vocab_filename = "vocab.endefr.%d" % self.targeted_vocab_size + if is_training: + return mscoco_generator( + data_dir, tmp_dir, True, 80000, + vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + else: + return mscoco_generator( + data_dir, tmp_dir, False, 40000, + vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + + +@registry.register_problem +class ImageMsCocoTokens32k(ImageMsCocoTokens8k): + """MSCOCO, 32k tokens vocab.""" + + @property + def targeted_vocab_size(self): + return 2**15 # 32768 - def example_reading_spec(self): - label_key = "image/unpadded_label" - return super(ImageFSNS, self).example_reading_spec(self, - label_key=label_key) # URL and filename for CELEBA data. _CELEBA_NAME = "img_align_celeba" diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 6f49a8d97..3d30ec239 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -82,6 +82,8 @@ class SpaceID(object): DNA = 23 # Real numbers REAL = 24 + # Images + IMAGE = 25 class Problem(object): @@ -234,9 +236,6 @@ def internal_hparams(self, model_hparams): if self._was_reversed: _reverse_problem_hparams(hp) - # TODO(rsepassi): Move this into the cifar10 Problem - if "image_cifar10" in self.name: - hp.loss_multiplier = 1. if self._was_copy: _copy_problem_hparams(hp) return hp diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 3c829eeac..607078d2f 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -48,8 +48,6 @@ def problem_hparams(problem_name, model_hparams): p = _lookup_problem_hparams_fn(base_name)(model_hparams) if was_reversed: _reverse_problem_hparams(p) - if "image_cifar10" in base_name: - p.loss_multiplier = 1. 
if was_copy: _copy_problem_hparams(p) return p @@ -509,86 +507,6 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): return p -def image_cifar10(unused_model_hparams): - """CIFAR-10.""" - p = default_problem_hparams() - p.input_modality = { - "inputs": ("%s:small_image_modality" % registry.Modalities.IMAGE, None) - } - p.target_modality = (registry.Modalities.CLASS_LABEL, 10) - p.batch_size_multiplier = 4 - p.max_expected_batch_size_per_shard = 8 - p.loss_multiplier = 3.0 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_mnist(unused_model_hparams): - """MNIST.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, 256)} - p.target_modality = (registry.Modalities.CLASS_LABEL, 10) - p.batch_size_multiplier = 4 - p.max_expected_batch_size_per_shard = 8 - p.loss_multiplier = 3.0 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_imagenet(model_hparams): - """ImageNet.""" - p = default_problem_hparams() - p.input_modality = { - "inputs": (registry.Modalities.IMAGE, None), - } - target_modality = ("%s:class_label_2d" % registry.Modalities.CLASS_LABEL - if model_hparams.imagenet_use_2d else - registry.Modalities.CLASS_LABEL) - p.target_modality = (target_modality, 1000) - p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 2 - p.loss_multiplier = 0.7 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_mscoco_characters(unused_model_hparams): - """COCO image captioning with captions as characters.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - p.target_modality = (registry.Modalities.SYMBOL, 256) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - p.batch_size_multiplier = 128 - p.max_expected_batch_size_per_shard = 2 - p.loss_multiplier = 2.0 - p.input_space_id = 1 - p.target_space_id = 2 - return p - - -def image_mscoco_tokens(model_hparams, vocab_count): - """COCO image captioning with captions as tokens.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - # This vocab file must be present within the data directory. 
- vocab_filename = os.path.join(model_hparams.data_dir, - "vocab.endefr.%d" % vocab_count) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - p.target_modality = (registry.Modalities.SYMBOL, subtokenizer.vocab_size) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": subtokenizer, - } - p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 2 - - def img2img_imagenet(unused_model_hparams): """Image 2 Image for imagenet dataset.""" p = default_problem_hparams() @@ -633,15 +551,6 @@ def image_celeba(unused_model_hparams): "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), "wmt_ende_bpe32k": wmt_ende_bpe32k, - "image_cifar10_tune": image_cifar10, - "image_cifar10_test": image_cifar10, - "image_mnist_tune": image_mnist, - "image_mnist_test": image_mnist, "image_celeba_tune": image_celeba, - "image_mscoco_characters_tune": image_mscoco_characters, - "image_mscoco_characters_test": image_mscoco_characters, - "image_mscoco_tokens_8k_test": lambda p: image_mscoco_tokens(p, 2**13), - "image_mscoco_tokens_32k_test": lambda p: image_mscoco_tokens(p, 2**15), - "image_imagenet": image_imagenet, "img2img_imagenet": img2img_imagenet, } diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index c9b43d507..0a47e9989 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -426,7 +426,7 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, tmp_dir, train): + def train_generator(self, _, tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS tag = "train" if train else "dev" diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 3ac477e4b..87ad70e41 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -546,7 +546,6 @@ def bluenet_base(): hparams.optimizer_adam_epsilon = 1e-6 hparams.optimizer_adam_beta1 = 0.85 hparams.optimizer_adam_beta2 = 0.997 - hparams.add_hparam("imagenet_use_2d", True) hparams.add_hparam("anneal_until", 40000) hparams.add_hparam("batch_deviation_loss_factor", 5.0) return hparams diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 2c7e8afc9..b52fb8aea 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -358,23 +358,33 @@ def dot_product_attention(q, return tf.matmul(weights, v) -def masked_local_attention_1d( - q, k, v, block_length=128, name=None): - """Attention to the source position and a neigborhood to the left of it. +def masked_local_attention_1d(q, k, v, + block_length=128, look_right=True, + use_whole_block=False, name=None): + """Attention to the source position and a neigborhood around it. - The sequence is divided into blocks of length block_size. - Attention for a given query position can only see memory positions - less than or equal to the query position, in the corresponding block - and the previous block. + The sequence is divided into blocks of length block_size. Attention for a + given query position can only see memory positions within a certain number + of positions before and behind it. 
-
-  If mask_right is True, then a target position cannot see greater source
+
+  If look_right is True then each query will attend to block_length//2
+  positions either side, otherwise it will attend to block_length previous
   positions.

+  If use_whole_block is True then no mask will be applied to the local blocks
+  meaning the full blocks are used (if look_right is True then the elements to
+  the right of the current position are still masked out). This allows us to
+  attend to more elements without additional overhead, but means we have
+  inconsistent window positions and sizes.
+
   Args:
-    q: a Tensor with shape [batch, heads, length, depth_k]
-    k: a Tensor with shape [batch, heads, length, depth_k]
-    v: a Tensor with shape [batch, heads, length, depth_v]
+    q: a Tensor with shape [batch, heads, length_q, depth_k]
+    k: a Tensor with shape [batch, heads, length_kv, depth_k]
+    v: a Tensor with shape [batch, heads, length_kv, depth_v]
     block_length: an integer
+    look_right: a bool
+    use_whole_block: a bool
     name: an optional string

   Returns:
@@ -386,61 +396,71 @@ def masked_local_attention_1d(
     batch = tf.shape(q)[0]
     heads = tf.shape(q)[1]
     length = tf.shape(q)[2]
-    # If (length < 2 * block_length), then we use only one block.
-    block_length = tf.where(tf.less(length, block_length * 2),
-                            length, block_length)
     depth_k = tf.shape(q)[3]
     depth_v = tf.shape(v)[3]
     original_length = length
+
+    # If (length < block_length), then we use only one block.
+    block_length = tf.where(tf.less(length, block_length),
+                            length, block_length)
+    # Pad to desired length.
     padding_size = tf.mod(-length, block_length)
     length += padding_size
+    num_blocks = tf.div(length, block_length)
     padding = [[0, 0], [0, 0], [0, padding_size], [0, 0]]
     q = tf.pad(q, padding)
-    k = tf.pad(k, padding)
-    v = tf.pad(v, padding)
-    num_blocks = tf.div(length, block_length)
-
-    # compute attention for the first query block.
-    first_q = tf.slice(q, [0, 0, 0, 0], [-1, -1, block_length, -1])
-    first_k = tf.slice(k, [0, 0, 0, 0], [-1, -1, block_length, -1])
-    first_v = tf.slice(v, [0, 0, 0, 0], [-1, -1, block_length, -1])
-    first_output = dot_product_attention(
-        first_q, first_k, first_v, attention_bias_lower_triangle(block_length),
-        name="fist_block")

-    # compute attention for all subsequent query blocks.
+    if not look_right:
+      # Add extra padding so we don't have to do an initial query block.
+      extra_padding = [[0, 0], [0, 0], [block_length, padding_size], [0, 0]]
+    else:
+      # We shift everything over by half a block so query is in center.
+      pad_right = block_length // 2
+      pad_left = block_length - pad_right
+      extra_padding = [[0, 0], [0, 0],
+                       [pad_left, padding_size+pad_right], [0, 0]]
+    k = tf.pad(k, extra_padding)
+    v = tf.pad(v, extra_padding)
+
+    # Reshape into blocks.
     q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k])
-    k = tf.reshape(k, [batch, heads, num_blocks, block_length, depth_k])
-    v = tf.reshape(v, [batch, heads, num_blocks, block_length, depth_v])
+    k = tf.reshape(k, [batch, heads, num_blocks+1, block_length, depth_k])
+    v = tf.reshape(v, [batch, heads, num_blocks+1, block_length, depth_v])

+    # Get local blocks by slicing.
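Shape bookkeeping for the blocked attention above, assuming length=8, block_length=4, and look_right=False:

    # q:    [batch, heads, 8, depth] -> [batch, heads, 2, 4, depth]  (2 blocks)
    # k, v: get one extra block of left padding, so after the reshape they are
    #       [batch, heads, 3, 4, depth]                        (num_blocks + 1)
    # local() then pairs each block with its predecessor, yielding keys/values
    # of shape [batch, heads, 2, 8, depth]: every query block attends to its
    # own block plus the one before it.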
  def local(x):
    """Create a local version of the keys or values."""
    prev_block = tf.slice(
-        x, [0, 0, 0, 0, 0], [-1, -1, num_blocks - 1, -1, -1])
+        x, [0, 0, 0, 0, 0], [-1, -1, num_blocks, -1, -1])
    cur_block = tf.slice(
        x, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1])
    return tf.concat([prev_block, cur_block], 3)
  local_k = local(k)
  local_v = local(v)
-  tail_q = tf.slice(q, [0, 0, 1, 0, 0], [-1, -1, -1, -1, -1])
-  local_length = tf.shape(local_k)[3]
-  # [batch, heads, num_blocks - 1, block_length, local_length]
-  attention = tf.matmul(tail_q, local_k, transpose_b=True)
-
-  # make sure source_pos <= target_pos
-  good_part = tf.matrix_band_part(
-      tf.ones([block_length, local_length]), -1, tf.to_int64(block_length))
-  mask = (1.0 - good_part) * -1e9
-  attention += tf.reshape(mask, [1, 1, 1, block_length, local_length])
+  local_length = tf.shape(local_k)[3]
+
+  # [batch, heads, num_blocks, block_length, local_length]
+  attention = tf.matmul(q, local_k, transpose_b=True)
   attention = tf.nn.softmax(attention)
+
+  # Get the local mask.
+  if not use_whole_block:
+    good_part = tf.matrix_band_part(
+        tf.ones([block_length, local_length]), 0, tf.to_int64(block_length))
+  elif not look_right:
+    good_part = tf.matrix_band_part(
+        tf.ones([block_length, local_length]), -1, tf.to_int64(block_length))
+  else:
+    good_part = tf.ones([block_length, local_length])
+
+  attention *= tf.reshape(good_part, [1, 1, 1, block_length, local_length])
+
   # TODO(noam): figure out how to show a summary for the remaining blocks.
   # The naive way currently causes errors due to empty tensors.
-  # output: [batch, heads, num_blocks-1, block_length, depth_v]
   output = tf.matmul(attention, local_v)
   output = tf.reshape(output, [batch, heads, -1, depth_v])
-  output = tf.concat([first_output, output], axis=2)
+
+  # Remove the added padding.
   output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1])
   output.set_shape(v_shape)
   return output
diff --git a/tensor2tensor/models/common_attention_test.py b/tensor2tensor/models/common_attention_test.py
index 78be4b645..a09da74e1 100644
--- a/tensor2tensor/models/common_attention_test.py
+++ b/tensor2tensor/models/common_attention_test.py
@@ -41,6 +41,34 @@ def testDotProductAttention(self):
       res = session.run(a)
     self.assertEqual(res.shape, (5, 7, 12, 32))
 
+  def testMaskedLocalAttention(self):
+    q = np.array([[[[1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0]]]])
+    k = np.array([[[[1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0],
+                    [1.0, 0.0, 0.0, 0.0]]]])
+    v = np.ones((1, 1, 8, 1))
+    with self.test_session() as session:
+      q_ = tf.constant(q, dtype=tf.float32)
+      k_ = tf.constant(k, dtype=tf.float32)
+      v_ = tf.constant(v, dtype=tf.float32)
+      y = common_attention.masked_local_attention_1d(
+          q_, k_, v_, block_length=tf.constant(2))
+      res = session.run(y)
+
+    self.assertEqual(res.shape, (1, 1, 8, 1))
+
   def testLocalUnmaskedAttention(self):
     x = np.random.rand(5, 4, 25, 16)
     y = np.random.rand(5, 4, 25, 16)
diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py
index e98531d88..5449a8bef 100644
--- a/tensor2tensor/models/common_layers.py
+++ b/tensor2tensor/models/common_layers.py
@@ -475,7 +475,7 @@ def residual_fn(x,
                 residual_dropout,
                 filters=None,
                 epsilon=1e-16,
-                name="residual"):
+                name=None, reuse=None):
   """Returns a function for combining layer input and
layer output.
 
   The returned function on x (layer input) and y (layer output) computes:
@@ -489,16 +489,19 @@ def residual_fn(x,
     filters: integer, dimension for layer norm, optional
     epsilon: integer, value of layer norm epsilon
     name: string, name
+    reuse: bool, whether to reuse variables in the scope
 
   Returns:
     residual layer output with applied norm_fn.
   """
-  norm_fn = get_norm(norm_type)
-  res = x + tf.nn.dropout(y, 1.0 - residual_dropout)
-  if norm_type == "layer":
-    return norm_fn(res, name=name, filters=filters, epsilon=epsilon)
-  else:
-    return norm_fn(res, name=name)
+  with tf.variable_scope(name, default_name="residual",
+                         values=[x, y], reuse=reuse):
+    norm_fn = get_norm(norm_type)
+    res = x + tf.nn.dropout(y, 1.0 - residual_dropout)
+    if norm_type == "layer":
+      return norm_fn(res, filters=filters, epsilon=epsilon, name=norm_type)
+    else:
+      return norm_fn(res, name=norm_type)
 
 
 def conv_block_internal(conv_fn,
@@ -1457,6 +1460,34 @@ def global_pool_1d(inputs, pooling_type="MAX", mask=None):
   return output
 
 
+def running_global_pool_1d(inputs, pooling_type="MAX"):
+  """Same global pool, but only for the elements up to the current element.
+
+  Useful for outputs where the state of future elements is not known.
+  Takes no mask as all elements up to the current element are assumed to exist.
+  Currently only supports maximum. Equivalent to using a lower triangle bias.
+
+  Args:
+    inputs: A tensor of dimensions batch_size x sequence_length x input_dims
+      containing the sequences of input vectors.
+    pooling_type: Pooling type to use. Currently only supports 'MAX'.
+
+  Returns:
+    output: A tensor of dimensions batch_size x sequence_length x input_dims
+      containing the running 'totals'.
+  """
+  del pooling_type
+  with tf.name_scope("running_global_pool", [inputs]):
+    scan_fct = tf.maximum
+    # Permute inputs so seq_length is first.
+    elems = tf.transpose(inputs, [1, 0, 2])
+    # Perform scan.
+    cumulatives = tf.scan(scan_fct, elems, swap_memory=True)
+    # Permute output to get back to original order.
+    output = tf.transpose(cumulatives, [1, 0, 2])
+  return output
+
+
 def linear_set_layer(layer_size,
                      inputs,
                      context=None,
@@ -1486,7 +1517,8 @@ def linear_set_layer(layer_size,
     output: A tensor of dimensions batch_size x sequence_length x output_dims
       dimension containing the sequences of transformed vectors.
   """
-  with tf.variable_scope(name, "linear_set_layer", [inputs]):
+  with tf.variable_scope(name, default_name="linear_set_layer",
+                         values=[inputs]):
     # Apply 1D convolution to apply linear filter to each element
     # along the 2nd dimension.
     outputs = conv1d(inputs, layer_size, 1, activation=None, name="set_conv")
@@ -1495,9 +1527,10 @@ def linear_set_layer(layer_size,
     if context is not None:
       # Unfortunately tf doesn't support broadcasting via concat, but we can
       # simply add the transformed context to get the same effect.
-      context = tf.expand_dims(context, axis=1)
-      cont_tfm = conv1d(
-          context, layer_size, 1, activation=None, name="cont_conv")
+      if len(context.get_shape().as_list()) == 2:
+        context = tf.expand_dims(context, axis=1)
+      cont_tfm = conv1d(context, layer_size, 1,
+                        activation=None, name="cont_conv")
       outputs += cont_tfm
 
     if activation_fn is not None:
@@ -1512,6 +1545,7 @@ def linear_set_layer(layer_size,
 def ravanbakhsh_set_layer(layer_size,
                           inputs,
                           mask=None,
+                          sequential=False,
                           activation_fn=tf.nn.tanh,
                           dropout=0.0,
                           name=None):
@@ -1525,6 +1559,9 @@ def ravanbakhsh_set_layer(layer_size,
       containing the sequences of input vectors.
mask: A tensor of dimensions batch_size x sequence_length containing a mask for the inputs with 1's for existing elements, and 0's elsewhere. + sequential: If true, will use a running global pool so each element will + only depend on those before it. Set true if this layer is being used in + an output sequence. activation_fn: The activation function to use. dropout: dropout. name: name. @@ -1533,12 +1570,16 @@ def ravanbakhsh_set_layer(layer_size, output: A tensor of dimensions batch_size x sequence_length x vector dimension containing the sequences of transformed vectors. """ + del dropout with tf.variable_scope(name, "ravanbakhsh_set_layer", [inputs]): - output = linear_set_layer( + if sequential: + return linear_set_layer( + layer_size, + inputs - running_global_pool_1d(inputs), + activation_fn=activation_fn, + name=name) + return linear_set_layer( layer_size, inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1), activation_fn=activation_fn, - dropout=dropout, name=name) - - return output diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index ae221bdff..195879d78 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -247,8 +247,8 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): return tf.expand_dims(decoder_outputs, axis=2) -@registry.register_model("baseline_lstm_seq2seq") -class LSTMSeq2Seq(t2t_model.T2TModel): +@registry.register_model +class LSTMSeq2seq(t2t_model.T2TModel): def model_fn_body(self, features): train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN @@ -256,8 +256,8 @@ def model_fn_body(self, features): self._hparams, train) -@registry.register_model("baseline_lstm_seq2seq_attention") -class LSTMSeq2SeqAttention(t2t_model.T2TModel): +@registry.register_model +class LSTMSeq2seqAttention(t2t_model.T2TModel): def model_fn_body(self, features): train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 31380afa5..6ac792f48 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -44,7 +44,7 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2Seq( + model = lstm.LSTMSeq2seq( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) @@ -68,7 +68,7 @@ def testLSTMSeq2SeqAttention(self): "inputs": x, "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2SeqAttention( + model = lstm.LSTMSeq2seqAttention( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index c57a97905..912c54f8c 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -359,7 +359,7 @@ def xnet_resblock(x, filters, res_relu, name): class ClassLabelModality(modality.Modality): """Used for label data.""" - def __init__(self, model_hparams, vocab_size, is2d=False): + def __init__(self, model_hparams, vocab_size, is2d=True): super(ClassLabelModality, self).__init__(model_hparams, vocab_size) self._is_2d = is2d self._kernel = (3, 3) if is2d else (5, 1) @@ -425,12 +425,12 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): @registry.register_class_label_modality("class_label_2d") -class 
ClassLabel2DModality(ClassLabelModality): +class ClassLabel1DModality(ClassLabelModality): """Used for label data.""" def __init__(self, model_hparams, vocab_size): - super(ClassLabel2DModality, self).__init__( - model_hparams=model_hparams, vocab_size=vocab_size, is2d=True) + super(ClassLabel1DModality, self).__init__( + model_hparams=model_hparams, vocab_size=vocab_size, is2d=False) @registry.register_generic_modality("default") diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 089889ce6..6f60dbfbf 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -190,7 +190,6 @@ def multimodel_base(): hparams.add_hparam("moe_n2", 0) hparams.add_hparam("moe_layers", "2") hparams.add_hparam("moe_loss_coef", 1e-2) - hparams.add_hparam("imagenet_use_2d", int(True)) return hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index cf109beb4..73a8436cc 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -23,8 +23,9 @@ import numpy as np -from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators import image # pylint: disable=unused-import from tensor2tensor.models import multimodel +from tensor2tensor.utils import registry import tensorflow as tf @@ -32,10 +33,12 @@ class MultiModelTest(tf.test.TestCase): def testMultiModel(self): - x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + x = np.random.random_integers(0, high=255, size=(3, 5, 5, 3)) y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) hparams = multimodel.multimodel_tiny() - p_hparams = problem_hparams.image_cifar10(hparams) + hparams.add_hparam("data_dir", "") + problem = registry.problem("image_cifar10") + p_hparams = problem.internal_hparams(hparams) hparams.problems = [p_hparams] with self.test_session() as session: features = { diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 69e2338b6..f1534137c 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -316,7 +316,6 @@ def slicenet_params1(): hparams.add_hparam("moe_n1", 32) hparams.add_hparam("moe_n2", 0) hparams.add_hparam("moe_loss_coef", 1e-2) - hparams.add_hparam("imagenet_use_2d", int(True)) # attention-related flags hparams.add_hparam("attention_type", "simple") hparams.add_hparam("num_heads", 8) diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 54b57a9f2..c357448e4 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -23,8 +23,10 @@ import numpy as np -from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators import image # pylint: disable=unused-import +from tensor2tensor.models import modalities # pylint: disable=unused-import from tensor2tensor.models import slicenet +from tensor2tensor.utils import registry import tensorflow as tf @@ -32,10 +34,12 @@ class SliceNetTest(tf.test.TestCase): def testSliceNet(self): - x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + x = np.random.random_integers(0, high=255, size=(3, 5, 5, 3)) y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) hparams = slicenet.slicenet_params1_tiny() - p_hparams = problem_hparams.image_cifar10(hparams) + hparams.add_hparam("data_dir", "") + problem = registry.problem("image_cifar10") + p_hparams = problem.internal_hparams(hparams) hparams.problems = [p_hparams] with 
self.test_session() as session: features = { diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c9d0a2db2..2320a57f1 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -57,8 +57,11 @@ def model_fn_body(self, features): targets, hparams) def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) + return common_layers.residual_fn(x, y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) @@ -267,6 +270,7 @@ def transformer_ffn_layer(x, hparams): def transformer_base(): """Set of hyperparameters.""" hparams = common_hparams.basic_params1() + hparams.norm_type = "layer" hparams.hidden_size = 512 hparams.batch_size = 4096 hparams.max_length = 256 diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 62413c325..1f20bfb51 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -50,17 +50,12 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, encoder_attention_bias, - _) = transformer.transformer_prepare_encoder(inputs, target_space, hparams) - (decoder_input, - decoder_self_attention_bias) = transformer.transformer_prepare_decoder( - targets, hparams) - - # We need masks of the form batch size x input sequences - # Biases seem to be of the form batch_size x 1 x input sequences x vec dim - # Squeeze out dim one, and get the first element of each vector. - encoder_mask = tf.squeeze(encoder_attention_bias, [1])[:, :, 0] - decoder_mask = tf.squeeze(decoder_self_attention_bias, [1])[:, :, 0] + (encoder_input, encoder_attention_bias, _) = ( + transformer.transformer_prepare_encoder(inputs, target_space, hparams)) + (decoder_input, _) = ( + transformer.transformer_prepare_decoder(targets, hparams)) + + encoder_mask = bias_to_mask(encoder_attention_bias) def residual_fn(x, y): return common_layers.layer_norm(x + tf.nn.dropout( @@ -68,11 +63,12 @@ def residual_fn(x, y): encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) + encoder_output = alt_transformer_encoder( encoder_input, residual_fn, encoder_mask, hparams) decoder_output = alt_transformer_decoder( - decoder_input, encoder_output, residual_fn, decoder_mask, + decoder_input, encoder_output, residual_fn, encoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) @@ -80,7 +76,7 @@ def residual_fn(x, y): return decoder_output -def composite_layer(inputs, mask, hparams): +def composite_layer(inputs, mask, hparams, for_output=False): """Composite layer.""" x = inputs @@ -92,26 +88,28 @@ def composite_layer(inputs, mask, hparams): hparams.hidden_size, x, mask=mask, - dropout=0.0) + sequential=for_output, + dropout=hparams.relu_dropout) # Transforms elements to get a context, and then uses this in a final layer. elif hparams.composite_layer_type == "reembedding": # Transform elements n times and then pool. 
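The two composite-layer variants are easiest to see in isolation. Below is a minimal NumPy sketch of both patterns, not the library code: hypothetical random weights, max pooling as in the `global_pool_1d` default, and a cumulative max standing in for `running_global_pool_1d` in the causal (`for_output=True`) case:

  import numpy as np

  def reembedding_sketch(x, w1, w2, wc, causal=False):
    # x: [batch, length, depth]. Transform each element independently...
    h = np.maximum(x @ w1, 0.0)
    # ...pool over the set to get a context (a running max keeps it causal)...
    if causal:
      context = np.maximum.accumulate(h, axis=1)
    else:
      context = h.max(axis=1, keepdims=True)
    # ...then the final layer sees each element plus the pooled context.
    return np.maximum(h @ w2 + context @ wc, 0.0)

  def ravanbakhsh_sketch(x, w, causal=False):
    # Deep-sets style layer: each element is offset by the pooled context
    # before a shared linear map and tanh.
    context = (np.maximum.accumulate(x, axis=1) if causal
               else x.max(axis=1, keepdims=True))
    return np.tanh((x - context) @ w)

  x = np.random.randn(2, 5, 4)
  w = np.random.randn(4, 4)
  print(ravanbakhsh_sketch(x, w).shape)                     # (2, 5, 4)
  print(reembedding_sketch(x, w, w, w, causal=True).shape)  # (2, 5, 4)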
for layer in xrange(hparams.layers_per_layer): - with tf.variable_scope(".%d" % layer): + with tf.variable_scope("sub_layer_%d" % layer): x = common_layers.linear_set_layer( hparams.hidden_size, x, - dropout=0.0) - context = common_layers.global_pool_1d(x, mask=mask) - + dropout=hparams.relu_dropout) + if for_output: + context = common_layers.running_global_pool_1d(x) + else: + context = common_layers.global_pool_1d(x, mask=mask) # Final layer. x = common_layers.linear_set_layer( hparams.hidden_size, x, context=context, - dropout=0.0) - + dropout=hparams.relu_dropout) return x @@ -122,29 +120,25 @@ def alt_transformer_encoder(encoder_input, name="encoder"): """Alternative encoder.""" x = encoder_input - with tf.variable_scope(name): + x = encoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = residual_fn(x, composite_layer(x, mask, hparams)) - return x def alt_transformer_decoder(decoder_input, encoder_output, residual_fn, - mask, encoder_decoder_attention_bias, hparams, name="decoder"): """Alternative decoder.""" - x = decoder_input - with tf.variable_scope(name): + x = decoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x_ = common_attention.multihead_attention( x, encoder_output, @@ -156,17 +150,30 @@ def alt_transformer_decoder(decoder_input, hparams.attention_dropout, name="encdec_attention") - x_ = residual_fn(x_, composite_layer(x_, mask, hparams)) + x_ = residual_fn(x_, composite_layer(x_, None, hparams, + for_output=True)) x = residual_fn(x, x_) - return x +def bias_to_mask(bias): + # We need masks of the form batch size x input sequences + # Biases are of the form batch_size x num_heads x input sequences x + # output sequences. Squeeze out dim one, and get the first element of + # each vector. + bias = tf.squeeze(bias, [1])[:, :, 0] + bias = - tf.clip_by_value(bias, -1.0, 1.0) + mask = 1 - bias + return mask + + @registry.register_hparams def transformer_alt(): """Set of hyperparameters.""" hparams = transformer.transformer_base() - hparams.batch_size = 64 + hparams.batch_size = 2048 + hparams.num_hidden_layers = 10 hparams.add_hparam("layers_per_layer", 4) - hparams.add_hparam("composite_layer_type", "reembedding") + # Composite layer: ravanbakhsh or reembedding. 
+ hparams.add_hparam("composite_layer_type", "ravanbakhsh") return hparams diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index 61fa61235..f2e69da21 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -86,7 +86,6 @@ def xception_base(): hparams.optimizer_adam_epsilon = 1e-6 hparams.optimizer_adam_beta1 = 0.85 hparams.optimizer_adam_beta2 = 0.997 - hparams.add_hparam("imagenet_use_2d", True) return hparams diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 81dcb52a5..05aa9bf26 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -27,7 +27,6 @@ from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_layers from tensor2tensor.utils import registry import tensorflow as tf @@ -127,35 +126,15 @@ def decode_record(record): return decode_record(example_serialized) -def preprocessing(examples, data_file_pattern, mode): +def preprocessing(examples, data_file_pattern): """Preprocessing of examples.""" + # This function is for obsolete problems only, as we're porting them + # all to the Problem class and its preprocess_examples method. Don't add. if "image" in data_file_pattern: - # Small single-example pre-processing for images. def resize(img, size): return tf.to_int64(tf.image.resize_images(img, [size, size])) - def preprocess(img): - img = tf.image.resize_images(img, [360, 360]) - img = common_layers.image_augmentation(tf.to_float(img) / 255.) - return tf.to_int64(img * 255.) - - if ("image_imagenet" in data_file_pattern or - "image_mscoco" in data_file_pattern): - examples["inputs"] = tf.cast(examples["inputs"], tf.int64) - # For imagnet/coco, resize images to 299x299 as is standard. - inputs = examples["inputs"] - if mode == tf.contrib.learn.ModeKeys.TRAIN: - examples["inputs"] = tf.cond( # Preprocess 80% of the time. - tf.less(tf.random_uniform([]), 0.8), - lambda img=inputs: preprocess(img), - lambda img=inputs: resize(img, 299)) - else: - examples["inputs"] = tf.to_int64(resize(inputs, 299)) - elif ("image_cifar10" in data_file_pattern and - mode == tf.contrib.learn.ModeKeys.TRAIN): - examples["inputs"] = common_layers.cifar_image_augmentation( - examples["inputs"]) - elif "img2img" in data_file_pattern: + if "img2img" in data_file_pattern: inputs = examples["inputs"] examples["inputs"] = resize(inputs, 16) examples["targets"] = resize(inputs, 64) @@ -163,7 +142,6 @@ def preprocess(img): inputs = examples["inputs"] examples["inputs"] = resize(inputs, 8) examples["targets"] = resize(inputs, 32) - elif "audio" in data_file_pattern: # Reshape audio to proper shape sample_count = tf.to_int32(examples.pop("audio/sample_count")) @@ -205,8 +183,6 @@ def default_example_reading_spec(data_file_pattern): # Read from image TFRecords if the file has "image" in its name. 
if data_file_pattern and "image" in data_file_pattern: label_key = "image/class/label" - if "fsns" in data_file_pattern: - label_key = "image/unpadded_label" data_fields = { "image/encoded": tf.FixedLenFeature((), tf.string), "image/format": tf.FixedLenFeature((), tf.string), @@ -257,7 +233,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams): data_items_to_decoders=data_items_to_decoders) if problem is None: - examples = preprocessing(examples, data_file_pattern, mode) + examples = preprocessing(examples, data_file_pattern) else: examples = problem.preprocess_examples(examples, mode, hparams) diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 0baad2471..9d5e1e0a6 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -76,7 +76,7 @@ class Modalities(object): # Camel case to snake case utils _first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)") -_all_cap_re = re.compile("([a-z])([A-Z])") +_all_cap_re = re.compile("([a-z0-9])([A-Z])") def _convert_camel_to_snake(name): diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 3231809ea..62c24b054 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -94,8 +94,9 @@ def testSnakeCase(self): convert = registry._convert_camel_to_snake self.assertEqual("typical_camel_case", convert("TypicalCamelCase")) - self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2Gether")) - self.assertEqual("lstm_seq2seq", convert("LSTMSeq2Seq")) + self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2gether")) + self.assertEqual("numbers_fuse2_gether", convert("NumbersFuse2Gether")) + self.assertEqual("lstm_seq2_seq", convert("LSTMSeq2Seq")) self.assertEqual("starts_lower", convert("startsLower")) self.assertEqual("starts_lower_caps", convert("startsLowerCAPS")) self.assertEqual("caps_fuse_together", convert("CapsFUSETogether")) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 95774dabc..7cb484bc8 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -410,10 +410,13 @@ def model_fn(self, features, skip=False, last_position_only=False): # Construct the model body. with tf.variable_scope("body", reuse=self._problem_idx > 0): if skip: - body_outputs, extra_loss = transformed_features["targets"], 0.0 + body_outputs = transformed_features["targets"] + losses = {"extra": 0.0} else: - body_outputs, extra_loss = self.model_fn_body_sharded( + body_outputs, losses = self.model_fn_body_sharded( transformed_features) + if isinstance(losses, tf.Tensor): # If it's a single extra loss. + losses = {"extra": losses} with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: @@ -440,7 +443,8 @@ def model_fn(self, features, skip=False, last_position_only=False): training_loss = None tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) - return sharded_logits, {"training": training_loss, "extra": extra_loss} + losses["training"] = training_loss + return sharded_logits, losses def model_fn_body_sharded(self, sharded_features): """Mixture-of-experts models will override this function. 
@@ -465,10 +469,10 @@ def model_fn_body_sharded(self, sharded_features): _with_timing(self.model_fn_body, "model_fn_body"), datashard_to_features) if isinstance(output, tuple): - loss = tf.reduce_mean(output[1]) + loss = {"extra": tf.reduce_mean(output[1])} output = output[0] else: - loss = 0.0 + loss = {"extra": 0.0} return output, loss def model_fn_body(self, features): diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index ea88183c9..562279623 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -67,7 +67,7 @@ def setUpClass(cls): def testModelsImported(self): models = registry.list_models() - self.assertTrue("baseline_lstm_seq2seq" in models) + self.assertTrue("lstm_seq2seq" in models) def testHParamsImported(self): hparams = registry.list_hparams() From 41bca6896dd4c906ce67faeb19ca6422fbd3b6c3 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 1 Aug 2017 18:06:29 -0700 Subject: [PATCH 17/17] v1.1.4 PiperOrigin-RevId: 163916460 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ae028d847..6f509d03e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.3', + version='1.1.4', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com',
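A closing note on the registry change above: widening `_all_cap_re` from `([a-z])([A-Z])` to `([a-z0-9])([A-Z])` makes a digit-to-uppercase boundary count as a word break, which is why `LSTMSeq2Seq` now snake-cases with an extra underscore and the models were renamed `LSTMSeq2seq` / `LSTMSeq2seqAttention`. A runnable sketch (the substitution body is a plausible reconstruction consistent with the tests above, not quoted from the patch):

  import re

  _first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)")
  _all_cap_re = re.compile("([a-z0-9])([A-Z])")

  def convert_camel_to_snake(name):
    s1 = _first_cap_re.sub(r"\1_\2", name)
    return _all_cap_re.sub(r"\1_\2", s1).lower()

  print(convert_camel_to_snake("LSTMSeq2seq"))         # lstm_seq2seq
  print(convert_camel_to_snake("LSTMSeq2Seq"))         # lstm_seq2_seq
  print(convert_camel_to_snake("NumbersFuse2gether"))  # numbers_fuse2gether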