From 594da6340fc814743a2b7b8dd545a9965e036241 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Wed, 30 Aug 2017 07:21:14 -0700
Subject: [PATCH 01/32] internal.

PiperOrigin-RevId: 166990178
---
 README.md                                |  3 +-
 docs/new_problem.md                      | 58 ++++++++++++------------
 tensor2tensor/utils/data_reader.py       |  1 -
 tensor2tensor/visualization/attention.py |  2 +-
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 58a58aa17..4e56d7855 100644
--- a/README.md
+++ b/README.md
@@ -214,7 +214,8 @@ on the task (e.g. fed through a final linear transform to produce logits for a
 softmax over classes). All models are imported in
 [`models.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models/models.py),
 inherit from `T2TModel` - defined in
-[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py) - and are registered with
+[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py)
+- and are registered with
 [`@registry.register_model`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/registry.py).
 
 ### Hyperparameter Sets
diff --git a/docs/new_problem.md b/docs/new_problem.md
index e69a7dfdb..c859c6eba 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -15,20 +15,18 @@ Let's add a new dataset together and train the transformer model. We'll be learn
 
 For each problem we want to tackle we create a new problem class and register it. Let's call our problem `Word2def`.
 
-Since many text2text problems share similar methods, there's already a class called [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py#L354) that extends the base problem class, `Problem` (both found in [`problem.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py)).
+Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`).
 
-For our problem, we can go ahead and create the file `word2def.py` in the [`data_generators`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/) folder and add our new problem, `Word2def`, which extends [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/24071ba07d5a14c170044c5e60a24bda8179fb7a/tensor2tensor/data_generators/problem.py#L354). Let's also register it while we're at it so we can specify the problem through flags.
+For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `TranslateProblem`. Let's also register it while we're at it so we can specify the problem through flags.
 
 ```python
-@registry.register_problem
+@registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
-
-  @property
-  def is_character_level(self):
-    ...
+  raise NotImplementedError()
 ```
 
-We need to implement the following methods from [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py#L354) in our new class:
+We need to implement the following methods from `Text2TextProblem` in our new class:
 * is_character_level
 * targeted_vocab_size
 * generator
@@ -42,7 +40,7 @@ Let's tackle them one by one:
 
 **input_space_id, target_space_id, is_character_level, targeted_vocab_size, use_subword_tokenizer**:
 
-SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like, EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at [`data_generators/problem.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py) in the class `SpaceID`.
+SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at `data_generators/problem.py` in the class `SpaceID`.
 
 Since we're generating definitions and feeding in words at the character level, we set `is_character_level` to true, and use the same SpaceID, EN_CHR, for both input and target. Additionally, since we aren't using tokens, we don't need to give a `targeted_vocab_size` or define `use_subword_tokenizer`.
 
 **num_shards**:
@@ -58,8 +56,6 @@ The number of shards to break data files into.
 @registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
-
-  @property
   def is_character_level(self):
     return True
 
@@ -86,11 +82,12 @@ class Word2def(problem.Text2TextProblem):
 
 **generator**:
 
-We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator`, and `token_generator` are already written in the file [`wmt.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/wmt.py). We will import `character_generator` and [`text_encoder`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py) to write:
+We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator` and `token_generator` are already written in the file `wmt.py`. We will import `character_generator` and write:
 ```python
 def generator(self, data_dir, tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
   datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+  tag = "train" if train else "dev"
   return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 ```
@@ -111,6 +108,7 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -139,31 +137,42 @@ I've gone ahead and split all words into a train and test set and saved them in
 ```python
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
-  LOCATION_OF_DATA + 'words_train.txt',
-  LOCATION_OF_DATA + 'definitions_train.txt'
+  [
+    "LOCATION_OF_DATA/", ("words_train.txt", "definitions_train.txt")
+  ]
 ]
-
 _WORD2DEF_TEST_DATASETS = [
-  LOCATION_OF_DATA + 'words_test.txt',
-  LOCATION_OF_DATA + 'definitions_test.txt'
+  [
+    "LOCATION_OF_DATA", ("words_test.txt", "definitions_test.txt")
+  ]
 ]
 ```
 
 ## Putting it all together
 
-Now our `word2def.py` file looks like:
+Now our `word2def.py` file looks like: (with the correct imports)
 ```python
 """ Problem definition for word to dictionary definition. """
 
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import os
+import tarfile  # do we need this import
 
+from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.wmt import character_generator
 from tensor2tensor.utils import registry
 
+import tensorflow as tf
+
+FLAGS = tf.flags.FLAGS
+
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
   LOCATION_OF_DATA+'words_train.txt',
   LOCATION_OF_DATA+'definitions_train.txt'
@@ -189,6 +198,7 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -210,17 +220,7 @@ class Word2def(problem.Text2TextProblem):
 ```
 
 # Hyperparameters
-All hyperparamters inherit from `_default_hparams()` in `problem.py.` If you would like to customize your hyperparameters, register a new hyperparameter set in `word2def.py` like the example provided in the walkthrough. For example:
-
-```python
-from tensor2tensor.models import transformer
-
-@registry.register_hparams
-def word2def_hparams(self):
-  hparams = transformer.transformer_base_single_gpu()  # Or whatever you'd like to build off.
-  hparams.batch_size = 1024
-  return hparams
-```
+All hyperparameters inherit from `_default_hparams()` in `problem.py`. If you would like to customize your hyperparameters, add another method to the file `problem_hparams.py`.
 
 # Run the problem
 Now that we've gotten our problem set up, let's train a model and generate definitions.
 
 We specify our problem name, the model, and hparams.
 ```bash
 PROBLEM=word2def
 MODEL=transformer
-HPARAMS=word2def_hparams
+HPARAMS=transformer_base_single_gpu
 ```
 
 The rest of the steps are as given in the [walkthrough](walkthrough.md).
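
For reference, a registered hyperparameter set like the one this patch removes from the tutorial would look roughly like the sketch below. It builds off `transformer_base_single_gpu`; the `word2def_hparams` name and the `batch_size` override are illustrative, and note that a function registered with `@registry.register_hparams` takes no arguments:

```python
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry


@registry.register_hparams
def word2def_hparams():
  # Start from the single-GPU transformer baseline and override as needed.
  hparams = transformer.transformer_base_single_gpu()
  hparams.batch_size = 1024
  return hparams
```

With such a set registered, the run configuration above could use `HPARAMS=word2def_hparams` instead of `transformer_base_single_gpu`.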
diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py
index cde91cc7b..d55911f19 100644
--- a/tensor2tensor/utils/data_reader.py
+++ b/tensor2tensor/utils/data_reader.py
@@ -27,7 +27,6 @@
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import problem_hparams
 from tensor2tensor.data_generators.problem import preprocess_examples_common
diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py
index 6109f9cc6..bc4238081 100644
--- a/tensor2tensor/visualization/attention.py
+++ b/tensor2tensor/visualization/attention.py
@@ -15,7 +15,7 @@
 
 """Module for postprocessing and displaying tranformer attentions.
 
-This module is deigned to be called from an ipython notebook.
+This module is designed to be called from an ipython notebook.
 """
 
 import json

From 98f55734aa8f49aa00aec5cb27a90887e96b5682 Mon Sep 17 00:00:00 2001
From: Etienne Pot
Date: Wed, 30 Aug 2017 19:48:53 -0700
Subject: [PATCH 02/32] Add some logging/debug messages. Remove padding for
 all layers when local experts (both attention and fc)

PiperOrigin-RevId: 167086679
---
 tensor2tensor/layers/common_attention.py | 34 +++++++--
 tensor2tensor/models/attention_lm_moe.py | 94 ++++++++++++++++++++++--
 tensor2tensor/utils/expert_utils.py      | 15 ----
 3 files changed, 117 insertions(+), 26 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 253e9bee5..975ed94ae 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -33,6 +33,9 @@
 from tensorflow.python.framework import function
 
 
+_expert_count = 0
+
+
 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
   """Adds a bunch of sinusoids of different frequencies to a Tensor.
 
@@ -1007,9 +1010,22 @@ def self_attention_expert(
       expert_fn=functools.partial(self_attention_expert, mask_right=)
       )
   """
+  depth = x.get_shape().as_list()[-1]
   length = tf.shape(batch_coordinate)[0]
 
+  # Print a warning message if one of the experts isn't used (useful at
+  # inference where summaries aren't used and the gating function doesn't add
+  # noise)
+  global _expert_count  # Hack to make each expert have a unique id
+  _expert_count += 1
+  length = tf.cond(
+      tf.equal(length, 0),
+      lambda: tf.Print(  # pylint: disable=g-long-lambda
+          length, [length], "Expert {} empty: ".format(_expert_count)),
+      lambda: length,
+  )
+
   tf.summary.scalar("batch_size", length, family="experts_stats_batch_size")
 
   attention_kq_size = attention_kq_size or depth
@@ -1063,7 +1079,7 @@ def local_expert_attention(
     loss_coef,
     attention_num_experts,
     train=True,
-    pad_remover=None,
+    batch_coordinate=None,
    **kwargs
 ):
   """Attention using a mixture of experts.
 
   The mixture of experts is "local" in that it is replicated on each
   datashard.
 
+  local_moe flattens all batches, so to avoid problems with padding (ex: all
+  padding going to the same expert, self attention attending to non null
+  padding tokens,...), the padding should be removed before.
+
   Args:
-    x: a Tensor with shape [batch, length, depth]
+    x: a Tensor with shape [batch, length, depth] or [1, batch*length, depth]
     k: The number of experts to dispatch each example to
     loss_coef: a scalar. A multiplier for the expert loss
     attention_num_experts: The number of experts to use
     train: a boolean for the current mode
-    pad_remover (PadRemover): A util object containing the padding position
+    batch_coordinate (tf.Tensor): int32 tensor of shape [1, batch*length, 1]
+      containing the batch ids. If None, deduced from first dim of x.
     **kwargs: Arguments to forward to self_attention_expert
 
   Returns:
     y: a Tensor with shape [batch, length, depth]
     loss: a Scalar
   """
+  if batch_coordinate is None:
+    batch_coordinate = tf.expand_dims(
+        coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
   with tf.variable_scope("local_expert_attention"):
     additional_dispatch_params = {
-        "batch_coordinate": tf.expand_dims(
-            coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
+        "batch_coordinate": batch_coordinate
     }
     return expert_utils.local_moe(
         x,
@@ -1100,7 +1123,6 @@
         pass_x=True,
         pass_gates=False,
         additional_dispatch_params=additional_dispatch_params,
-        pad_remover=pad_remover
     )
 
 
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 3b72ea9c2..191d4aa04 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -25,6 +25,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 # Dependency imports
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -40,6 +42,9 @@
 import tensorflow as tf
 
 
+ModeKeys = tf.contrib.learn.ModeKeys  # pylint: disable=invalid-name
+
+
 class AttentionType(object):
   MULTIHEAD = "multihead"
   LOCAL_EXPERTS = "local_experts"
@@ -90,6 +95,37 @@ def _diet_expert(x):
       expert_fn = expert_utils.ffn_expert_fn(
           hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size)
 
+    if hparams.attention_type == AttentionType.LOCAL_EXPERTS:
+      # As preprocess and postprocess are called with batch of size one (all
+      # batches concatenated), we just make sure that batch_norm is not used
+      # (it should not be either way)
+      assert hparams.norm_type != "batch"
+
+      dp_remove_pad = functools.partial(
+          dp, remove_pad, pad_remover=pad_remover, mode=hparams.mode)
+      dp_restore_pad = functools.partial(
+          dp, restore_pad, ref_x=x, pad_remover=pad_remover, mode=hparams.mode)
+    elif (hparams.attention_type == AttentionType.MULTIHEAD or
+          hparams.attention_type == AttentionType.MEMORY_EFFICIENT):
+      # Using identity function: No effect
+      dp_remove_pad = lambda x: (x, None)
+      dp_restore_pad = lambda x: x
+    else:
+      raise ValueError("Only {} supported for now.".format(
+          AttentionType.get_choices()))
+
+    def print_shape(x, suffix):
+      # To help debugging, print the input/output shapes at inference and eval
+      # Inference for long sequences can take a long time, so this helps to
+      # see the progression of the generation
+      if hparams.mode == ModeKeys.TRAIN:
+        return x
+      return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix))
+
+    x = dp(print_shape, x, "in")
+    x, batch_coordinate = dp_remove_pad(x)
+    x = dp(print_shape, x, "in_flat")
+
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope(
@@ -118,11 +154,11 @@ def _diet_expert(x):
           y, loss = dp(
               common_attention.local_expert_attention,
               preprocess(x),
-              k=2,
+              k=hparams.attention_moe_k,
               loss_coef=hparams.attention_load_balance,
               attention_num_experts=hparams.attention_num_experts,
-              train=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
-              pad_remover=pad_remover,
+              train=hparams.mode == ModeKeys.TRAIN,
+              batch_coordinate=batch_coordinate,
              mask_right=True,
              attention_kq_size=hparams.attention_kq_size,
              attention_v_size=hparams.attention_v_size)
@@ -138,7 +174,7 @@ def _diet_expert(x):
              dp,
              self._ps_devices,
              preprocess(x),
-             hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
+             hparams.mode == ModeKeys.TRAIN,
              input_size=hparams.hidden_size,
              expert_fn=expert_fn,
              num_experts=hparams.moe_num_experts,
@@ -160,6 +196,9 @@ def _diet_expert(x):
               dropout=hparams.relu_dropout)
           x = postprocess(x, y)
     x = preprocess(x)
+
+    x = dp_restore_pad(x)
+
     decoder_output = dp(tf.expand_dims, x, 2)
     return decoder_output, extra_loss
 
@@ -187,12 +226,56 @@ def attention_lm_moe_prepare_decoder(targets, hparams):
   else:
     decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  # TODO(epot): The padding remover should take into account that the input is
+  # shifted.
   decoder_input = common_layers.shift_left_3d(targets)
   if hparams.pos == "timing":
     decoder_input = common_attention.add_timing_signal_1d(decoder_input)
   return (decoder_input, decoder_self_attention_bias, pad_remover)
 
 
+def remove_pad(x, pad_remover, mode):
+  """Remove padding by concatenating all dimensions into one.
+
+  Args:
+    x (tf.Tensor): input of shape [batch_size, length, depth]
+    pad_remover (obj): a PadRemover object
+    mode (ModeKeys): infer, train or eval. If inference, the padding remover is
+      not applied
+
+  Returns:
+    tf.Tensor of shape [1,length_nonpad,depth] where
+      length_nonpad <= batch_size*length
+  """
+  # Compute the batch coordinate before flattening all batches
+  batch_coordinate = tf.expand_dims(
+      common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
+  batch_coordinate = expert_utils.flatten_all_but_last(batch_coordinate)
+
+  # Concatenate all tokens (without padding)
+  x = expert_utils.flatten_all_but_last(x)
+
+  # Remove padding for training and eval
+  if mode != ModeKeys.INFER:
+    # This is a hack to allow inference when the <go> token
+    # is detected as padding and removed. This works for now because there is
+    # no padding at inference.
+    batch_coordinate = pad_remover.remove(batch_coordinate)
+    x = pad_remover.remove(x)
+
+  batch_coordinate = tf.expand_dims(batch_coordinate, axis=0)
+  x = tf.expand_dims(x, axis=0)  # Now batch_size=1
+  return x, batch_coordinate
+
+
+def restore_pad(x, ref_x, pad_remover, mode):
+  x = tf.squeeze(x, axis=0)
+  if mode != ModeKeys.INFER:
+    x = pad_remover.restore(x)
+  x = expert_utils.reshape_like(x, ref_x)
+  return x
+
+
 @registry.register_hparams
 def attention_lm_moe_base():
   """Set of hyperparameters.
@@ -238,6 +321,7 @@ def attention_lm_moe_base():
   hparams.add_hparam("moe_layers", "2")  # comma separated list of layer numbers
 
   # moe params. local attention moe.
   hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
+  hparams.add_hparam("attention_moe_k", 2)
   hparams.add_hparam("attention_num_experts", 16)
   # Key, query and value dimensions for the attention
   hparams.add_hparam("attention_kq_size", 128)
@@ -256,7 +340,7 @@ def attention_lm_moe_base_ae():
   hparams.attention_type = AttentionType.LOCAL_EXPERTS
   hparams.max_length = hparams.batch_size
   hparams.eval_drop_long_sequences = int(True)
-  hparams.batching_mantissa_bits = 2  # More buckets
+  hparams.min_length_bucket = 256  # Avoid cyclic problems for big batches
   hparams.learning_rate = 0.05
   hparams.learning_rate_warmup_steps = 10000
   return hparams
diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py
index fb1d1fac0..16820ff37 100644
--- a/tensor2tensor/utils/expert_utils.py
+++ b/tensor2tensor/utils/expert_utils.py
@@ -847,7 +847,6 @@ def local_moe(x,
               pass_x=True,
               pass_gates=False,
               additional_dispatch_params=None,
-              pad_remover=None,
               name=None):
   """Call a local mixture of experts.
 
@@ -864,8 +863,6 @@ def local_moe(x,
     additional_dispatch_params: The extra tensors that need to be sent to each
       expert. Examples include batch batch coordinates (see
      common_attention.local_expert_attention)
-    pad_remover (PadRemover): If given, the padding is removed/restored before
-      sending to the experts
     name: a string
 
   Returns:
@@ -879,14 +876,6 @@ def local_moe(x,
   with tf.variable_scope(name, default_name="local_moe"):
     x_flat = flatten_all_but_last(x)
 
-    # Remove the padding tokens
-    if pad_remover:
-      x_flat = pad_remover.remove(x_flat)
-      tf.summary.scalar(  # Should match the targets_nonpadding_tokens
-          "nonpadding_tokens",
-          tf.shape(x_flat)[0],
-          family="experts_stats")
-
     # The gates indicate which batch elements go to which tensors.
     # load is a measure of approximately how many examples go to each expert
     gates, load = noisy_top_k_gating(
@@ -908,16 +897,12 @@ def local_moe(x,
     expert_kwargs["gates"] = dispatcher.expert_to_gates()
     for k, v in six.iteritems(additional_dispatch_params or {}):
       v = flatten_all_but_last(v)
-      if pad_remover:
-        v = pad_remover.remove(v)
       expert_kwargs[k] = dispatcher.dispatch(v)
 
     ep = Parallelism([DEFAULT_DEV_STRING] * num_experts)
     expert_outputs = ep(expert_fn, **expert_kwargs)
 
     y_flat = dispatcher.combine(expert_outputs)
-    if pad_remover:
-      y_flat = pad_remover.restore(y_flat)
     y = reshape_like(y_flat, x)
 
     importance = tf.reduce_sum(gates, 0)

From 473089b113e7644aa9b3a7f8794f237d3f41e24f Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Wed, 30 Aug 2017 20:20:07 -0700
Subject: [PATCH 03/32] Correct CNN+DailyMail generator, make TransformerAE
 work with 2d input.

PiperOrigin-RevId: 167088556
---
 .../data_generators/cnn_dailymail.py    |  4 +-
 tensor2tensor/layers/modalities.py      | 45 ++++---------
 tensor2tensor/models/cycle_gan.py       | 66 +------------------
 tensor2tensor/models/transformer_vae.py | 46 +++++++++----
 4 files changed, 48 insertions(+), 113 deletions(-)

diff --git a/tensor2tensor/data_generators/cnn_dailymail.py b/tensor2tensor/data_generators/cnn_dailymail.py
index db4deae4e..93e846a0b 100644
--- a/tensor2tensor/data_generators/cnn_dailymail.py
+++ b/tensor2tensor/data_generators/cnn_dailymail.py
@@ -53,8 +53,8 @@ def _maybe_download_corpora(tmp_dir):
     filepath of the downloaded corpus file.
""" cnn_filename = "cnn_stories.tgz" - dailymail_filename = "dailymail_stories.tgz" cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/") + dailymail_filename = "dailymail_stories.tgz" dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/") if not tf.gfile.Exists(cnn_finalpath): cnn_file = generator_utils.maybe_download_from_drive( @@ -63,7 +63,7 @@ def _maybe_download_corpora(tmp_dir): cnn_tar.extractall(tmp_dir) if not tf.gfile.Exists(dailymail_finalpath): dailymail_file = generator_utils.maybe_download_from_drive( - tmp_dir, dailymail_filename, _CNN_STORIES_DRIVE_URL) + tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL) with tarfile.open(dailymail_file, "r:gz") as dailymail_tar: dailymail_tar.extractall(tmp_dir) return [cnn_finalpath, dailymail_finalpath] diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index e03e6835e..c93a05433 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -141,17 +141,11 @@ def top_dimensionality(self): def bottom(self, inputs): with tf.variable_scope(self.name): inputs = common_layers.standardize_images(inputs) - # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. - # tf.summary.image("inputs", inputs, max_outputs=2) - if self._model_hparams.compress_steps > 0: - strides = (2, 2) - else: - strides = (1, 1) + tf.summary.image("inputs", inputs, max_outputs=2) return common_layers.conv_block( inputs, self._body_input_depth, [((1, 1), (3, 3))], first_relu=False, - strides=strides, padding="SAME", force2d=True, name="small_image_conv") @@ -159,43 +153,26 @@ def bottom(self, inputs): def targets_bottom(self, inputs): with tf.variable_scope(self.name): # Reshape inputs to 2-d tensor and embed the RGB pixel values. 
+      shape = tf.shape(inputs)
       inputs = common_layers.flatten4d3d(inputs)
       ret = common_layers.embedding(
-          inputs,
+          tf.to_int32(inputs),
          self.top_dimensionality,
          self._body_input_depth,
          name="input_rgb_embedding")
      if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
        ret *= self._body_input_depth**0.5
-      return ret
+      ret = tf.reshape(ret, [shape[0], shape[1], shape[2],
+                             self._body_input_depth * 3])
+      return tf.layers.dense(ret, self._body_input_depth)
 
   def top(self, body_output, _):
     with tf.variable_scope("rgb_softmax"):
-      # separate embedding for each channel
-      # assuming the body output returns a tensor of shape
-      # [batch_size, rows, cols, channels, self._body_input_depth]
-      body_output_split = tf.split(body_output, self._channels, axis=3)
-      output_rgb_embedding_var = tf.get_variable(
-          "output_rgb_embedding",
-          [self._channels, self.top_dimensionality, self._body_input_depth],
-          initializer=tf.random_normal_initializer(0.0, self._body_input_depth
-                                                   **-0.5))
-      # compute logits separately for each channel
-      rgb_channel_logits = []
-      for i in self._channels:
-        shape = tf.shape(body_output_split[i])[:-1]
-        body_output = tf.reshape(body_output_split[i],
-                                 [-1, self._body_input_depth])
-        channel_logits = tf.matmul(
-            body_output, output_rgb_embedding_var[i], transpose_b=True)
-        rgb_channel_logits.append(
-            tf.reshape(channel_logits,
-                       tf.concat([shape, [self.top_dimensionality]], 0)))
-
-      logits = tf.concat(rgb_channel_logits, axis=3)
-      # Reshape logits to conform to CIFAR image shapes (32 by 32 by 3)
-
-      return logits
+      shape = tf.shape(body_output)
+      dim = body_output.get_shape().as_list()[-1] // 3
+      out = tf.reshape(body_output, [shape[0], shape[1], shape[2],
+                                     self._channels, dim])
+      return tf.layers.dense(out, self.top_dimensionality)
 
   def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
     # Call the default implementation, but weight 1.0 on 0s by default.
diff --git a/tensor2tensor/models/cycle_gan.py b/tensor2tensor/models/cycle_gan.py
index c17becbbe..4cf1a5871 100644
--- a/tensor2tensor/models/cycle_gan.py
+++ b/tensor2tensor/models/cycle_gan.py
@@ -124,74 +124,10 @@ def model_fn_body(self, features):
         self._hparams)
 
 
-def cycle_vae_gan_internal(inputs, targets, _, hparams):
-  """Cycle GAN, main step used for training."""
-  with tf.variable_scope("cycle_vae_gan"):
-    # Embed inputs and targets.
-    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
-    k = 2**hparams.num_compress_steps
-    inputs_orig, targets_orig = common_layers.pad_to_same_length(
-        inputs_orig, targets_orig, final_length_divisible_by=k)
-    inputs = common_layers.embedding(
-        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
-    targets = common_layers.embedding(
-        targets_orig, hparams.vocab_size, hparams.hidden_size,
-        "embed", reuse=True)
-
-    # Split the batch into input-input and target-target parts.
-    inputs1, _ = split_on_batch(inputs)
-    _, targets2 = split_on_batch(targets)
-
-    # Input-input part.
-    inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress(
-        inputs1, None, hparams, "inp2hyp", "hyp2inp")
-    inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3)
-
-    # Target-target part.
-    tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress(
-        targets2, None, hparams, "tgt2hyp", "hyp2tgt")
-    tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3)
-
-    # Reconstruction losses.
-    inp1_orig, _ = split_on_batch(inputs_orig)
-    _, tgt2_orig = split_on_batch(targets_orig)
-    inp1_loss = reconstruct_loss(
-        inp1_back, tf.squeeze(inp1_orig, axis=3), hparams)
-    tgt2_loss = reconstruct_loss(
-        tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True)
-
-    # Discriminator loss.
-    dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss")
-
-    # Reconstruct targets from inputs.
-    tgt, _, _, _ = transformer_vae.vae_compress(
-        inputs, None, hparams, "inp2hyp", "hyp2tgt", reuse=True)
-    tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True)
-    # We use the reconstruction only for tracking progress, no gradients here!
-    tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))
-
-    kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps)
-    losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss,
-              "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
-              "input_kl": kl_loss1 * kl_rev_decay * 15.0,
-              "target_kl": kl_loss2 * kl_rev_decay * 15.0,
-              "discriminator": dloss}
-    return tgt, losses
-
-
-@registry.register_model
-class CycleVaeGAN(t2t_model.T2TModel):
-
-  def model_fn_body(self, features):
-    return cycle_vae_gan_internal(
-        features["inputs"], features["targets"], features["target_space_id"],
-        self._hparams)
-
-
 @registry.register_hparams
 def cycle_gan_small():
   """Set of hyperparameters."""
-  hparams = transformer_vae.transformer_vae_small()
+  hparams = transformer_vae.transformer_ae_small()
   hparams.batch_size = 2048
   hparams.input_modalities = "inputs:symbol:identity"
   hparams.target_modality = "symbol:identity"
diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py
index 1c566e996..025f8d631 100644
--- a/tensor2tensor/models/transformer_vae.py
+++ b/tensor2tensor/models/transformer_vae.py
@@ -253,18 +253,25 @@ def ae_decompress(z, ae, x, is_2d, hparams, name, reuse=None):
 
   # Decompress.
   d = z
+  k = (3, 3) if is_2d else (3, 1)
   for i in xrange(hparams.num_compress_steps):
     j = hparams.num_compress_steps - i - 1
-    d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j)
+    d = residual_conv(d, 1, k, hparams, "decompress_rc_%d" % j)
     d = decompress_step(d, None, hparams, i > 0, is_2d, "decompress_%d" % j)
 
-  k = 2**hparams.num_compress_steps
-  z_batch = tf.reshape(z, [-1, 1, 1, hparams.hidden_size])
-  x_batch = tf.reshape(x, [-1, k, 1, hparams.hidden_size])
-  d_batch = tf.reshape(d, [-1, k, 1, hparams.hidden_size])
-  dec_batch = decode(z_batch, d_batch, x_batch, None, None, hparams)
-  z = tf.reshape(dec_batch, [-1, tf.shape(x)[1], 1, hparams.hidden_size])
-
+  # Autoregressive part.
+  if not is_2d:  # Currently we don't do it autoregressively for 2d problems.
+    k = 2**(hparams.num_compress_steps * (2 if is_2d else 1))
+    z_batch = tf.reshape(z, [-1, 1, 1, hparams.hidden_size])
+    x_batch = tf.reshape(x, [-1, k, 1, hparams.hidden_size])
+    d_batch = tf.reshape(d, [-1, k, 1, hparams.hidden_size])
+    dec_batch = decode(z_batch, d_batch, x_batch, None, None, hparams)
+  else:  # For non-autoregressive.
+    dec_batch = d
+  z = tf.reshape(dec_batch, [-1, tf.shape(x)[1], tf.shape(x)[2],
+                             hparams.hidden_size])
+  if is_2d:
+    z = tf.layers.dense(z, hparams.hidden_size * 3)
   return z
 
 
@@ -286,11 +293,14 @@ def ae_transformer_internal(inputs, targets, target_space, hparams):
     inputs, ed = encode(inputs, target_space, hparams, "input_enc")
 
     # Compress and ae.
-    ae, hot, kl = ae_compress(targets, False, hparams, "ae")
+    ae, hot, kl = ae_compress(targets, hparams.is_2d, hparams, "ae")
+    tf.summary.histogram("hot", tf.reshape(tf.argmax(hot, axis=-1), [-1]))
     emb = ae_embed(hot, hparams, "ae", reuse=True)
 
     # Compress context and run autoregressive decoder on emb-hot.
-    dec_c = decode(None, None, emb, inputs, ed, hparams)
+    emb_flat = tf.expand_dims(common_layers.flatten4d3d(emb), axis=2)
+    dec_c = decode(None, None, emb_flat, inputs, ed, hparams)
+    dec_c = tf.reshape(dec_c, tf.shape(emb))
     c_z = tf.layers.dense(dec_c, hparams.v_size, name="mask_context")
     reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=hot, logits=c_z)
@@ -299,8 +309,8 @@ def ae_transformer_internal(inputs, targets, target_space, hparams):
       hot = tf.one_hot(tf.argmax(c_z, axis=-1), hparams.v_size)
 
     # Decompress, pass for ae loss.
-    z = ae_decompress(emb, ae, targets, False, hparams, "ae")
-    kl *= common_layers.inverse_exp_decay(int(hparams.startup_steps * 0.5))
+    z = ae_decompress(emb, ae, targets, hparams.is_2d, hparams, "ae")
+    kl *= common_layers.inverse_exp_decay(int(hparams.startup_steps * 0.8))
     reconstruct_loss *= common_layers.inverse_exp_decay(hparams.startup_steps)
     losses = {"kl": kl, "reconstruction": reconstruct_loss}
     return z, losses
@@ -365,6 +375,18 @@ def transformer_ae_small():
   hparams.add_hparam("startup_steps", 30000)
   hparams.add_hparam("kmeans_lr_factor", 0.002)
   hparams.add_hparam("z_dropout", 0.1)
+  hparams.add_hparam("is_2d", 0)
+  return hparams
+
+
+@registry.register_hparams
+def transformer_ae_cifar():
+  hparams = transformer_ae_small()
+  hparams.batch_size = 1024 * 16
+  hparams.num_compress_steps = 2
+  hparams.v_size = 1024 * 16
+  hparams.startup_steps = 120000
+  hparams.is_2d = 1
   return hparams
 

From c7636a372e5575c040cfcb8a574bd0b0387da53e Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 31 Aug 2017 14:46:43 -0700
Subject: [PATCH 04/32] Transformer hparams fall back on `num_hidden_layers`

PiperOrigin-RevId: 167194460
---
 tensor2tensor/models/transformer.py      | 17 +++++++++-------
 tensor2tensor/models/transformer_test.py | 25 ------------------------
 2 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index 105d9eb32..41bfa5b7f 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -190,7 +190,8 @@ def transformer_encoder(encoder_input,
   """
   x = encoder_input
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_encoder_layers):
+    for layer in xrange(
+        hparams.num_encoder_layers or hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope("self_attention"):
           y = common_attention.multihead_attention(
@@ -233,7 +234,8 @@ def transformer_decoder(decoder_input,
   """
   x = decoder_input
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_decoder_layers):
+    for layer in xrange(
+        hparams.num_decoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
@@ -323,11 +325,12 @@ def transformer_base():
   hparams.label_smoothing = 0.1
   hparams.shared_embedding_and_softmax_weights = int(True)
 
-  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
-  # layer-related flags
-  hparams.add_hparam("num_encoder_layers", hparams.num_hidden_layers)
-  hparams.add_hparam("num_decoder_layers", hparams.num_hidden_layers)
-  # attention-related flags
+  # Add new ones like this.
+  hparams.add_hparam("filter_size", 2048)
+  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
+  hparams.add_hparam("num_encoder_layers", 0)
+  hparams.add_hparam("num_decoder_layers", 0)
+  # Attention-related flags.
   hparams.add_hparam("num_heads", 8)
   hparams.add_hparam("attention_key_channels", 0)
   hparams.add_hparam("attention_value_channels", 0)
diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py
index 391824524..6c0eee203 100644
--- a/tensor2tensor/models/transformer_test.py
+++ b/tensor2tensor/models/transformer_test.py
@@ -64,31 +64,6 @@ def testTransformer(self):
       res = session.run(logits)
     self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
 
-  def testBeamDecodeVsGreedy(self):
-    model, features = self.getModel()
-
-    decode_length = 20
-
-    greedy_result, _, _ = model._greedy_infer(
-        features, decode_length, last_position_only=True)
-    greedy_result = tf.squeeze(greedy_result, axis=[2, 3])
-
-    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
-      beam_res = model._beam_decode(
-          features,
-          decode_length,
-          beam_size=1,
-          top_beams=1,
-          last_position_only=True,
-          alpha=1.0)
-
-    with self.test_session() as session:
-      session.run(tf.global_variables_initializer())
-      greedy_res, beam_res = session.run([greedy_result, beam_res])
-
-    self.assertEqual(beam_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
-    self.assertAllClose(greedy_res, beam_res)
-
 
 if __name__ == "__main__":
   tf.test.main()

From a317801dd7594b8b60a846e974d31ed426a8eeba Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 31 Aug 2017 15:13:22 -0700
Subject: [PATCH 05/32] Speed up Transformer using PadRemover

PiperOrigin-RevId: 167198565
---
 tensor2tensor/layers/common_attention.py | 16 +++++++++++++
 tensor2tensor/models/transformer.py      | 29 ++++++++++++++++++++----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 975ed94ae..7ed7799d0 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -215,6 +215,22 @@ def attention_bias_ignore_padding(memory_padding):
   return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1)
 
 
+def attention_bias_to_padding(attention_bias):
+  """Inverse of attention_bias_ignore_padding().
+
+  Args:
+    attention_bias: a `Tensor` with shape [batch, 1, 1, memory_length], as
+      returned by attention_bias_ignore_padding().
+
+  Returns:
+    a Tensor with shape [batch, memory_length] with 1.0 in padding positions
+    and 0.0 in non-padding positions.
+  """
+  # `attention_bias` is a large negative number in padding positions and 0.0
+  # elsewhere.
+  return tf.squeeze(tf.to_float(tf.less(attention_bias, -1)), axis=[1, 2])
+
+
 def attention_bias_prepend_inputs_full_attention(padding):
   """Create a bias tensor for prepend_mode="prepend_inputs_full_attention".
diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index 41bfa5b7f..86b920dc5 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -30,6 +30,7 @@
 from tensor2tensor.layers import common_attention
 from tensor2tensor.layers import common_hparams
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import expert_utils
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
@@ -50,8 +51,8 @@ def model_fn_body(self, features):
     targets = common_layers.flatten4d3d(targets)
 
     (encoder_input, encoder_self_attention_bias,
-     encoder_decoder_attention_bias) = (transformer_prepare_encoder(
-         inputs, target_space, hparams))
+     encoder_decoder_attention_bias) = transformer_prepare_encoder(
+         inputs, target_space, hparams)
 
     (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder(
         targets, hparams)
@@ -202,8 +203,11 @@ def transformer_encoder(encoder_input,
               hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
           x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
+          pad_remover = expert_utils.PadRemover(
+              common_attention.attention_bias_to_padding(
+                  encoder_self_attention_bias))
           y = transformer_ffn_layer(
-              common_layers.layer_preprocess(x, hparams), hparams)
+              common_layers.layer_preprocess(x, hparams), hparams, pad_remover)
           x = common_layers.layer_postprocess(x, y, hparams)
   # if normalization is done in layer_preprocess, then it should also be done
   # on the output, since the output can grow very large, being the sum of
@@ -265,22 +269,37 @@ def transformer_decoder(decoder_input,
   return common_layers.layer_preprocess(x, hparams)
 
 
-def transformer_ffn_layer(x, hparams):
+def transformer_ffn_layer(x, hparams, pad_remover=None):
   """Feed-forward layer in the transformer.
 
   Args:
     x: a Tensor of shape [batch_size, length, hparams.hidden_size]
     hparams: hyperparameters for model
+    pad_remover: an expert_utils.PadRemover object tracking the padding
+      positions. If provided, when using convolutional settings, the padding
+      is removed before applying the convolution, and restored afterward. This
+      can give a significant speedup.
 
   Returns:
     a Tensor of shape [batch_size, length, hparams.hidden_size]
   """
   if hparams.ffn_layer == "conv_hidden_relu":
-    return common_layers.conv_hidden_relu(
+    # In simple convolution mode, use `pad_remover` to speed up processing.
+    if pad_remover:
+      original_shape = tf.shape(x)
+      # Collapse `x` across examples, and remove padding positions.
+      x = tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0))
+      x = tf.expand_dims(pad_remover.remove(x), axis=0)
+    conv_output = common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout)
+    if pad_remover:
+      # Restore `conv_output` to the original shape of `x`, including padding.
+      conv_output = tf.reshape(
+          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
+    return conv_output
   elif hparams.ffn_layer == "parameter_attention":
     return common_attention.parameter_attention(
         x, hparams.parameter_attention_key_channels or hparams.hidden_size,

From 1a9bdacf2fc4f87faa4da74908487a626e06c2db Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Fri, 1 Sep 2017 12:55:56 -0700
Subject: [PATCH 06/32] Bug fix and better documentation for normalizer_fn.

PiperOrigin-RevId: 167312851
---
 tensor2tensor/layers/common_layers.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index 4b09e70cb..264c11cf6 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -628,11 +628,22 @@ def conv_block_internal(conv_fn,
   Returns:
     a Tensor.
   """
+  name = kwargs.pop("name") if "name" in kwargs else None
   mask = kwargs.pop("mask") if "mask" in kwargs else None
-  norm = kwargs.pop("normalizer_fn") if "normalizer_fn" in kwargs else None
-  if norm is None and "normalizer_fn" not in kwargs:
+
+  # Usage for normalizer_fn kwarg:
+  # if not specified, use layer norm
+  # if given normalizer_fn=None, don't use any normalization
+  # if given normalizer_fn=norm, use the specified norm function
+
+  use_layer_norm = "normalizer_fn" not in kwargs
+  norm = kwargs.pop("normalizer_fn", None)
+  use_normalizer_fn = use_layer_norm or norm
+
+  if use_layer_norm:
     norm = lambda x, name: layer_norm(x, filters, name=name)
+
   with tf.variable_scope(name, "conv_block", [inputs]):
     cur, counter = inputs, -1
     for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
@@ -660,7 +671,7 @@ def conv_block_internal(conv_fn,
             name="conv_block_%d" % counter,
             use_bias=norm is None,
             **kwargs)
-        if norm is not None:
+        if use_normalizer_fn:
           cur = norm(cur, name="conv_block_norm_%d" % counter)
   return cur
 

From 956e767af673be6292e9b2d06e5ce15688ba76d9 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Fri, 1 Sep 2017 12:59:51 -0700
Subject: [PATCH 07/32] Use new dynamic window size group_by_window
 functionality in an OSS-compatible way

PiperOrigin-RevId: 167313309
---
 tensor2tensor/utils/data_reader.py | 41 +++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py
index d55911f19..09ef159a4 100644
--- a/tensor2tensor/utils/data_reader.py
+++ b/tensor2tensor/utils/data_reader.py
@@ -267,12 +267,14 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams,
     dataset = dataset.filter(
         lambda ex: _example_too_big(ex, batching_scheme["max_length"]))
 
-    dataset = bucket_by_sequence_length(dataset, _example_length,
-                                        batching_scheme["boundaries"],
-                                        batching_scheme["batch_sizes"],
-                                        batching_scheme["window_size"])
+    dataset = bucket_by_sequence_length(
+        dataset, _example_length, batching_scheme["boundaries"],
+        batching_scheme["batch_sizes"], batching_scheme["window_size"])
     # We reshuffle the batches to prevent many long-sequence batches at once.
-    if batching_scheme["shuffle_queue_size"] is not None:
+    # TODO(rsepassi): Rm hasattr call once new dynamic window size functionality
+    # is in a stable TF release.
+    if (batching_scheme["shuffle_queue_size"] is not None and
+        not hasattr(dataset, "apply")):
       dataset = dataset.shuffle(batching_scheme["shuffle_queue_size"])
     batched_examples = dataset.make_one_shot_iterator().get_next()
   return batched_examples
@@ -338,6 +340,12 @@ def example_to_bucket_id(example):
 
     return bucket_id
 
+  def window_size_fn(bucket_id):
+    # window size = batch size
+    batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
+    window_size = batch_sizes[bucket_id]
+    return window_size
+
   def batching_fn(bucket_id, grouped_dataset):
     batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
     batch_size = batch_sizes[bucket_id]
@@ -348,8 +356,16 @@ def batching_fn(bucket_id, grouped_dataset):
         for name, shape in grouped_dataset.output_shapes.items()])
     return grouped_dataset.padded_batch(batch_size, padded_shapes)
 
-  dataset = dataset.group_by_window(example_to_bucket_id, batching_fn,
-                                    window_size)
+  # TODO(rsepassi): Rm branch once the new group_by_window functionality is in
+  # a stable TF release.
+  if hasattr(dataset, "apply"):
+    # If the Dataset supports dynamic window size, use it.
+    dataset = dataset.apply(
+        tf.contrib.data.group_by_window,
+        args=(example_to_bucket_id, batching_fn, None, window_size_fn))
+  else:
+    dataset = dataset.group_by_window(example_to_bucket_id, batching_fn,
+                                      window_size)
   return dataset
 
@@ -398,8 +414,8 @@ def _batching_scheme(batch_size,
       * max_length: int, maximum length of an example
   """
   max_length = max_length or batch_size
-  boundaries = _bucket_boundaries(
-      max_length, min_length_bucket, length_bucket_step)
+  boundaries = _bucket_boundaries(max_length, min_length_bucket,
+                                  length_bucket_step)
   boundaries = [boundary * length_multiplier for boundary in boundaries]
   max_length *= length_multiplier
   batch_sizes = [
@@ -417,9 +433,10 @@ def _batching_scheme(batch_size,
       83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
       720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
       7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
-      36756720, 43243200, 61261200, 73513440, 110270160]
-  window_size = max([
-      i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+      36756720, 43243200, 61261200, 73513440, 110270160
+  ]
+  window_size = max(
+      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
   divisors = [i for i in xrange(1, window_size + 1) if window_size % i == 0]
   batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
   window_size *= shard_multiplier

From f76ea08833639613287b2c46fa079bf5ef88207e Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Fri, 1 Sep 2017 13:12:17 -0700
Subject: [PATCH 08/32] Fixed typo.

PiperOrigin-RevId: 167314859
---
 tensor2tensor/utils/yellowfin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py
index 450875fa5..a3f6a18a1 100644
--- a/tensor2tensor/utils/yellowfin.py
+++ b/tensor2tensor/utils/yellowfin.py
@@ -602,7 +602,7 @@ def minimize(self,
     Raises:
       ValueError: if no gradients are provided for any variable.
""" - grads_and_vars = self._optimizer.compute_gradients( + grads_and_vars = self._momentum_optimizer.compute_gradients( loss, var_list=var_list, gate_gradients=gate_gradients, From 0f3d76cc266c6a96f8093cd2ddca6bfc6f3cd721 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 4 Sep 2017 04:38:00 -0700 Subject: [PATCH 09/32] Enable easy dataset construction from Problems with Problem.dataset PiperOrigin-RevId: 167485065 --- .../data_generators/gene_expression.py | 5 +- tensor2tensor/data_generators/ice_parsing.py | 2 +- tensor2tensor/data_generators/image.py | 14 +- tensor2tensor/data_generators/imdb.py | 2 +- tensor2tensor/data_generators/problem.py | 176 +++++++++++++++++- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/multimodel_test.py | 2 +- tensor2tensor/models/slicenet_test.py | 2 +- tensor2tensor/problems.py | 36 ++++ tensor2tensor/problems_test.py | 60 ++++++ tensor2tensor/utils/data_reader.py | 2 + tensor2tensor/utils/trainer_utils.py | 2 +- 12 files changed, 284 insertions(+), 21 deletions(-) create mode 100644 tensor2tensor/problems.py create mode 100644 tensor2tensor/problems_test.py diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 0607aad15..43d5a6702 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -142,7 +142,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Shuffle generator_utils.shuffle_dataset(all_filepaths) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} @@ -159,9 +159,8 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): del mode - del hparams # Reshape targets to contain num_output_predictions per output timestep examples["targets"] = tf.reshape(examples["targets"], diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index 4fb0424bb..2aa261cd4 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -109,7 +109,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.targeted_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index fbe91d70e..03cea1d02 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -105,7 +105,7 @@ def resize(img, size): examples["targets"] = resize(inputs, 32) return examples - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults p.input_modality = {"inputs": ("image:identity_no_pad", None)} p.target_modality = ("image:identity_no_pad", None) @@ -229,7 +229,7 @@ def feature_encoders(self, data_dir): "targets": text_encoder.SubwordTextEncoder(vocab_filename) } - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = 
defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} vocab_size = self._encoders["targets"].vocab_size @@ -267,7 +267,7 @@ def dev_shards(self): def generator(self, data_dir, tmp_dir, is_training): raise NotImplementedError() - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults small_modality = "%s:small_image_modality" % registry.Modalities.IMAGE modality = small_modality if self.is_small else registry.Modalities.IMAGE @@ -349,7 +349,7 @@ def is_small(self): def num_classes(self): return 1000 - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): # Just resize with area. if self._was_reversed: examples["inputs"] = tf.to_int64( @@ -565,7 +565,7 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): @registry.register_problem class ImageCifar10Tune(ImageMnistTune): - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): if mode == tf.contrib.learn.ModeKeys.TRAIN: examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) @@ -591,7 +591,7 @@ def generator(self, data_dir, tmp_dir, is_training): @registry.register_problem class ImageCifar10Plain(ImageCifar10): - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): return examples @@ -730,7 +730,7 @@ def feature_encoders(self, data_dir): encoder = text_encoder.SubwordTextEncoder(vocab_filename) return {"targets": encoder} - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} encoder = self._encoders["targets"] diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 281a03bee..4216747c4 100644 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -97,7 +97,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.generator(data_dir, tmp_dir, True), train_paths, self.generator(data_dir, tmp_dir, False), dev_paths) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = { diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index e4424e73e..d0ed6ad2c 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -18,10 +18,14 @@ from __future__ import division from __future__ import print_function +import collections import os +import random # Dependency imports +import six + from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import metrics @@ -30,6 +34,7 @@ import tensorflow as tf + class SpaceID(object): """Input and target space ids. 
Add more as needed.""" # Generic / unknown output space (default) @@ -92,6 +97,14 @@ class SpaceID(object): CPP_TOK = 28 +def default_model_hparams(): + return tf.contrib.training.HParams( + max_input_seq_length=0, + max_target_seq_length=0, + prepend_mode="none", + data_dir=None) + + def preprocess_examples_common(examples, hparams): """Preprocessing steps common to all models.""" if hparams.max_input_seq_length > 0: @@ -232,14 +245,23 @@ def __init__(self, was_reversed=False, was_copy=False): self._was_reversed = was_reversed self._was_copy = was_copy self._encoders = None + self._hparams = None + self._feature_info = None - def internal_build_encoders(self, data_dir): - self._encoders = self.feature_encoders(data_dir) + def get_feature_encoders(self, data_dir=None): + if self._encoders is None: + self._encoders = self.feature_encoders(data_dir) + return self._encoders - def internal_hparams(self, model_hparams): + def get_hparams(self, model_hparams=None): """Returns problem_hparams.""" + if self._hparams is not None: + return self._hparams + + assert model_hparams is not None + if self._encoders is None: - self.internal_build_encoders(model_hparams.data_dir) + self.get_feature_encoders(model_hparams.data_dir) hp = _default_hparams() ret = self.hparams(hp, model_hparams) @@ -255,7 +277,9 @@ def internal_hparams(self, model_hparams): _reverse_problem_hparams(hp) if self._was_copy: _copy_problem_hparams(hp) - return hp + + self._hparams = hp + return self._hparams def maybe_reverse_features(self, feature_map): if not self._was_reversed: @@ -268,6 +292,148 @@ def maybe_copy_features(self, feature_map): return feature_map["targets"] = feature_map["inputs"] + def dataset(self, + mode, + data_dir=None, + num_threads=None, + output_buffer_size=None, + shuffle_files=None, + hparams=None): + """Build a Dataset for this problem. + + Args: + mode: tf.estimator.ModeKeys; determines which files to read from. + data_dir: directory that contains data files. + num_threads: int, number of threads to use for decode and preprocess + Dataset.map calls. + output_buffer_size: int, how many elements to prefetch in Dataset.map + calls. + shuffle_files: whether to shuffle input files. Default behavior (i.e. when + shuffle_files=None) is to shuffle if mode == TRAIN. + hparams: tf.contrib.training.HParams; hparams to be passed to + Problem.preprocess_examples and Problem.hparams. If None, will use a + default set that is a no-op. + + Returns: + Dataset containing dict. 
+    """
+    assert data_dir
+
+    if hparams is None:
+      hparams = default_model_hparams()
+
+    if not hasattr(hparams, "data_dir"):
+      hparams.add_hparam("data_dir", data_dir)
+    if not hparams.data_dir:
+      hparams.data_dir = data_dir
+    # Construct the Problem's hparams so that items within it are accessible
+    _ = self.get_hparams(hparams)
+
+    base_filename = self.dataset_filename()
+    path = os.path.join(data_dir, base_filename)
+
+    # TODO(rsepassi): handle ModeKeys.PREDICT with placeholders
+    is_training = mode == tf.estimator.ModeKeys.TRAIN
+    if is_training:
+      suffix = "train"
+    elif mode == tf.estimator.ModeKeys.EVAL:
+      suffix = "dev"
+    else:
+      assert mode == "test"
+      suffix = "test"
+
+    filepattern = "%s-%s*" % (path, suffix)
+    data_fields, data_items_to_decoders = self.example_reading_spec()
+    if data_items_to_decoders is None:
+      data_items_to_decoders = {
+          field: tf.contrib.slim.tfexample_decoder.Tensor(field)
+          for field in data_fields
+      }
+
+    data_files = tf.contrib.slim.parallel_reader.get_data_files(filepattern)
+    if shuffle_files or shuffle_files is None and is_training:
+      random.shuffle(data_files)
+    dataset = tf.contrib.data.TFRecordDataset(data_files)
+
+    def decode_record(record):
+      """Serialized Example to dict of <feature name, Tensor>."""
+      decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder(
+          data_fields, data_items_to_decoders)
+
+      decode_items = list(data_items_to_decoders)
+      decoded = decoder.decode(record, items=decode_items)
+      return dict(zip(decode_items, decoded))
+
+    def preprocess(example):
+      example = self.preprocess_examples(example, mode, hparams)
+      self.maybe_reverse_features(example)
+      self.maybe_copy_features(example)
+      return example
+
+    dataset = dataset.map(decode_record, num_threads=num_threads)
+    dataset = dataset.map(
+        preprocess,
+        num_threads=num_threads,
+        output_buffer_size=output_buffer_size)
+
+    return dataset
+
+  @property
+  def feature_info(self):
+    """Retrieve dict <feature name, FeatureInfo>.
+
+    Must first call Problem.get_hparams or Problem.dataset to have the problem's
+    internal hparams already constructed.
+
+    Returns:
+      dict<feature name, FeatureInfo>
+    """
+    if self._feature_info is not None:
+      return self._feature_info
+
+    assert self._hparams is not None
+
+    hp = self.get_hparams()
+    input_mods = hp.input_modality
+    target_mod = hp.target_modality
+    vocabs = hp.vocabulary
+    in_id = hp.input_space_id
+    out_id = hp.target_space_id
+
+    features = collections.defaultdict(FeatureInfo)
+
+    for name, mod_spec in six.iteritems(input_mods):
+      mod, vocab_size = mod_spec
+      finfo = features[name]
+      finfo.modality = mod
+      finfo.vocab_size = vocab_size
+
+    mod, vocab_size = target_mod
+    features["targets"].modality = mod
+    features["targets"].vocab_size = vocab_size
+
+    for name, encoder in six.iteritems(vocabs):
+      features[name].encoder = encoder
+
+    features["inputs"].space_id = in_id
+    features["targets"].space_id = out_id
+
+    self._feature_info = features
+    return features
+
+
+class FeatureInfo(object):
+
+  def __init__(self,
+               encoder=None,
+               modality=None,
+               vocab_size=None,
+               space_id=None):
+    self.encoder = encoder
+    self.modality = modality
+    self.vocab_size = vocab_size
+    self.space_id = space_id
+
 
 def _copy_problem_hparams(p_hparams):
   """Use input modality, vocab, and space id for target."""
diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py
index cc4cd1200..e46e81859 100644
--- a/tensor2tensor/models/gene_expression_test.py
+++ b/tensor2tensor/models/gene_expression_test.py
@@ -70,7 +70,7 @@ def testGeneExpressionModels(self):
                       gene_expression_conv_test())]
     for model_cls, hparams in models_hparams:
       hparams.add_hparam("data_dir", None)
-      p_hparams = gene_data.GenomicsExpressionCage10().internal_hparams(hparams)
+      p_hparams = gene_data.GenomicsExpressionCage10().get_hparams(hparams)
       hparams.problems = [p_hparams]
       self._testModel(hparams, model_cls)
 
diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py
index 73a8436cc..ab60bae97 100644
--- a/tensor2tensor/models/multimodel_test.py
+++ b/tensor2tensor/models/multimodel_test.py
@@ -38,7 +38,7 @@ def testMultiModel(self):
     hparams = multimodel.multimodel_tiny()
     hparams.add_hparam("data_dir", "")
     problem = registry.problem("image_cifar10")
-    p_hparams = problem.internal_hparams(hparams)
+    p_hparams = problem.get_hparams(hparams)
     hparams.problems = [p_hparams]
     with self.test_session() as session:
       features = {
diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py
index 388acde1b..c3a064a85 100644
--- a/tensor2tensor/models/slicenet_test.py
+++ b/tensor2tensor/models/slicenet_test.py
@@ -39,7 +39,7 @@ def testSliceNet(self):
     hparams = slicenet.slicenet_params1_tiny()
     hparams.add_hparam("data_dir", "")
     problem = registry.problem("image_cifar10")
-    p_hparams = problem.internal_hparams(hparams)
+    p_hparams = problem.get_hparams(hparams)
     hparams.problems = [p_hparams]
     with self.test_session() as session:
       features = {
diff --git a/tensor2tensor/problems.py b/tensor2tensor/problems.py
new file mode 100644
index 000000000..1e94c7bad
--- /dev/null
+++ b/tensor2tensor/problems.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Access T2T Problems. + +See problems_test.py for basic usage. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import +from tensor2tensor.utils import registry + + +def problem(name): + return registry.problem(name) + + +def available(): + return sorted(registry.list_problems()) diff --git a/tensor2tensor/problems_test.py b/tensor2tensor/problems_test.py new file mode 100644 index 000000000..de101e6e7 --- /dev/null +++ b/tensor2tensor/problems_test.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""tensor2tensor.problems test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor import problems + +import tensorflow as tf + +MODES = tf.estimator.ModeKeys + + +class ProblemsTest(tf.test.TestCase): + + def testBuildDataset(self): + # See all the available problems + self.assertTrue(len(problems.available()) > 10) + + # Retrieve a problem by name + problem = problems.problem("translate_ende_wmt8k") + + # Access train and dev datasets through Problem + train_dataset = problem.dataset(MODES.TRAIN) + dev_dataset = problem.dataset(MODES.EVAL) + + # Access vocab size and other info (e.g. the data encoders used to + # encode/decode data for the feature, used below) through feature_info. + feature_info = problem.feature_info + self.assertTrue(feature_info["inputs"].vocab_size > 0) + self.assertTrue(feature_info["targets"].vocab_size > 0) + + train_example = train_dataset.make_one_shot_iterator().get_next() + dev_example = dev_dataset.make_one_shot_iterator().get_next() + + with tf.Session() as sess: + train_ex_val, _ = sess.run([train_example, dev_example]) + _ = feature_info["inputs"].encoder.decode(train_ex_val["inputs"]) + _ = feature_info["targets"].encoder.decode(train_ex_val["targets"]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 09ef159a4..681f3598b 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -260,6 +260,8 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, num_threads = 4 if is_training else 1 with tf.name_scope("input_pipeline"): + # TODO(rsepassi): Once all problems use the Problem class, rm example + # reading, parsing, and preprocessing. 
Use Problem.dataset instead. dataset = read_examples(problem, data_file_pattern, capacity, mode=mode) dataset = dataset.map( lambda ex: _preprocess(ex, problem, data_file_pattern, hparams, mode), diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index a747b9a09..8539f4eb1 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -237,7 +237,7 @@ def add_problem_hparams(hparams, problems): if problem is None: p_hparams = problem_hparams.problem_hparams(problem_name, hparams) else: - p_hparams = problem.internal_hparams(hparams) + p_hparams = problem.get_hparams(hparams) hparams.problem_instances.append(problem) hparams.problems.append(p_hparams) From 636d2e1fd089290f28eaa45f2476cc00ce67d7a4 Mon Sep 17 00:00:00 2001 From: Katherine Lee Date: Mon, 4 Sep 2017 14:44:19 -0700 Subject: [PATCH 10/32] Add strokes SpaceID. PiperOrigin-RevId: 167518694 --- tensor2tensor/data_generators/all_problems.py | 1 - tensor2tensor/data_generators/problem.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index f9afa895b..52354704d 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -45,4 +45,3 @@ pass # pylint: enable=g-import-not-at-top # pylint: enable=unused-import - diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index d0ed6ad2c..302c51fa7 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -95,6 +95,8 @@ class SpaceID(object): PY_TOK = 27 # C++ CPP_TOK = 28 + # Strokes + STROKES = 29 def default_model_hparams(): From c25325be184bd555a1b0df0af021699996435f79 Mon Sep 17 00:00:00 2001 From: Katherine Lee Date: Mon, 4 Sep 2017 15:36:23 -0700 Subject: [PATCH 11/32] Merge from GitHub PiperOrigin-RevId: 167520632 --- docs/new_problem.md | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/new_problem.md b/docs/new_problem.md index c859c6eba..d581a3a1b 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -17,13 +17,15 @@ For each problem we want to tackle we create a new problem class and register it Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`). -For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `TranslateProblem`. Let's also register it while we're at it so we can specify the problem through flags. +For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `Text2TextProblem`. Let's also register it while we're at it so we can specify the problem through flags. ```python -@registry.register_problem() +@registry.register_problem class Word2def(problem.Text2TextProblem): """Problem spec for English word to dictionary definition.""" - return NotImplementedError() + @property + def is_character_level(self): + ... ``` We need to implement the following methods from `Text2TextProblem` in our new class: @@ -56,6 +58,8 @@ The number of shards to break data files into. 
 @registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
+
+  @property
   def is_character_level(self):
     return True
 
@@ -87,7 +91,6 @@ We're almost done. `generator` generates the training and evaluation data and st
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 ```
 
@@ -108,7 +111,6 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -137,14 +139,13 @@ I've gone ahead and split all words into a train and test set and saved them in
 ```python
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
-  [
-    "LOCATION_OF_DATA/", ("words_train.txt", "definitions_train.txt")
-  ]
+  LOCATION_OF_DATA + 'words_train.txt',
+  LOCATION_OF_DATA + 'definitions_train.txt'
 ]
+
 _WORD2DEF_TEST_DATASETS = [
-  [
-    "LOCATION_OF_DATA", ("words_test.txt", "definitions_test.txt")
-  ]
+  LOCATION_OF_DATA + 'words_test.txt',
+  LOCATION_OF_DATA + 'definitions_test.txt'
 ]
 ```
 
@@ -155,24 +156,14 @@ Now our `word2def.py` file looks like: (with the correct imports)
 """ Problem definition for word to dictionary definition. """
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import os
-import tarfile # do we need this import
 
-from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.wmt import character_generator
 from tensor2tensor.utils import registry
 
-import tensorflow as tf
-
-FLAGS = tf.flags.FLAGS
-
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
     LOCATION_OF_DATA+'words_train.txt',
@@ -198,7 +189,6 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -220,7 +210,17 @@ class Word2def(problem.Text2TextProblem):
 ```
 
 # Hyperparameters
-All hyperparamters inherit from `_default_hparams()` in `problem.py.` If you would like to customize your hyperparameters, add another method to the file `problem_hparams.py`.
+All hyperparameters inherit from `_default_hparams()` in `problem.py`. If you would like to customize your hyperparameters, register a new hyperparameter set in `word2def.py` like the example provided in the walkthrough. For example:
+
+```python
+from tensor2tensor.models import transformer
+
+@registry.register_hparams
+def word2def_hparams():
+  hparams = transformer.transformer_base_single_gpu()  # Or whatever you'd like to build off.
+  hparams.batch_size = 1024
+  return hparams
+```
 
 # Run the problem
 Now that we've gotten our problem set up, let's train a model and generate definitions.
 
 We specify our problem name, the model, and hparams.
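For reference, the lookup behind the `PROBLEM` flag can be sketched directly: `@registry.register_problem` keys a class under its snake_cased class name, so `Word2def` is retrievable as `word2def`. A minimal sketch, assuming `word2def.py` has been imported somewhere so its decorators have run:

```python
# Sketch: how PROBLEM=word2def is resolved through the registry.
# Assumes word2def.py is imported so @registry.register_problem has executed.
from tensor2tensor.utils import registry

problem = registry.problem("word2def")  # an instance of Word2def
print(type(problem).__name__)           # => "Word2def"
```

With that registration in place, the flags are: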
```bash PROBLEM=word2def MODEL=transformer -HPARAMS=transofmer_base_single_gpu +HPARAMS=word2def_hparams ``` The rest of the steps are as given in the [walkthrough](walkthrough.md). From 0de6f8c53204ebbce4cdabaaa32182d69571ad6c Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 5 Sep 2017 14:05:23 -0700 Subject: [PATCH 12/32] Save metadata (flags, hparams) on train PiperOrigin-RevId: 167628142 --- tensor2tensor/utils/trainer_utils.py | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 8539f4eb1..ee3445e26 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function +import os import sys # Dependency imports @@ -147,6 +148,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, """Create Experiment.""" hparams = create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir, passed_hparams=FLAGS.hparams) + if FLAGS.worker_id == 0: + save_metadata(output_dir, hparams) estimator, input_fns = create_experiment_components( hparams=hparams, output_dir=output_dir, @@ -245,6 +248,37 @@ def add_problem_hparams(hparams, problems): return hparams +def save_metadata(output_dir, hparams): + """Saves FLAGS and hparams to output_dir.""" + # Save FLAGS in txt file + if hasattr(FLAGS, "flags_into_string"): + flags_str = FLAGS.flags_into_string() + t2t_flags_str = "\n".join([ + "--%s=%s" % (f.name, f.value) + for f in FLAGS.flags_by_module_dict()[ + "tensor2tensor.utils.trainer_utils"] + ]) + else: + flags_dict = FLAGS.__dict__["__flags"] + flags_str = "\n".join( + ["--%s=%s" % (name, str(f.value)) for (name, f) in flags_dict.items()]) + t2t_flags_str = None + + flags_txt = os.path.join(output_dir, "flags.txt") + with tf.gfile.Open(flags_txt, "w") as f: + f.write(flags_str) + + if t2t_flags_str: + t2t_flags_txt = os.path.join(output_dir, "flags_t2t.txt") + with tf.gfile.Open(t2t_flags_txt, "w") as f: + f.write(t2t_flags_str) + + # Save hparams as hparams.json + hparams_fname = os.path.join(output_dir, "hparams.json") + with tf.gfile.Open(hparams_fname, "w") as f: + f.write(hparams.to_json()) + + def create_hparams(params_id, problems, data_dir, passed_hparams=None): """Returns hyperparameters, including any flag value overrides. From c46684f79620ae695e4c79708e3064ab2aea8b7d Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Wed, 6 Sep 2017 08:55:04 -0700 Subject: [PATCH 13/32] Attention experts uses local info for the FC. Fix long max_length size when batch_size is set through command line. 
Minor cleanup PiperOrigin-RevId: 167726943 --- tensor2tensor/models/attention_lm_moe.py | 63 ++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 191d4aa04..eccf349c9 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -101,29 +101,30 @@ def _diet_expert(x): # should not either way) assert hparams.norm_type != "batch" + tf.logging.info("Applying Padding Remover for the attention experts") + dp_remove_pad = functools.partial( dp, remove_pad, pad_remover=pad_remover, mode=hparams.mode) dp_restore_pad = functools.partial( dp, restore_pad, ref_x=x, pad_remover=pad_remover, mode=hparams.mode) - elif (hparams.attention_type == AttentionType.MULTIHEAD or - hparams.attention_type == AttentionType.MEMORY_EFFICIENT): + else: # Using identity function: No effect - dp_remove_pad = lambda x: (x, None) + dp_remove_pad = lambda x: x dp_restore_pad = lambda x: x - else: - raise ValueError("Only {} supported for now.".format( - AttentionType.get_choices())) - def print_shape(x, suffix): + def print_shape(x, suffix, debug=False): # To help debugging, print the input/output shapes at inference and eval # Inference for long sequences can take a long time, so that's help to # see the progession of the generation - if hparams.mode == ModeKeys.TRAIN: + if not debug and hparams.mode == ModeKeys.TRAIN: return x return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix)) + batch_coordinate = dp(get_batch_coordinate, x) + batch_coordinate = dp_remove_pad(batch_coordinate) + x = dp(print_shape, x, "in") - x, batch_coordinate = dp_remove_pad(x) + x = dp_remove_pad(x) x = dp(print_shape, x, "in_flat") for layer in xrange(hparams.num_hidden_layers): @@ -188,12 +189,31 @@ def print_shape(x, suffix): x, hparams.filter_size) else: + x_in = preprocess(x) + additional_conv_params = dict() + if hparams.use_sepconv: + # Restore padding so sequences don't attend to each others + # restore_pad will apply a reshape like x_ref, to restore the + # original shape. Here this works because the last dimension is + # constant between the output of attention and the original input + # but it shouldn't necessarily be the case. + x_in = dp_restore_pad(x_in) + additional_conv_params = dict( + padding="LEFT", + # Parameters copied from the transformer model + kernel_size=(3, 1), + second_kernel_size=(31, 1), + ) y = dp( common_layers.conv_hidden_relu, - preprocess(x), + x_in, hparams.filter_size, hparams.hidden_size, - dropout=hparams.relu_dropout) + dropout=hparams.relu_dropout, + **additional_conv_params + ) + if hparams.use_sepconv: + y = dp_remove_pad(y) x = postprocess(x, y) x = preprocess(x) @@ -234,6 +254,14 @@ def attention_lm_moe_prepare_decoder(targets, hparams): return (decoder_input, decoder_self_attention_bias, pad_remover) +def get_batch_coordinate(x): + """Return a flat int32 tensor of shape [1, batch_size*length, 1].""" + # Compute the batch coordinate before flattening all batches + batch_coordinate = tf.expand_dims( + common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) + return batch_coordinate + + def remove_pad(x, pad_remover, mode): """Remove padding by concatenating all dimension into one. 
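For orientation before the next hunk: the trick that `remove_pad`/`restore_pad` implement can be sketched independently of the `PadRemover` utility. This is a minimal illustrative sketch assuming TF 1.x, and deliberately not the `expert_utils.PadRemover` API; the tensor names here are made up.

```python
# Illustrative sketch of padding removal (not the expert_utils.PadRemover API):
# flatten [batch, length, depth] to [batch*length, depth], drop the padded
# positions, then restore a leading axis so downstream code sees batch_size=1.
import tensorflow as tf

x = tf.random_normal([2, 4, 8])           # [batch, length, depth]
nonpad = tf.constant([[1., 1., 0., 0.],
                      [1., 1., 1., 0.]])  # 1.0 = real token, 0.0 = padding
flat_x = tf.reshape(x, [-1, 8])           # [batch*length, depth]
keep = tf.reshape(nonpad > 0., [-1])
x_nonpad = tf.boolean_mask(flat_x, keep)  # [length_nonpad, depth]
x_nonpad = tf.expand_dims(x_nonpad, 0)    # [1, length_nonpad, depth]
```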
@@ -247,11 +275,6 @@ def remove_pad(x, pad_remover, mode): tf.Tensor of shape [1,length_nonpad,depth] where length_nonpad <= batch_size*length """ - # Compute the batch coordinate before flattening all batches - batch_coordinate = tf.expand_dims( - common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) - batch_coordinate = expert_utils.flatten_all_but_last(batch_coordinate) - # Concatenate all tokens (without padding) x = expert_utils.flatten_all_but_last(x) @@ -260,12 +283,10 @@ def remove_pad(x, pad_remover, mode): # This is a hack to allows inference when the token # is detected as padding and removed. This works for now because there is # no padding at inference. - batch_coordinate = pad_remover.remove(batch_coordinate) x = pad_remover.remove(x) - batch_coordinate = tf.expand_dims(batch_coordinate, axis=0) x = tf.expand_dims(x, axis=0) # Now batch_size=1 - return x, batch_coordinate + return x def restore_pad(x, ref_x, pad_remover, mode): @@ -328,6 +349,7 @@ def attention_lm_moe_base(): hparams.add_hparam("attention_v_size", 256) # Loss coef for load balancing hparams.add_hparam("attention_load_balance", 2e-2) + hparams.add_hparam("use_sepconv", int(False)) hparams.add_hparam("diet_experts", int(False)) hparams.add_hparam("memory_efficient_ffn", int(False)) return hparams @@ -338,7 +360,8 @@ def attention_lm_moe_base_ae(): """Base model with attention expert.""" hparams = attention_lm_moe_base() hparams.attention_type = AttentionType.LOCAL_EXPERTS - hparams.max_length = hparams.batch_size + hparams.use_sepconv = int(True) + hparams.max_length = 0 # max_length == batch_size hparams.eval_drop_long_sequences = int(True) hparams.min_length_bucket = 256 # Avoid cyclic problems for big batches hparams.learning_rate = 0.05 From 5767beceb71c56222f73cb41e70641c380636cb9 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 10:11:01 -0700 Subject: [PATCH 14/32] ClassLabelEncoder to map class ids to names PiperOrigin-RevId: 167736101 --- tensor2tensor/data_generators/image.py | 23 +++++++++++++++ tensor2tensor/data_generators/imdb.py | 2 +- tensor2tensor/data_generators/problem.py | 17 +++++++---- tensor2tensor/data_generators/text_encoder.py | 29 +++++++++++++++++++ 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index 03cea1d02..8d142d239 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -264,6 +264,17 @@ def train_shards(self): def dev_shards(self): return 1 + @property + def class_labels(self): + return ["ID_%d" % i for i in range(self.num_classes)] + + def feature_encoders(self, data_dir): + del data_dir + return { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.ClassLabelEncoder(self.class_labels) + } + def generator(self, data_dir, tmp_dir, is_training): raise NotImplementedError() @@ -491,6 +502,10 @@ def is_small(self): def num_classes(self): return 10 + @property + def class_labels(self): + return [str(c) for c in range(self.num_classes)] + @property def train_shards(self): return 10 @@ -564,6 +579,14 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): @registry.register_problem class ImageCifar10Tune(ImageMnistTune): + """Cifar-10 Tune.""" + + @property + def class_labels(self): + return [ + "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", + "ship", "truck" + ] def preprocess_examples(self, examples, mode, unused_hparams): if mode == 
tf.contrib.learn.ModeKeys.TRAIN: diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 4216747c4..d7eadcd1d 100644 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -112,7 +112,7 @@ def feature_encoders(self, data_dir): encoder = text_encoder.SubwordTextEncoder(vocab_filename) return { "inputs": encoder, - "targets": text_encoder.TextEncoder(), + "targets": text_encoder.ClassLabelEncoder(["neg", "pos"]), } def example_reading_spec(self): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 302c51fa7..4aa4862ef 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -300,7 +300,8 @@ def dataset(self, num_threads=None, output_buffer_size=None, shuffle_files=None, - hparams=None): + hparams=None, + preprocess=True): """Build a Dataset for this problem. Args: @@ -315,6 +316,8 @@ def dataset(self, hparams: tf.contrib.training.HParams; hparams to be passed to Problem.preprocess_examples and Problem.hparams. If None, will use a default set that is a no-op. + preprocess: bool, whether to map the Dataset through + Problem.preprocess_examples. Returns: Dataset containing dict. @@ -366,17 +369,19 @@ def decode_record(record): decoded = decoder.decode(record, items=decode_items) return dict(zip(decode_items, decoded)) - def preprocess(example): + def _preprocess(example): example = self.preprocess_examples(example, mode, hparams) self.maybe_reverse_features(example) self.maybe_copy_features(example) return example dataset = dataset.map(decode_record, num_threads=num_threads) - dataset = dataset.map( - preprocess, - num_threads=num_threads, - output_buffer_size=output_buffer_size) + + if preprocess: + dataset = dataset.map( + _preprocess, + num_threads=num_threads, + output_buffer_size=output_buffer_size) return dataset diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index ac9260cfa..97ab88402 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -154,6 +154,35 @@ def vocab_size(self): return 2**8 + self._num_reserved_ids +class ClassLabelEncoder(TextEncoder): + """Encoder for class labels.""" + + def __init__(self, class_labels=None, class_labels_fname=None): + super(ClassLabelEncoder, self).__init__(num_reserved_ids=0) + + assert class_labels or class_labels_fname + assert not (class_labels and class_labels_fname) + + if class_labels_fname: + with tf.gfile.Open(class_labels_fname) as f: + class_labels = [label.strip() for label in f.readlines()] + + self._class_labels = class_labels + + def encode(self, label_str): + return self._class_labels.index(label_str) + + def decode(self, label_id): + if isinstance(label_id, list): + assert len(label_id) == 1 + label_id, = label_id + return self._class_labels[label_id] + + @property + def vocab_size(self): + return len(self._class_labels) + + class TokenTextEncoder(TextEncoder): """Encoder based on a user-supplied vocabulary (file or list).""" From 78d8ddb349870400c89cd08c1c3e70bcc92f1f5f Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 6 Sep 2017 14:56:10 -0700 Subject: [PATCH 15/32] Add attention 2D functions over local_attention_2d PiperOrigin-RevId: 167777554 --- tensor2tensor/layers/common_attention.py | 119 +++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tensor2tensor/layers/common_attention.py 
b/tensor2tensor/layers/common_attention.py index 7ed7799d0..1053a69af 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -328,6 +328,19 @@ def split_heads(x, num_heads): return tf.transpose(split_last_dimension(x, num_heads), [0, 2, 1, 3]) +def split_heads_2d(x, num_heads): + """Split channels (dimension 4) into multiple heads (becomes dimension 1). + + Args: + x: a Tensor with shape [batch, height, width, channels] + num_heads: an integer + + Returns: + a Tensor with shape [batch, num_heads, height, width, channels / num_heads] + """ + return tf.transpose(split_last_dimension(x, num_heads), [0, 3, 1, 2, 4]) + + def combine_heads(x): """Inverse of split_heads. @@ -340,6 +353,18 @@ def combine_heads(x): return combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3])) +def combine_heads_2d(x): + """Inverse of split_heads_2d function. + + Args: + x: a Tensor with shape [batch, num_heads, height, width, channels/num_heads] + + Returns: + a Tensor with shape [batch, height, width, channels] + """ + return combine_last_two_dimensions(tf.transpose(x, [0, 2, 3, 1, 4])) + + def attention_image_summary(attn, image_shapes=None): """Compute color image summary. @@ -768,6 +793,43 @@ def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, return q, k, v +def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth): + """Computes query, key and value of a 4D tensor. + + Args: + query_antecedent: a Tensor with shape [batch, h, w, depth_k] + memory_antecedent: a Tensor with shape [batch, h, w, depth_k] + total_key_depth: an integer + total_value_depth: and integer + + Returns: + q, k, v : [batch, h, w, depth_k] tensors + """ + # self attention with single position q, k, and v. + if memory_antecedent is None: + combined = tf.layers.conv2d( + query_antecedent, + total_key_depth * 2 + total_value_depth, (1, 1), + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=-1) + return q, k, v + + # Encoder decoder attention. + q = common_layers.conv1d( + query_antecedent, total_key_depth, 1, name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + + return q, k, v + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -849,6 +911,63 @@ def multihead_attention(query_antecedent, return x +def multihead_attention_2d(query_antecedent, + memory_antecedent, + total_key_depth, + total_value_depth, + output_depth, + num_heads, + attention_type="local_attention_2d", + block_length=128, + block_width=128, + name=None): + """2d Multihead scaled-dot-product attention with inp/output transformations. + + Args: + query_antecedent: a Tensor with shape [batch, h, w, depth_k] + memory_antecedent: a Tensor with shape [batch, h, w, depth_k] + total_key_depth: an integer + total_value_depth: an integer + output_depth: an integer + num_heads: an integer dividing total_key_depth and total_value_depth + attention_type: String, type of attention function to use. + block_length: an integer - relevant for "local_attention_2d" + block_width: an integer - relevant for "local_attention_2d" + name: an optional string + + Returns: + A Tensor of shape [batch, h, w, depth_k] + + Raises: + ValueError: if the key depth or value depth are not divisible by the + number of attention heads. 
+ """ + if total_key_depth % num_heads != 0: + raise ValueError("Key depth (%d) must be divisible by the number of " + "attention heads (%d)." % (total_key_depth, num_heads)) + if total_value_depth % num_heads != 0: + raise ValueError("Value depth (%d) must be divisible by the number of " + "attention heads (%d)." % (total_value_depth, num_heads)) + with tf.variable_scope( + name, + default_name="multihead_attention", + values=[query_antecedent, memory_antecedent]): + q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, + total_key_depth, total_value_depth) + + q = split_heads_2d(q, num_heads) + k = split_heads_2d(k, num_heads) + v = split_heads_2d(v, num_heads) + key_depth_per_head = total_key_depth // num_heads + q *= key_depth_per_head**-0.5 + if attention_type == "local_attention_2d": + x = local_attention_2d( + q, k, v, block_length=block_length, filter_flange=block_width) + x = tf.squeeze(combine_heads_2d(x), axis=-2) + x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + return x + + def ffn_self_attention_layer(x, filter_depth, output_depth, From 4794c20af3e0d104e38985a37cfa7244185cd13e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 16:13:02 -0700 Subject: [PATCH 16/32] GPU mem fraction default 0.95 to rm allocation error msg PiperOrigin-RevId: 167788682 --- tensor2tensor/utils/trainer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index ee3445e26..3248d9ca9 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -93,7 +93,7 @@ flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.") flags.DEFINE_integer("worker_replicas", 1, "How many workers to use.") flags.DEFINE_integer("worker_id", 0, "Which worker task are we.") -flags.DEFINE_float("worker_gpu_memory_fraction", 1., +flags.DEFINE_float("worker_gpu_memory_fraction", 0.95, "Fraction of GPU memory to allocate.") flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.") flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus." 
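As a shape sanity check for the 2d attention helpers introduced above, `split_heads_2d` and `combine_heads_2d` round-trip a 4-D tensor through a 5-D per-head layout. A minimal sketch, assuming TF 1.x and that the new functions are importable from `common_attention`:

```python
# Shape round-trip for the 2d head-splitting helpers -- a sketch.
import tensorflow as tf
from tensor2tensor.layers import common_attention

x = tf.zeros([5, 32, 32, 16])              # [batch, height, width, channels]
h = common_attention.split_heads_2d(x, 4)  # [5, 4, 32, 32, 4]: channels split across 4 heads
y = common_attention.combine_heads_2d(h)   # [5, 32, 32, 16]: same shape as x
```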
From 74044ea9768fd41e90166305d041d67457955bfd Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 6 Sep 2017 16:32:29 -0700 Subject: [PATCH 17/32] Share one PadRemover across all Transformer encoder layers PiperOrigin-RevId: 167791186 --- tensor2tensor/models/transformer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 86b920dc5..38766ec19 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -191,6 +191,8 @@ def transformer_encoder(encoder_input, """ x = encoder_input with tf.variable_scope(name): + pad_remover = expert_utils.PadRemover( + common_attention.attention_bias_to_padding(encoder_self_attention_bias)) for layer in xrange( hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): @@ -203,9 +205,6 @@ def transformer_encoder(encoder_input, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): - pad_remover = expert_utils.PadRemover( - common_attention.attention_bias_to_padding( - encoder_self_attention_bias)) y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) From 665dbe8b92f827d68a7671fa15cbb6f0231de1ad Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Wed, 6 Sep 2017 16:43:15 -0700 Subject: [PATCH 18/32] 2d masked local attention. Each memory block can attend to a memory region top-left, top, and top-right. The mask ensures that we don't peek into the future. Refactored some functions out of local_attention_2d so that they could be shared. PiperOrigin-RevId: 167792489 --- tensor2tensor/layers/common_attention.py | 263 +++++++++++++----- tensor2tensor/layers/common_attention_test.py | 8 +- 2 files changed, 196 insertions(+), 75 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 1053a69af..84f8d2d9a 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -22,6 +22,7 @@ import math # Dependency imports +import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin @@ -354,10 +355,11 @@ def combine_heads(x): def combine_heads_2d(x): - """Inverse of split_heads_2d function. + """Inverse of split_heads_2d. Args: - x: a Tensor with shape [batch, num_heads, height, width, channels/num_heads] + x: a Tensor with shape + [batch, num_heads, height, width, channels / num_heads] Returns: a Tensor with shape [batch, height, width, channels] @@ -627,8 +629,8 @@ def pad_l_and_r(x, pad_length): def local_attention_2d(q, k, v, - block_length=128, - filter_flange=100, + query_shape=(8, 16), + memory_flange=(8, 16), name=None): """strided block local self-attention. @@ -636,8 +638,9 @@ def local_attention_2d(q, q: a Tensor with shape [batch, heads, h, w, depth_k] k: a Tensor with shape [batch, heads, h, w, depth_k] v: a Tensor with shape [batch, heads, h, w, depth_v] - block_length: an integer indicating the side length of each square block. - filter_flange: an integer indicating how much to look around each block. + query_shape: an tuple indicating the height and width of each query block. + memory_flange: an integer indicating how much to look in height and width + from each query block. 
name: an optional string Returns: @@ -651,68 +654,26 @@ def local_attention_2d(q, num_heads = tf.shape(q)[1] original_length = tf.shape(q)[2] * tf.shape(q)[3] - def reshape_range(tensor, i, j, shape): - """Reshapes a tensor between dimensions i and j.""" - target_shape = tf.concat( - [tf.shape(tensor)[:i], shape, tf.shape(tensor)[j:]], - axis=0) - return tf.reshape(tensor, target_shape) - - def pad_to_multiple(x, d): - """Making sure x is a multiple of d.""" - height_padding = -tf.shape(x)[1] % d - width_padding = -tf.shape(x)[2] % d - paddings = [[0, 0], [0, 0], [0, height_padding], - [0, width_padding], [0, 0]] - return tf.pad(x, paddings) - - def gather_indices(x, block_length, stride): - """Getting gather indices.""" - # making an identity matrix kernel - kernel = tf.eye(block_length ** 2) - kernel = reshape_range(kernel, 0, 1, [block_length, block_length, 1]) - # making indices [1, h, w, 1] to appy convs - indices = tf.range(0, tf.shape(x)[2] * tf.shape(x)[3], delta=1) - indices = tf.reshape(indices, [1, tf.shape(x)[2], tf.shape(x)[3], 1]) - indices = tf.nn.conv2d( - tf.cast(indices, tf.float32), - kernel, - strides=[1, stride, stride, 1], - padding="VALID") - # making indices [num_blocks, dim] to gather - num_blocks = tf.reduce_prod(tf.shape(indices)[:2]) - indices = tf.reshape(indices, [num_blocks, -1]) - return tf.cast(indices, tf.int32) - - def gather_blocks(x, indices): - """Gathers flattened blocks from x.""" - x_shape = tf.shape(x) - x = reshape_range(x, 2, 4, [tf.reduce_prod(x_shape[2:4])]) - # [length, batch, heads, dim] - x_t = tf.transpose(x, [2, 0, 1, 3]) - x_new = tf.gather(x_t, indices) - # returns [batch, heads, num_blocks, block_length ** 2, dim] - return tf.transpose(x_new, [2, 3, 0, 1, 4]) - - q = pad_to_multiple(q, block_length) - k = pad_to_multiple(k, block_length) - v = pad_to_multiple(v, block_length) + q = pad_to_multiple_2d(q, query_shape) + k = pad_to_multiple_2d(k, query_shape) + v = pad_to_multiple_2d(v, query_shape) # Setting up k and v values - paddings = [[0, 0], [0, 0], [filter_flange, filter_flange], - [filter_flange, filter_flange], [0, 0]] + paddings = [[0, 0], [0, 0], [memory_flange[0], memory_flange[1]], + [memory_flange[0], memory_flange[1]], [0, 0]] k = tf.pad(k, paddings) v = tf.pad(v, paddings) # Setting up q blocks - q_indices = gather_indices(q, block_length, block_length) - q_new = gather_blocks(q, q_indices) + q_indices = gather_indices_2d(q, query_shape, query_shape) + q_new = gather_blocks_2d(q, q_indices) # Setting up k and v blocks - full_filter_width = block_length + 2 * filter_flange - k_and_v_indices = gather_indices(k, full_filter_width, block_length) - k_new = gather_blocks(k, k_and_v_indices) - v_new = gather_blocks(v, k_and_v_indices) + memory_shape = (query_shape[0]+2*memory_flange[0], + query_shape[1]+2*memory_flange[1]) + k_and_v_indices = gather_indices_2d(k, memory_shape, query_shape) + k_new = gather_blocks_2d(k, k_and_v_indices) + v_new = gather_blocks_2d(v, k_and_v_indices) attention_bias = tf.expand_dims( tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) @@ -729,6 +690,159 @@ def gather_blocks(x, indices): return tf.reshape(output, v_shape) +def pad_to_multiple_2d(x, block_shape): + """Making sure x is a multiple of shape.""" + old_shape = x.get_shape().dims + last = old_shape[-1] + height_padding = -tf.shape(x)[1] % block_shape[0] + width_padding = -tf.shape(x)[2] % block_shape[1] + paddings = [[0, 0], [0, 0], [0, height_padding], + [0, width_padding], [0, 0]] + padded_x = tf.pad(x, paddings) + padded_shape = 
padded_x.get_shape().as_list() + padded_shape = padded_shape[:-1]+[last] + padded_x.set_shape(padded_shape) + return padded_x + + +def reshape_range(tensor, i, j, shape): + """Reshapes a tensor between dimensions i and j.""" + target_shape = tf.concat( + [tf.shape(tensor)[:i], shape, tf.shape(tensor)[j:]], + axis=0) + return tf.reshape(tensor, target_shape) + + +def gather_blocks_2d(x, indices): + """Gathers flattened blocks from x.""" + x_shape = tf.shape(x) + x = reshape_range(x, 2, 4, [tf.reduce_prod(x_shape[2:4])]) + # [length, batch, heads, dim] + x_t = tf.transpose(x, [2, 0, 1, 3]) + x_new = tf.gather(x_t, indices) + # returns [batch, heads, num_blocks, block_length ** 2, dim] + return tf.transpose(x_new, [2, 3, 0, 1, 4]) + + +def gather_indices_2d(x, block_shape, block_stride): + """Getting gather indices.""" + # making an identity matrix kernel + kernel = tf.eye(block_shape[0]*block_shape[1]) + kernel = reshape_range(kernel, 0, 1, [block_shape[0], block_shape[1], 1]) + # making indices [1, h, w, 1] to appy convs + indices = tf.range(0, tf.shape(x)[2] * tf.shape(x)[3], delta=1) + indices = tf.reshape(indices, [1, tf.shape(x)[2], tf.shape(x)[3], 1]) + indices = tf.nn.conv2d( + tf.cast(indices, tf.float32), + kernel, + strides=[1, block_stride[0], block_stride[1], 1], + padding="VALID") + # making indices [num_blocks, dim] to gather + num_blocks = tf.reduce_prod(tf.shape(indices)[:3]) + indices = tf.reshape(indices, [num_blocks, -1]) + return tf.cast(indices, tf.int32) + + +def masked_local_attention_2d(q, + k, + v, + query_shape=(8, 16), + memory_flange=(8, 16), + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, h, w, depth_k] + k: a Tensor with shape [batch, heads, h, w, depth_k] + v: a Tensor with shape [batch, heads, h, w, depth_v] + query_shape: an tuple indicating the height and width of each query block. + query_shape = block_shape + memory_flange: an integer indicating how much to look in height and width + from each query block. + memory shape = query_shape + (block_flange[0], 2*block_flange[1]) + name: an optional string + + Returns: + a Tensor of shape [batch, heads, h, w, depth_v] + """ + with tf.variable_scope( + name, default_name="local_masked_self_attention_2d", values=[q, k, v]): + v_shape = tf.shape(v) + depth_v = tf.shape(v)[4] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] * tf.shape(q)[3] + def make_mask(query_shape, memory_flange): + """creates a mask. + + The query mask can look to the left, top left, top, and top right, but + not the right. Inside the query, we have the standard raster scan + masking. + Args: + query_shape: A tuple of ints (query_height, query_width) + memory_flange: A tuple of ints + (memory_flange_height, memory_flange_width) + + Returns: + A tensor of shape query_size, memory_size + """ + + query_triangle = tf.matrix_band_part( + tf.ones([np.prod(query_shape), np.prod(query_shape)]), -1, 0) + split_query_masks = tf.split(query_triangle, query_shape[0], axis=1) + mask_pieces = [ + tf.concat( + [tf.ones([np.prod(query_shape), memory_flange[1]]), + split_query_masks[i], + tf.zeros([np.prod(query_shape), memory_flange[1]]) + ], axis=1) for i in range(query_shape[0])] + + final_mask = tf.concat( + [tf.ones( + [np.prod(query_shape), + (query_shape[1]+2*memory_flange[1])*memory_flange[0]]), + tf.concat(mask_pieces, axis=1) + ], axis=1) + # 0. is visible location, 1.0 is masked. + return 1. 
- final_mask + q = pad_to_multiple_2d(q, query_shape) + k = pad_to_multiple_2d(k, query_shape) + v = pad_to_multiple_2d(v, query_shape) + # Setting up k and v values. Padding top, left, and right + paddings = [[0, 0], [0, 0], [memory_flange[0], 0], + [memory_flange[1], memory_flange[1]], [0, 0]] + k = tf.pad(k, paddings) + v = tf.pad(v, paddings) + # Setting up q blocks + q_indices = gather_indices_2d(q, query_shape, query_shape) + q_new = gather_blocks_2d(q, q_indices) + # Setting up k and v blocks + memory_shape = (query_shape[0]+memory_flange[0], + query_shape[1]+memory_flange[1]*2) + k_and_v_indices = gather_indices_2d(k, memory_shape, query_shape) + k_new = gather_blocks_2d(k, k_and_v_indices) + v_new = gather_blocks_2d(v, k_and_v_indices) + logits = tf.matmul(q_new, k_new, transpose_b=True) + # Combining the mask for padding and visible region + attention_mask_shape = [np.prod(query_shape), + (query_shape[0]+memory_flange[0])* + (query_shape[1]+2*memory_flange[1])] + attention_mask = tf.cast(make_mask(query_shape, memory_flange), tf.bool) + # reshaping attention mask to have same dims as logits + attention_mask = tf.reshape(attention_mask, [1, 1, 1]+attention_mask_shape) + padding_mask = tf.expand_dims( + tf.cast(embedding_to_padding(k_new), tf.bool), axis=-2) + attention_bias = ( + tf.to_float(tf.logical_or(attention_mask, padding_mask)) *-1e9) + attention = tf.nn.softmax(logits + attention_bias) + output = tf.matmul(attention, v_new) + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + # [batch, heads, h, w, depth_v] + return tf.reshape(output, v_shape) + + def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, total_value_depth, q_filter_width=1, kv_filter_width=1, q_padding="VALID", kv_padding="VALID"): @@ -795,7 +909,7 @@ def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth): - """Computes query, key and value of a 4D tensor. + """Computes query, key and value. Args: query_antecedent: a Tensor with shape [batch, h, w, depth_k] @@ -806,7 +920,7 @@ def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, Returns: q, k, v : [batch, h, w, depth_k] tensors """ - # self attention with single position q, k, and v. + # self attention with single position q, k, and v if memory_antecedent is None: combined = tf.layers.conv2d( query_antecedent, @@ -817,7 +931,7 @@ def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, axis=-1) return q, k, v - # Encoder decoder attention. + # Encoder decoder attention q = common_layers.conv1d( query_antecedent, total_key_depth, 1, name="q_transform") combined = common_layers.conv1d( @@ -918,8 +1032,8 @@ def multihead_attention_2d(query_antecedent, output_depth, num_heads, attention_type="local_attention_2d", - block_length=128, - block_width=128, + query_shape=(8, 16), + memory_flange=(8, 16), name=None): """2d Multihead scaled-dot-product attention with inp/output transformations. @@ -931,8 +1045,8 @@ def multihead_attention_2d(query_antecedent, output_depth: an integer num_heads: an integer dividing total_key_depth and total_value_depth attention_type: String, type of attention function to use. 
- block_length: an integer - relevant for "local_attention_2d" - block_width: an integer - relevant for "local_attention_2d" + query_shape: an tuple indicating the height and width of each query block. + memory_flange: an integer indicating how much to look in height and width name: an optional string Returns: @@ -954,7 +1068,7 @@ def multihead_attention_2d(query_antecedent, values=[query_antecedent, memory_antecedent]): q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth) - + # after splitting, shape is [batch, heads, h, w, depth] q = split_heads_2d(q, num_heads) k = split_heads_2d(k, num_heads) v = split_heads_2d(v, num_heads) @@ -962,9 +1076,16 @@ def multihead_attention_2d(query_antecedent, q *= key_depth_per_head**-0.5 if attention_type == "local_attention_2d": x = local_attention_2d( - q, k, v, block_length=block_length, filter_flange=block_width) - x = tf.squeeze(combine_heads_2d(x), axis=-2) - x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + q, k, v, query_shape=query_shape, memory_flange=memory_flange) + else: + x = masked_local_attention_2d(q, k, v, query_shape=query_shape, + memory_flange=memory_flange) + x = combine_heads_2d(x) + x = tf.layers.conv2d( + x, + output_depth, + (1, 1), + name="output_transform") return x diff --git a/tensor2tensor/layers/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py index 6664bcc2d..d8f6f2b39 100644 --- a/tensor2tensor/layers/common_attention_test.py +++ b/tensor2tensor/layers/common_attention_test.py @@ -98,8 +98,8 @@ def testLocalUnmaskedAttention2D(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=4, - filter_flange=3) + query_shape=(4, 4), + memory_flange=(3, 3)) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 25, 16)) @@ -112,8 +112,8 @@ def testLocalUnmaskedAttention2DMatchingBlockLength(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=5, - filter_flange=3) + query_shape=(5, 5), + memory_flange=(3, 3)) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 25, 16)) From 2ebead2f451d30107c43f6f061998496978f5279 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 18:14:18 -0700 Subject: [PATCH 19/32] Move to core Estimator and improve decoding PiperOrigin-RevId: 167802133 --- tensor2tensor/data_generators/image.py | 4 +- tensor2tensor/layers/modalities.py | 2 +- tensor2tensor/layers/modalities_test.py | 4 +- tensor2tensor/models/attention_lm_moe.py | 6 +- tensor2tensor/models/bluenet_test.py | 2 +- tensor2tensor/models/bytenet_test.py | 2 +- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/lstm.py | 4 +- tensor2tensor/models/lstm_test.py | 4 +- tensor2tensor/models/multimodel.py | 4 +- tensor2tensor/models/multimodel_test.py | 2 +- tensor2tensor/models/neural_gpu_test.py | 2 +- tensor2tensor/models/shake_shake.py | 2 +- tensor2tensor/models/slicenet_test.py | 2 +- tensor2tensor/models/transformer_moe.py | 4 +- tensor2tensor/models/transformer_revnet.py | 4 +- .../models/transformer_revnet_test.py | 2 +- tensor2tensor/models/transformer_test.py | 2 +- tensor2tensor/models/transformer_vae.py | 4 +- tensor2tensor/models/xception_test.py | 2 +- tensor2tensor/utils/data_reader.py | 10 +- tensor2tensor/utils/data_reader_test.py | 6 +- 
tensor2tensor/utils/decoding.py | 326 ++++++++++++------ tensor2tensor/utils/input_fn_builder.py | 19 +- tensor2tensor/utils/metrics.py | 13 +- tensor2tensor/utils/model_builder.py | 286 ++++++--------- tensor2tensor/utils/t2t_model.py | 4 +- tensor2tensor/utils/trainer_utils.py | 40 +-- tensor2tensor/utils/trainer_utils_test.py | 9 +- .../TransformerVisualization.ipynb | 16 +- 30 files changed, 427 insertions(+), 362 deletions(-) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index 8d142d239..06942ed3f 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -313,7 +313,7 @@ def resize(img): return tf.to_int64(tf.image.resize_images(img, [299, 299])) inputs = tf.cast(examples["inputs"], tf.int64) - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: examples["inputs"] = tf.cond( # Preprocess 90% of the time. tf.less(tf.random_uniform([]), 0.9), lambda img=inputs: preprocess(img), @@ -589,7 +589,7 @@ def class_labels(self): ] def preprocess_examples(self, examples, mode, unused_hparams): - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) return examples diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index c93a05433..1d606ec1d 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -113,7 +113,7 @@ def top(self, body_output, _): with tf.variable_scope(scope_name, reuse=reuse): var = self._get_weights() if (self._model_hparams.factored_logits and - self._model_hparams.mode == tf.contrib.learn.ModeKeys.TRAIN): + self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN): # insert channels dimension body_output = tf.expand_dims(body_output, 3) logits = common_layers.FactoredTensor(body_output, var) diff --git a/tensor2tensor/layers/modalities_test.py b/tensor2tensor/layers/modalities_test.py index 5813422ab..93dda6d09 100644 --- a/tensor2tensor/layers/modalities_test.py +++ b/tensor2tensor/layers/modalities_test.py @@ -67,7 +67,7 @@ def testSymbolModalityTargets(self): label_smoothing=0.2, shared_embedding_and_softmax_weights=0, factored_logits=0, - mode=tf.contrib.learn.ModeKeys.TRAIN) + mode=tf.estimator.ModeKeys.TRAIN) body_output = -1 + np.random.random_integers( 100, size=(batch_size, length, height, hidden_size)) targets = -1 + np.random.random_integers( @@ -101,7 +101,7 @@ def testSymbolModalityTargetsFactored(self): label_smoothing=0.2, shared_embedding_and_softmax_weights=0, factored_logits=1, - mode=tf.contrib.learn.ModeKeys.TRAIN) + mode=tf.estimator.ModeKeys.TRAIN) body_output = -1 + np.random.random_integers( 100, size=(batch_size, length, height, hidden_size)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index eccf349c9..596d5b01d 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -42,7 +42,7 @@ import tensorflow as tf -ModeKeys = tf.contrib.learn.ModeKeys # pylint: disable=invalid-name +ModeKeys = tf.estimator.ModeKeys # pylint: disable=invalid-name class AttentionType(object): @@ -279,7 +279,7 @@ def remove_pad(x, pad_remover, mode): x = expert_utils.flatten_all_but_last(x) # Remove padding for training and eval - if mode != ModeKeys.INFER: + if mode != ModeKeys.PREDICT: # This is a hack to allows inference when the token # is 
detected as padding and removed. This works for now because there is # no padding at inference. @@ -291,7 +291,7 @@ def remove_pad(x, pad_remover, mode): def restore_pad(x, ref_x, pad_remover, mode): x = tf.squeeze(x, axis=0) - if mode != ModeKeys.INFER: + if mode != ModeKeys.PREDICT: x = pad_remover.restore(x) x = expert_utils.reshape_like(x, ref_x) return x diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index 70b8defe9..d559fd953 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -45,7 +45,7 @@ def testBlueNet(self): "targets": tf.constant(y, dtype=tf.int32), } model = bluenet.BlueNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 536d348e7..56f421153 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -44,7 +44,7 @@ def testByteNet(self): "targets": tf.constant(y, dtype=tf.int32), } model = bytenet.ByteNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index e46e81859..ea02572d0 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -55,7 +55,7 @@ def _testModel(self, hparams, model_cls): "targets": tf.constant(targets, dtype=tf.float32), } p_hparams, = hparams.problems - sharded_logits, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + sharded_logits, _ = model_cls(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams).model_fn(features) logits = tf.concat(sharded_logits, 0) diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index d79b04494..9f909433e 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -251,7 +251,7 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): class LSTMSeq2seq(t2t_model.T2TModel): def model_fn_body(self, features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN return lstm_seq2seq_internal(features["inputs"], features["targets"], self._hparams, train) @@ -260,7 +260,7 @@ def model_fn_body(self, features): class LSTMSeq2seqAttention(t2t_model.T2TModel): def model_fn_body(self, features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN return lstm_seq2seq_internal_attention( features["inputs"], features["targets"], self._hparams, train) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 7da3d2380..c1190d016 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -44,7 +44,7 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2seq(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = lstm.LSTMSeq2seq(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) @@ 
-69,7 +69,7 @@ def testLSTMSeq2SeqAttention(self): "targets": tf.constant(y, dtype=tf.int32), } model = lstm.LSTMSeq2seqAttention( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index c8d515c8d..5df8fcd3c 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -74,7 +74,7 @@ def residual_fn3(x, y, z, hparams): def conv_experts(xs, hparams, dp, ps, padding, mask, layer_id): """Convolutions + Mixture-of-Experts layer.""" del layer_id # Unused. - train = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + train = hparams.mode == tf.estimator.ModeKeys.TRAIN, conv_out = dp(conv_res_step, xs, hparams, padding, mask) loss = 0.0 moe_hidden_sizes = [hparams.filter_size] @@ -109,7 +109,7 @@ def prepare_decoder(targets, target_space_emb): class MultiModel(t2t_model.T2TModel): def model_fn_body_sharded(self, sharded_features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN dp = self._data_parallelism hparams = self._hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index ab60bae97..3aff41029 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -47,7 +47,7 @@ def testMultiModel(self): "target_space_id": tf.constant(1, dtype=tf.int32), } model = multimodel.MultiModel( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index b7a1e98f7..164623699 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -50,7 +50,7 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index a7b379e11..a4dd2385a 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -64,7 +64,7 @@ def shake_shake_block(x, conv_filters, stride, hparams): skip = downsampling_residual_branch(x, conv_filters) # TODO(rshin): Use different alpha for each image in batch. 
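The recurring change across these hunks is mechanical: every reference to `tf.contrib.learn.ModeKeys` becomes `tf.estimator.ModeKeys`, and the old `INFER` key becomes `PREDICT`. A minimal sketch of the correspondence, assuming a TF 1.x release that ships `tf.estimator` (the underlying string values line up, which is what keeps the rename safe):

```python
import tensorflow as tf

# tf.contrib.learn.ModeKeys   ->  tf.estimator.ModeKeys
#   TRAIN ("train")           ->    TRAIN   ("train")
#   EVAL  ("eval")            ->    EVAL    ("eval")
#   INFER ("infer")           ->    PREDICT ("infer")

def mode_flags(mode):
  """Returns (is_training, is_predicting) for an estimator mode key."""
  return (mode == tf.estimator.ModeKeys.TRAIN,
          mode == tf.estimator.ModeKeys.PREDICT)
```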
- if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN: + if hparams.mode == tf.estimator.ModeKeys.TRAIN: if hparams.shakeshake_type == "batch": shaken = common_layers.shakeshake2(branch1, branch2) elif hparams.shakeshake_type == "image": diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index c3a064a85..faf028737 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -47,7 +47,7 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = slicenet.SliceNet(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index 669b1842b..c8a32a667 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -91,7 +91,7 @@ def postprocess(x, y): dp, self._ps_devices, preprocess(x), - hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.mode == tf.estimator.ModeKeys.TRAIN, input_size=hparams.hidden_size, expert_fn=expert_fn, num_experts=hparams.moe_num_experts, @@ -140,7 +140,7 @@ def postprocess(x, y): dp, self._ps_devices, preprocess(x), - hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.mode == tf.estimator.ModeKeys.TRAIN, input_size=hparams.hidden_size, expert_fn=expert_fn, num_experts=hparams.moe_num_experts, diff --git a/tensor2tensor/models/transformer_revnet.py b/tensor2tensor/models/transformer_revnet.py index 942a00660..7275c370a 100644 --- a/tensor2tensor/models/transformer_revnet.py +++ b/tensor2tensor/models/transformer_revnet.py @@ -131,7 +131,7 @@ def g(x): g, num_layers=hparams.num_hidden_layers, f_side_input=[encoder_self_attention_bias], - is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN) y = tf.concat([y1, y2], axis=-1) return common_layers.layer_preprocess(y, hparams) @@ -212,7 +212,7 @@ def g(x): decoder_self_attention_bias, encoder_decoder_attention_bias, encoder_output ], - is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN) y = tf.concat([y1, y2], axis=-1) return common_layers.layer_preprocess(y, hparams) diff --git a/tensor2tensor/models/transformer_revnet_test.py b/tensor2tensor/models/transformer_revnet_test.py index 66b493b0b..f9bc8cfb2 100644 --- a/tensor2tensor/models/transformer_revnet_test.py +++ b/tensor2tensor/models/transformer_revnet_test.py @@ -59,7 +59,7 @@ def testTransformer(self): "target_space_id": tf.constant(1, dtype=tf.int32), } model = transformer_revnet.TransformerRevnet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) grads = tf.gradients( diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 6c0eee203..9e450a670 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -53,7 +53,7 @@ def getModel(self): } return transformer.Transformer( - hparams, tf.contrib.learn.ModeKeys.INFER, p_hparams), features + hparams, tf.estimator.ModeKeys.PREDICT, p_hparams), features def testTransformer(self): model, features = self.getModel() diff --git 
a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 025f8d631..e3279495a 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -244,7 +244,7 @@ def ae_decompress(z, ae, x, is_2d, hparams, name, reuse=None): # Leak at the beginning to help train. z = mix(z, ae, hparams.startup_steps) prob_z = common_layers.inverse_exp_decay(hparams.startup_steps) * 0.8 - prob_z = prob_z if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN else 1.0 + prob_z = prob_z if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 z = tf.cond(tf.less(tf.random_uniform([]), prob_z), lambda: z, lambda: ae) @@ -305,7 +305,7 @@ def ae_transformer_internal(inputs, targets, target_space, hparams): reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits( labels=hot, logits=c_z) # If not training, use the predicted z instead of the autoregressive one. - if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + if hparams.mode == tf.estimator.ModeKeys.PREDICT: hot = tf.one_hot(tf.argmax(c_z, axis=-1), hparams.v_size) # Decompress, pass for ae loss. diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index 776d1306a..eb4c6db20 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -44,7 +44,7 @@ def testXception(self): "targets": tf.constant(y, dtype=tf.int32), } model = xception.Xception( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 681f3598b..f48665078 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -215,7 +215,7 @@ def default_example_reading_spec(data_file_pattern): def read_examples(problem, data_file_pattern, capacity, - mode=tf.contrib.learn.ModeKeys.TRAIN): + mode=tf.estimator.ModeKeys.TRAIN): """Create Dataset of Example for problem and data_file_pattern.""" if problem is None: data_fields, data_items_to_decoders = default_example_reading_spec( @@ -227,7 +227,7 @@ def read_examples(problem, # Create placeholders for input, rather than reading data from disk. return feature_placeholders(data_fields) - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN dataset = examples_reader( [data_file_pattern], data_fields, @@ -245,7 +245,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, problem: Problem instance for which to build the input pipeline. data_file_pattern: file pattern for input files. capacity: int, data pipeline buffer capacity. - mode: tf.contrib.learn.ModeKeys entry. + mode: tf.estimator.ModeKeys entry. hparams: an HParams object. 
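The `image.py` hunk earlier in this diff and the `transformer_vae.py` hunk above share a pattern: a scalar `tf.random_uniform` is compared against a probability and `tf.cond` picks between two branches, but only when the mode is `TRAIN`. A minimal sketch of the pattern (illustrative, not code from the patch):

```python
import tensorflow as tf

def maybe_apply(fn, x, prob, mode):
  """At TRAIN time, applies `fn` to `x` with probability `prob`;
  in every other mode, passes `x` through unchanged."""
  if mode != tf.estimator.ModeKeys.TRAIN:
    return x
  return tf.cond(
      tf.less(tf.random_uniform([]), prob),
      lambda: fn(x),
      lambda: x)
```

`ae_decompress` above expresses the same idea by forcing `prob_z` to 1.0 outside of training, so its `tf.cond` deterministically takes the `z` branch at eval and inference time.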
batching_scheme: a dictionary containing "boundaries": a list of integers for the boundaries that will be @@ -256,7 +256,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, Returns: dict """ - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN num_threads = 4 if is_training else 1 with tf.name_scope("input_pipeline"): @@ -505,7 +505,7 @@ def get_data_filepatterns(problems, data_dir, mode): except ValueError: problem, _, _ = problem_hparams.parse_problem_name(problem) path = os.path.join(data_dir, problem) - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: datasets.append("%s-train*" % path) else: datasets.append("%s-dev*" % path) diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 991669a99..aed2598c7 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -70,7 +70,7 @@ def preprocess_examples(self, examples, unused_mode, unused_hparams): def generate_test_data(problem, tmp_dir): problem.generate_data(tmp_dir, tmp_dir) filepatterns = data_reader.get_data_filepatterns( - problem.name, tmp_dir, tf.contrib.learn.ModeKeys.TRAIN) + problem.name, tmp_dir, tf.estimator.ModeKeys.TRAIN) assert tf.gfile.Glob(filepatterns[0]) return filepatterns @@ -115,7 +115,7 @@ def testTrainEvalBehavior(self): self.problem, self.filepatterns[0], 16, - mode=tf.contrib.learn.ModeKeys.EVAL) + mode=tf.estimator.ModeKeys.EVAL) eval_examples = eval_dataset.make_one_shot_iterator().get_next() eval_idxs = [] @@ -243,7 +243,7 @@ def example_len(ex): self.problem, self.filepatterns[0], 32, - mode=tf.contrib.learn.ModeKeys.EVAL) + mode=tf.estimator.ModeKeys.EVAL) dataset = data_reader.bucket_by_sequence_length( dataset, example_len, boundaries, batch_sizes, window_size) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 3f00c25a9..ea1a5fa01 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -36,17 +36,20 @@ FLAGS = tf.flags.FLAGS - -def _decode_from_dataset_log_results(inputs, - targets, - outputs, - problem_name, - prediction_idx, - inputs_vocab, - targets_vocab, - save_images=False, - model_dir=None, - identity_output=False): +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + + +def log_decode_results(inputs, + outputs, + problem_name, + prediction_idx, + inputs_vocab, + targets_vocab, + targets=None, + save_images=False, + model_dir=None, + identity_output=False): """Log inference results.""" if "image" in problem_name and save_images: save_path = os.path.join(model_dir, "%s_prediction_%d.jpg" % @@ -56,17 +59,21 @@ def _decode_from_dataset_log_results(inputs, decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + decoded_targets = None if identity_output: decoded_outputs = "".join(map(str, outputs.flatten())) - decoded_targets = "".join(map(str, targets.flatten())) + if targets is not None: + decoded_targets = "".join(map(str, targets.flatten())) else: decoded_outputs = "".join( map(str, targets_vocab.decode(_save_until_eos(outputs.flatten())))) - decoded_targets = "".join( - map(str, targets_vocab.decode(_save_until_eos(targets.flatten())))) + if targets is not None: + decoded_targets = "".join( + map(str, targets_vocab.decode(_save_until_eos(targets.flatten())))) tf.logging.info("Inference results OUTPUT: 
%s" % decoded_outputs) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) + if targets is not None: + tf.logging.info("Inference results TARGET: %s" % decoded_targets) return decoded_outputs, decoded_targets @@ -80,22 +87,22 @@ def decode_from_dataset(estimator, identity_output=False): tf.logging.info("Performing local inference from dataset for %s.", str(problem_names)) - hparams = estimator.hparams + hparams = estimator.params for problem_idx, problem_name in enumerate(problem_names): # Build the inference input function infer_problems_data = data_reader.get_data_filepatterns( - problem_name, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + problem_name, hparams.data_dir, tf.estimator.ModeKeys.PREDICT) infer_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, + mode=tf.estimator.ModeKeys.PREDICT, hparams=hparams, data_file_patterns=infer_problems_data, num_datashards=devices.data_parallelism().n, fixed_problem=problem_idx) # Get the predictions as an iterable - predictions = estimator.predict(input_fn=infer_input_fn, as_iterable=True) + predictions = estimator.predict(infer_input_fn) # Prepare output file writers if decode_to_file passed if decode_to_file: @@ -119,16 +126,30 @@ def decode_from_dataset(estimator, output_beams = np.split(outputs, beam_size, axis=0) for i, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % i) - decoded = _decode_from_dataset_log_results( - inputs, targets, beam, problem_name, num_predictions, - inputs_vocab, targets_vocab, save_images, estimator.model_dir, - identity_output) + decoded = log_decode_results( + inputs, + beam, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=save_images, + model_dir=estimator.model_dir, + identity_output=identity_output, + targets=targets) decoded_outputs.append(decoded) else: - decoded = _decode_from_dataset_log_results( - inputs, targets, outputs, problem_name, num_predictions, - inputs_vocab, targets_vocab, save_images, estimator.model_dir, - identity_output) + decoded = log_decode_results( + inputs, + outputs, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=save_images, + model_dir=estimator.model_dir, + identity_output=identity_output, + targets=targets) decoded_outputs.append(decoded) # Write out predictions if decode_to_file passed @@ -149,43 +170,40 @@ def decode_from_dataset(estimator, def decode_from_file(estimator, filename): """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams + hparams = estimator.params problem_id = FLAGS.decode_problem_id inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] targets_vocab = hparams.problems[problem_id].vocabulary["targets"] + problem_name = FLAGS.problems.split("-")[problem_id] tf.logging.info("Performing decoding from a file.") sorted_inputs, sorted_keys = _get_sorted_inputs(filename) num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - 
_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) + def input_fn(): + input_gen = _decode_batch_input_fn(problem_id, num_decode_batches, + sorted_inputs, inputs_vocab) + gen_fn = make_input_fn_from_generator(input_gen) + example = gen_fn() + return _decode_input_tensor_to_features_dict(example, hparams) - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) + decodes = [] + result_iter = estimator.predict(input_fn) + for result in result_iter: + if FLAGS.decode_return_beams: + beam_decodes = [] + output_beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + decoded_outputs, _ = log_decode_results(result["inputs"], beam, + problem_name, None, + inputs_vocab, targets_vocab) + beam_decodes.append(decoded_outputs) + decodes.append("\t".join(beam_decodes)) + else: + decoded_outputs, _ = log_decode_results(result["inputs"], + result["outputs"], problem_name, + None, inputs_vocab, targets_vocab) + decodes.append(decoded_outputs) # Reversing the decoded inputs and outputs because they were reversed in # _decode_batch_input_fn @@ -210,33 +228,63 @@ def log_fn(inputs, outputs): outfile.write("%s\n" % (decodes[sorted_keys[index]])) -def decode_interactively(estimator): - hparams = estimator.hparams +def make_input_fn_from_generator(gen): + """Use py_func to yield elements from the given generator.""" + first_ex = six.next(gen) + flattened = tf.contrib.framework.nest.flatten(first_ex) + types = [t.dtype for t in flattened] + shapes = [[None] * len(t.shape) for t in flattened] + first_ex_list = [first_ex] + + def py_func(): + if first_ex_list: + example = first_ex_list.pop() + else: + example = six.next(gen) + return tf.contrib.framework.nest.flatten(example) - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: + def input_fn(): + flat_example = tf.py_func(py_func, [], types) + _ = [t.set_shape(shape) for t, shape in zip(flat_example, shapes)] + example = tf.contrib.framework.nest.pack_sequence_as(first_ex, flat_example) + return example + + return input_fn + + +def decode_interactively(estimator): + """Interactive decoding.""" + hparams = estimator.params + + def input_fn(): + gen_fn = make_input_fn_from_generator(_interactive_input_fn(hparams)) + example = gen_fn() + example = _interactive_input_tensor_to_features_dict(example, hparams) + return example + + result_iter = estimator.predict(input_fn) + for result in result_iter: + problem_idx = result["problem_choice"] targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - 
tf.logging.info(beam_string) - else: - if FLAGS.identity_output: - tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + + if FLAGS.decode_return_beams: + beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + scores = None + if "scores" in result: + scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(beams): + tf.logging.info("BEAM %d:" % k) + beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) + if scores is not None: + tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) else: - tf.logging.info( - targets_vocab.decode( - _save_until_eos(result["outputs"].flatten()))) + tf.logging.info(beam_string) + else: + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info( + targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -264,9 +312,10 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, assert len(input_ids) <= batch_length x = input_ids + [0] * (batch_length - len(input_ids)) final_batch_inputs.append(x) + yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) + "inputs": np.array(final_batch_inputs).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32), } @@ -277,7 +326,7 @@ def _interactive_input_fn(hparams): whole graph, then we are stuck encoding all of the input as one fixed-size numpy array. - We yield int64 arrays with shape [const_array_size]. The format is: + We yield int32 arrays with shape [const_array_size]. The format is: [num_samples, decode_length, len(input ids), , ] Args: @@ -288,7 +337,7 @@ def _interactive_input_fn(hparams): Raises: Exception: when `input_type` is invalid. 
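`make_input_fn_from_generator`, introduced a few hunks up, is what lets `tf.estimator.Estimator.predict` consume a plain Python generator: the first element is drawn eagerly to discover the structure, dtypes, and ranks, and a `tf.py_func` then feeds subsequent elements into the graph. A stripped-down sketch of the same idea for a generator of single numpy arrays, assuming TF 1.x:

```python
import tensorflow as tf

def input_fn_from_generator(gen, dtype=tf.int32):
  """Wraps a generator of numpy arrays as an estimator-style input_fn."""
  first = next(gen)
  buffered = [first]  # replay the element consumed for inspection

  def py_func():
    return buffered.pop() if buffered else next(gen)

  def input_fn():
    tensor = tf.py_func(py_func, [], dtype)
    # Rank is known from the first element; dimensions stay dynamic.
    tensor.set_shape([None] * len(first.shape))
    return tensor

  return input_fn
```

The `tf.contrib.framework.nest` calls in the real helper generalize this to arbitrarily nested dicts of features, which is why the patch's version flattens and re-packs each example.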
""" - num_samples = 3 + num_samples = 1 decode_length = 100 input_type = "text" problem_id = 0 @@ -304,12 +353,13 @@ def _interactive_input_fn(hparams): pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image' or 'label')\n" - " pr= (set the problem number)\n" + " it= ('text' or 'image' or 'label', default: " + "text)\n" + " pr= (set the problem number, default: 0)\n" " in= (set the input problem number)\n" " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode length)\n" + " ns= (changes number of samples, default: 1)\n" + " dl= (changes decode length, default: 100)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" @@ -344,23 +394,23 @@ def _interactive_input_fn(hparams): x = [num_samples, decode_length, len(input_ids)] + input_ids assert len(x) < const_array_size x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) + yield { + "inputs": np.array(x).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } elif input_type == "image": input_path = input_string img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) + yield { + "inputs": img.astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } elif input_type == "label": input_ids = [int(input_string)] x = [num_samples, decode_length, len(input_ids)] + input_ids - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) + yield { + "inputs": np.array(x).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } else: raise Exception("Unsupported input type.") @@ -423,3 +473,85 @@ def _save_until_eos(hyp): except ValueError: # No EOS_ID: return the array as-is. return hyp + + +def _interactive_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. + hparams: model hyperparameters + + Returns: + a features dictionary, as expected by the decoder. + """ + inputs = tf.convert_to_tensor(feature_map["inputs"]) + input_is_image = False if len(inputs.get_shape()) < 3 else True + + def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring + if input_is_image: + x = tf.image.resize_images(x, [299, 299]) + x = tf.reshape(x, [1, 299, 299, -1]) + x = tf.to_int32(x) + else: + # Remove the batch dimension. + num_samples = x[0] + length = x[2] + x = tf.slice(x, [3], tf.to_int32([length])) + x = tf.reshape(x, [1, -1, 1, 1]) + # Transform into a batch of size num_samples to get that many random + # decodes. 
+ x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) + + p_hparams = hparams.problems[problem_choice] + return (tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id), x) + + input_space_id, target_space_id, x = input_fn_builder.cond_on_index( + input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) + + features = {} + features["problem_choice"] = tf.convert_to_tensor( + feature_map["problem_choice"]) + features["input_space_id"] = input_space_id + features["target_space_id"] = target_space_id + features["decode_length"] = (IMAGE_DECODE_LENGTH + if input_is_image else inputs[1]) + features["inputs"] = x + return features + + +def _decode_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. + hparams: model hyperparameters + + Returns: + a features dictionary, as expected by the decoder. + """ + inputs = tf.convert_to_tensor(feature_map["inputs"]) + input_is_image = False + + def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring + p_hparams = hparams.problems[problem_choice] + # Add a third empty dimension dimension + x = tf.expand_dims(x, axis=[2]) + x = tf.to_int32(x) + return (tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id), x) + + input_space_id, target_space_id, x = input_fn_builder.cond_on_index( + input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) + + features = {} + features["problem_choice"] = feature_map["problem_choice"] + features["input_space_id"] = input_space_id + features["target_space_id"] = target_space_id + features["decode_length"] = (IMAGE_DECODE_LENGTH + if input_is_image else tf.shape(x)[1] + 50) + features["inputs"] = x + return features diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py index bef95d58f..abec8d4ad 100644 --- a/tensor2tensor/utils/input_fn_builder.py +++ b/tensor2tensor/utils/input_fn_builder.py @@ -47,7 +47,7 @@ def build_input_fn(mode, evaluation, and testing prediction. Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + mode: The execution mode, as defined in tf.estimator.ModeKeys. hparams: HParams object. data_file_patterns: The list of file patterns to use to read in data. Set to `None` if you want to create a placeholder for the input data. The @@ -98,7 +98,7 @@ def input_fn(): data_reader.hparams_to_batching_scheme( hparams, shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or hparams.eval_drop_long_sequences), length_multiplier=(p_hparams.batch_size_multiplier))) @@ -137,7 +137,7 @@ def input_fn(): trainable=False)) if fixed_problem is None: if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): + mode != tf.estimator.ModeKeys.TRAIN): problem_choice = tf.random_uniform( [], maxval=problem_count, dtype=tf.int32) elif hparams.problem_choice == "adaptive": @@ -169,7 +169,7 @@ def input_fn(): inp_id.set_shape([]) tgt_id.set_shape([]) # Forced shape obfuscation is necessary for inference. 
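Both `*_to_features_dict` helpers above dispatch on the runtime value of `problem_choice` through `input_fn_builder.cond_on_index`. Its implementation is not shown in this patch; judging from the call sites, it behaves like a chained `tf.cond` over the index range, roughly as follows (an assumption about the helper's behavior, not the library's code):

```python
import tensorflow as tf

def cond_on_index(fn, index_tensor, cur_idx, max_idx):
  """Evaluates fn(i) for the runtime value i of index_tensor by
  chaining tf.cond over cur_idx..max_idx (sketch of assumed behavior)."""
  if cur_idx == max_idx:
    return fn(cur_idx)
  return tf.cond(
      tf.equal(index_tensor, cur_idx),
      lambda: fn(cur_idx),
      lambda: cond_on_index(fn, index_tensor, cur_idx + 1, max_idx))
```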
- if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access @@ -180,15 +180,14 @@ def input_fn(): "input_space_id": inp_id, "target_space_id": tgt_id } - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: rand_feature_map["infer_targets"] = rand_target rand_target = None - # This is because of a bug in the tf.contrib.learn Estimator that - # short-circuits prediction if it doesn't see a QueueRunner. - # DummyQueueRunner implements the minimal expected interface but does - # nothing. - # TODO(rsepassi): Remove once we move to core Estimator. + # This is because of a bug in the Estimator that short-circuits prediction + # if it doesn't see a QueueRunner. DummyQueueRunner implements the + # minimal expected interface but does nothing. tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner()) + return rand_feature_map, rand_target return input_fn diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index baff66669..5bfad5338 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -20,8 +20,6 @@ # Dependency imports -import six - from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook from tensor2tensor.utils import rouge @@ -197,6 +195,8 @@ def create_evaluation_metrics(problems, model_hparams): model_hparams: a set of hparams. Returns: + Dict . The metric functions have signature + (predictions, labels, problem_choice) -> (metric Tensor, update op). A dictionary with keys that are strings naming the evaluation metrics and values that are functions taking arguments of (predictions, targets), returning a tuple of a tensor of the @@ -210,8 +210,7 @@ def create_evaluation_metrics(problems, model_hparams): def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): """Create a metric fn conditioned on problem_idx.""" - def problem_metric_fn(predictions, labels, weights): - problem_choice = weights + def problem_metric_fn(predictions, labels, problem_choice): (scores, weights) = tf.cond( tf.equal(problem_idx, problem_choice), lambda: metric_fn(predictions, labels, weights_fn=weights_fn), @@ -258,11 +257,7 @@ def problem_metric_fn(predictions, labels, weights): metric_fn, problem_idx, weights_fn) eval_metrics["metrics-%s/%s" % (problem_name, metric)] = problem_metric_fn - return { - k: tf.contrib.learn.MetricSpec( - v, prediction_key="predictions", weight_key="problem_choice") - for (k, v) in six.iteritems(eval_metrics) - } + return eval_metrics # Metrics are functions that take predictions and labels and return diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 34af6c827..21ef96b28 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -33,6 +33,7 @@ from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import devices from tensor2tensor.utils import input_fn_builder +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry from tensor2tensor.utils import yellowfin @@ -42,9 +43,6 @@ # TODO(rsepassi): Rm dep on FLAGS here FLAGS = tf.flags.FLAGS -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 - def log_variable_sizes(var_list, tag): 
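The `metrics.py` hunk above drops the `tf.contrib.learn.MetricSpec` wrapping: under `tf.estimator`, the evaluation metrics are plain functions with signature `(predictions, labels, problem_choice) -> (metric Tensor, update op)`, and the per-problem gating happens inside via `tf.cond`. A minimal sketch of that conditioning pattern, with a hypothetical `metric_fn` returning `(scores, weights)`:

```python
import tensorflow as tf

def make_problem_metric_fn(metric_fn, problem_idx):
  """Only scores batches whose problem_choice equals problem_idx;
  other batches contribute zero weight to the running mean."""
  def problem_metric_fn(predictions, labels, problem_choice):
    scores, weights = tf.cond(
        tf.equal(problem_idx, problem_choice),
        lambda: metric_fn(predictions, labels),
        lambda: (tf.constant(0.0), tf.constant(0.0)))
    return tf.metrics.mean(scores, weights)
  return problem_metric_fn
```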
"""Log the sizes and shapes of variables, and the total size. @@ -64,90 +62,30 @@ def log_variable_sizes(var_list, tag): tf.logging.info("%s Total size: %d", tag, total_size) -def build_model_fn(model, hparams): +def build_model_fn(model): """Returns a function to build the model. Args: model: The name of the model to use. - hparams: The hyperparameters. Returns: A function to build the model's graph. This function is called by the Estimator object to construct the graph. """ - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - elif hparams.learning_rate_decay_scheme == "cyclelinear10x": - # Cycle the rate linearly by 10x every warmup_steps, up and down. - cycle_steps = hparams.learning_rate_warmup_steps - cycle_position = step % (2 * cycle_steps) - cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. - cycle_position - cycle_steps) / float(cycle_steps) - cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. - return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): + def model_fn(features, labels, mode, params): """Creates the prediction, loss, and train ops. Args: features: A dictionary of tensors keyed by the feature name. 
- targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + labels: A tensor representing the labels. + mode: The execution mode, as defined in tf.estimator.ModeKeys. + params: model HParams. Returns: - A tuple consisting of the prediction, loss, and train_op. + An EstimatorSpec. """ + hparams = params # Deep-copy the model hparams between modes to eliminate # side-effects caused by abuse of the linked problem_hparams # objects which are used to share modality objects between @@ -159,19 +97,76 @@ def model_fn(features, targets, mode): # could be created once per mode and passed to the constructor of # t2t_model. my_hp = copy.deepcopy(hparams) - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, my_hp) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, my_hp) - if targets is not None: - features["targets"] = targets + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + elif hparams.learning_rate_decay_scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) + elif hparams.learning_rate_decay_scheme == "cyclelinear10x": + # Cycle the rate linearly by 10x every warmup_steps, up and down. + cycle_steps = hparams.learning_rate_warmup_steps + cycle_position = step % (2 * cycle_steps) + cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. + cycle_position - cycle_steps) / float(cycle_steps) + cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. + return ( + cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). 
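Of the decay schemes listed above, "noam" is the one the Transformer configs use: linear warmup, then decay as the inverse square root of the step. The same formula in NumPy, for inspection:

```python
import numpy as np

def noam_lr(step, hidden_size=512, warmup_steps=4000):
  """The "noam" schedule: linear warmup, then ~step**-0.5 decay."""
  return 5000.0 * hidden_size**-0.5 * min(
      (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5)

# The two branches meet at step + 1 == warmup_steps, the schedule's peak:
# noam_lr(3999) == 5000 / sqrt(512) / sqrt(4000) ~= 3.49
```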
+ + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + if labels is not None: + features["targets"] = labels dp = devices.data_parallelism() tf.get_variable_scope().set_initializer(initializer()) - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN # Add input statistics for incoming features. with tf.name_scope("input_stats"): @@ -218,7 +213,7 @@ def nth_model(n): n, dp, devices.ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: return model_class.infer( features, beam_size=FLAGS.decode_beam_size, @@ -235,7 +230,7 @@ def nth_model(n): # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) if (FLAGS.eval_run_autoregressive and - mode == tf.contrib.learn.ModeKeys.EVAL): + mode == tf.estimator.ModeKeys.EVAL): sharded_logits, losses_dict = model_class.eval_autoregressive(features) else: sharded_logits, losses_dict = model_class.model_fn( @@ -272,36 +267,50 @@ def nth_model(n): features["problem_choice"], 0, len(my_hp.problems) - 1) - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: # Beam search in sequence model returns both decodes withe key "outputs" # and scores with they key "scores". If return list is a dict, we expect # that it will have keys "outputs", a tensor of int32 and scores, a # tensor of floats. This is useful if we want to return scores from # estimator.predict if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None + predictions = {"outputs": result_list} else: - ret = { + predictions = { "outputs": result_list["outputs"], "scores": result_list["scores"] - }, None, None + } + if "inputs" in features: - ret[0]["inputs"] = features["inputs"] + predictions["inputs"] = features["inputs"] if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret + predictions["targets"] = features["infer_targets"] + predictions["problem_choice"] = (features["problem_choice"] * tf.ones( + (tf.shape(features["inputs"])[0],), dtype=tf.int32)) + + return tf.estimator.EstimatorSpec(mode, predictions=predictions) sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: + if mode == tf.estimator.ModeKeys.EVAL: # For evaluation, return the logits layer as our predictions. 
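The fallback branch above wraps every non-"noam" scheme in the same warmup: an exponential ramp from 1% of the base rate up to 100% over `warmup_steps`, after which the chosen decay takes over. The ramp in NumPy (same arithmetic as the `inv_base`/`inv_decay` lines above):

```python
import numpy as np

def warmup_then_decay(step, warmup_steps, decay_fn):
  """Exponential warmup from 0.01x to 1x, then hand off to decay_fn."""
  inv_base = np.exp(np.log(0.01) / warmup_steps)  # 0.01 ** (1 / warmup_steps)
  if step < warmup_steps:
    return inv_base ** (warmup_steps - step)  # 0.01 at step 0, 1.0 at warmup
  return decay_fn(step - warmup_steps)
```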
logits = tf.concat(sharded_logits, 0) - ret = { - "predictions": logits, - "problem_choice": features["problem_choice"], - } - return ret, total_loss, None - assert mode == tf.contrib.learn.ModeKeys.TRAIN + eval_metrics_fns = metrics.create_evaluation_metrics( + zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) + _check_autotune_metrics(eval_metrics_fns) + + eval_metrics = {} + for metric_name, metric_fn in six.iteritems(eval_metrics_fns): + eval_metrics[metric_name] = metric_fn(logits, labels, + features["problem_choice"]) + + return tf.estimator.EstimatorSpec( + mode, + predictions={"predictions": logits}, + eval_metric_ops=eval_metrics, + loss=total_loss) + + assert mode == tf.estimator.ModeKeys.TRAIN # Some training statistics. with tf.name_scope("training_stats"): @@ -381,7 +390,11 @@ def nth_model(n): del summaries[i] tf.logging.info("Global model_fn finished.") - return {"problem_choice": features["problem_choice"]}, total_loss, train_op + return tf.estimator.EstimatorSpec( + mode, + predictions={"problem_choice": features["problem_choice"]}, + loss=total_loss, + train_op=train_op) return model_fn @@ -431,81 +444,8 @@ def _exp_decay_after(step, rate, from_which_step): name="exponential_decay_step_cond") -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = input_fn_builder.cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. 
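The shape of the new contract, independent of T2T's sharding and multi-problem machinery: a `model_fn(features, labels, mode, params)` returns a single `tf.estimator.EstimatorSpec` whose fields depend on the mode. A minimal self-contained sketch (the `params["num_classes"]` key is illustrative, not a T2T hparam):

```python
import tensorflow as tf

def model_fn(features, labels, mode, params):
  logits = tf.layers.dense(tf.to_float(features["inputs"]),
                           params["num_classes"])
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {"outputs": tf.argmax(logits, axis=-1)}
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics = {"accuracy": tf.metrics.accuracy(
        labels=labels, predictions=tf.argmax(logits, axis=-1))}
    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                      eval_metric_ops=eval_metrics)

  assert mode == tf.estimator.ModeKeys.TRAIN
  train_op = tf.train.AdamOptimizer().minimize(
      loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
```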
- """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = input_fn_builder.cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features +def _check_autotune_metrics(metrics_dict): + if (hasattr(FLAGS, "autotune") and FLAGS.autotune and + FLAGS.objective not in metrics_dict): + raise ValueError("Tuning objective %s not among evaluation metrics %s" % + (FLAGS.objective, metrics_dict.keys())) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index d3fc6dac1..32627f7e3 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -69,7 +69,7 @@ def __init__(self, Args: hparams: a hyperparameters object. - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + mode: The execution mode, as defined in tf.estimator.ModeKeys. problem_hparams: a hyperparameters object. problem_idx: an integer. data_parallelism: a expert_utils.parallelism @@ -86,7 +86,7 @@ def __init__(self, hparams = copy.copy(hparams) hparams.add_hparam("mode", mode) # When not in training mode, set all forms of dropout to zero. 
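The dropout-zeroing comment above documents a naming convention rather than an API: any hparam whose name ends in "dropout" is forced to 0.0 outside of training. In plain Python, the rule is (a sketch over a dict stand-in for HParams):

```python
def zero_dropout(hparams_dict, is_training):
  """Returns a copy with every *dropout hparam set to 0.0 unless training."""
  if is_training:
    return dict(hparams_dict)
  return {k: (0.0 if k.endswith("dropout") else v)
          for k, v in hparams_dict.items()}

# e.g. zero_dropout({"attention_dropout": 0.1, "hidden_size": 512}, False)
#      -> {"attention_dropout": 0.0, "hidden_size": 512}
```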
- if mode != tf.contrib.learn.ModeKeys.TRAIN: + if mode != tf.estimator.ModeKeys.TRAIN: for key in hparams.values(): if key[-len("dropout"):] == "dropout": setattr(hparams, key, 0.0) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 3248d9ca9..83db7c007 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -30,7 +30,6 @@ from tensor2tensor.utils import data_reader from tensor2tensor.utils import devices from tensor2tensor.utils import input_fn_builder -from tensor2tensor.utils import metrics from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry @@ -155,12 +154,6 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, output_dir=output_dir, data_dir=data_dir, model_name=model_name) - eval_metrics = metrics.create_evaluation_metrics( - zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) - if (hasattr(FLAGS, "autotune") and FLAGS.autotune and - FLAGS.objective not in eval_metrics): - raise ValueError("Tuning objective %s not among evaluation metrics %s" % - (FLAGS.objective, eval_metrics.keys())) train_monitors = [] eval_hooks = [] if FLAGS.tfdbg: @@ -169,9 +162,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], - eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], - eval_metrics=eval_metrics, + train_input_fn=input_fns[tf.estimator.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.estimator.ModeKeys.EVAL], train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, @@ -185,39 +177,37 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): num_datashards = devices.data_parallelism().n train_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.TRAIN, + mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, - tf.contrib.learn.ModeKeys.TRAIN), + tf.estimator.ModeKeys.TRAIN), num_datashards=num_datashards, worker_replicas=FLAGS.worker_replicas, worker_id=FLAGS.worker_id) eval_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.EVAL, + mode=tf.estimator.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, - tf.contrib.learn.ModeKeys.EVAL), + tf.estimator.ModeKeys.EVAL), num_datashards=num_datashards, worker_replicas=FLAGS.worker_replicas, worker_id=FLAGS.worker_id) - estimator = tf.contrib.learn.Estimator( - model_fn=model_builder.build_model_fn(model_name, hparams), + estimator = tf.estimator.Estimator( + model_fn=model_builder.build_model_fn(model_name), model_dir=output_dir, + params=hparams, config=tf.contrib.learn.RunConfig( master=FLAGS.master, - model_dir=output_dir, gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction, session_config=session_config(), keep_checkpoint_max=FLAGS.keep_checkpoint_max, keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours, save_checkpoints_secs=FLAGS.save_checkpoints_secs)) - # Store the hparams in the estimator as well - estimator.hparams = hparams return estimator, { - tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, - tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + tf.estimator.ModeKeys.TRAIN: train_input_fn, + tf.estimator.ModeKeys.EVAL: eval_input_fn } @@ -330,9 +320,15 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if 
schedule == "local_run": # Run the local demo. exp = exp_fn(output_dir) - if exp.train_steps > 0 or exp.eval_steps > 0: + if exp.train_steps > 0 and exp.eval_steps > 0: tf.logging.info("Performing local training and evaluation.") exp.train_and_evaluate() + elif exp.train_steps > 0: + tf.logging.info("Performing local training.") + exp.train() + elif exp.eval_steps > 0: + tf.logging.info("Performing local evaluation.") + exp.evaluate(delay_secs=0) else: # Perform distributed training/evaluation. learn_runner.run( diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 6cc654d26..e71fc16c2 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -106,7 +106,7 @@ def testSingleEvalStepRawSession(self): encoders = registry.problem(FLAGS.problems).feature_encoders(data_dir) hparams = trainer_utils.create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir) - model_fn = model_builder.build_model_fn(model_name, hparams) + model_fn = model_builder.build_model_fn(model_name) inputs_ph = tf.placeholder(dtype=tf.int32) # Just length dimension. batch_inputs = tf.reshape(inputs_ph, [1, -1, 1, 1]) # Make it 4D. targets_ph = tf.placeholder(dtype=tf.int32) # Just length dimension. @@ -117,9 +117,10 @@ def testSingleEvalStepRawSession(self): "target_space_id": hparams.problems[0].target_space_id} # Now set a mode and create the graph by invoking model_fn. - mode = tf.contrib.learn.ModeKeys.EVAL - predictions_dict, _, _ = model_fn( # In INFER mode targets can be None. - features, batch_targets, mode) + mode = tf.estimator.ModeKeys.EVAL + estimator_spec = model_fn( # In INFER mode targets can be None. + features, batch_targets, mode, hparams) + predictions_dict = estimator_spec.predictions predictions = tf.squeeze( # These are not images, axis=2,3 are not needed. 
predictions_dict["predictions"], axis=[2, 3]) diff --git a/tensor2tensor/visualization/TransformerVisualization.ipynb b/tensor2tensor/visualization/TransformerVisualization.ipynb index e3fb8f958..166e0c9c5 100644 --- a/tensor2tensor/visualization/TransformerVisualization.ipynb +++ b/tensor2tensor/visualization/TransformerVisualization.ipynb @@ -127,9 +127,9 @@ "num_datashards = utils.devices.data_parallelism().n\n", "\n", "problems_data = utils.get_data_filepatterns(\n", - " DATA_DIR, tf.contrib.learn.ModeKeys.EVAL)\n", + " DATA_DIR, tf.estimator.ModeKeys.EVAL)\n", "input_fn = utils.input_fn_builder.build_input_fn(\n", - " mode=tf.contrib.learn.ModeKeys.EVAL,\n", + " mode=tf.estimator.ModeKeys.EVAL,\n", " hparams=hparams,\n", " data_file_patterns=problems_data,\n", " num_datashards=num_datashards)\n", @@ -192,8 +192,9 @@ } ], "source": [ - "model_fn=utils.model_builder.build_model_fn(MODEL, hparams=hparams)\n", - "sharded_logits, training_loss, extra_loss = model_fn(features, target, tf.contrib.learn.ModeKeys.EVAL)" + "model_fn=utils.model_builder.build_model_fn(MODEL)\n", + "spec = model_fn(features, target, tf.estimator.ModeKeys.EVAL, hparams)\n", + "predictions_dict = spec.predictions", ] }, { @@ -215,7 +216,8 @@ ], "source": [ "with tf.variable_scope(tf.get_variable_scope(), reuse=True):\n", - " beam_out = model_fn(features, target, tf.contrib.learn.ModeKeys.INFER)" + " spec = model_fn(features, target, tf.estimator.ModeKeys.PREDICT, hparams)\n", + " beam_out = spec.predictions['outputs']", ] }, { @@ -324,7 +326,7 @@ } ], "source": [ - "inp, out, logits = sess.run([inputs['inputs'], target, sharded_logits['predictions']])\n", + "inp, out, logits = sess.run([inputs['inputs'], target, predictions_dict['predictions']])\n", "\n", "print(\"Input: \", decode(inp[0]))\n", "print(\"Gold: \", decode(out[0]))\n", @@ -366,7 +368,7 @@ ], "source": [ "inp_ids = encode(eng)\n", - "beam_decode = sess.run(beam_out[0]['outputs'], {\n", + "beam_decode = sess.run(beam_out, {\n", " inputs['inputs']: np.expand_dims(np.expand_dims(inp_ids, axis=2), axis=3),\n", "})\n", "trans = decode(beam_decode[0])\n", From cb181de23926052a042ee5e6fa9bda0d21dc8f23 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 18:55:59 -0700 Subject: [PATCH 20/32] Enable passing padded_shapes in padded_batch; log_device_placement FLAG PiperOrigin-RevId: 167805495 --- tensor2tensor/models/transformer.py | 17 ++++++---- tensor2tensor/utils/data_reader.py | 44 ++++++++++++++++++------- tensor2tensor/utils/data_reader_test.py | 2 +- tensor2tensor/utils/trainer_utils.py | 5 ++- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 38766ec19..d3a406a29 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -191,10 +191,13 @@ def transformer_encoder(encoder_input, """ x = encoder_input with tf.variable_scope(name): - pad_remover = expert_utils.PadRemover( - common_attention.attention_bias_to_padding(encoder_self_attention_bias)) - for layer in xrange( - hparams.num_encoder_layers or hparams.num_hidden_layers): + pad_remover = None + if hparams.use_pad_remover: + pad_remover = expert_utils.PadRemover( + common_attention.attention_bias_to_padding( + encoder_self_attention_bias)) + for layer in xrange(hparams.num_encoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -237,8 +240,8 
@@ def transformer_decoder(decoder_input, """ x = decoder_input with tf.variable_scope(name): - for layer in xrange( - hparams.num_decoder_layers or hparams.num_hidden_layers): + for layer in xrange(hparams.num_decoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -362,6 +365,8 @@ def transformer_base(): hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", int(False)) + hparams.add_hparam("use_pad_remover", int(True)) + return hparams diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index f48665078..e89b9b808 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -267,11 +267,23 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, lambda ex: _preprocess(ex, problem, data_file_pattern, hparams, mode), num_threads=num_threads) dataset = dataset.filter( - lambda ex: _example_too_big(ex, batching_scheme["max_length"])) + lambda ex: example_valid_size(ex, batching_scheme["max_length"])) + + bucket_id_fn = _example_length + if len(batching_scheme["boundaries"]) == 1: + bucket_id_fn = lambda _: tf.constant(0) + + if "padded_shapes" not in batching_scheme: + batching_scheme["padded_shapes"] = None dataset = bucket_by_sequence_length( - dataset, _example_length, batching_scheme["boundaries"], - batching_scheme["batch_sizes"], batching_scheme["window_size"]) + dataset, + bucket_id_fn, + batching_scheme["boundaries"], + batching_scheme["batch_sizes"], + batching_scheme["window_size"], + padded_shapes=batching_scheme["padded_shapes"]) + # We reshuffle the batches to prevent many long-sequence batches at once. # TODO(rsepassi): Rm hasattr call once new dynamic window size functionality # is in a stable TF release. @@ -307,12 +319,16 @@ def _example_length(example): return length -def _example_too_big(example, max_length): +def example_valid_size(example, max_length): return tf.less_equal(_example_length(example), max_length) -def bucket_by_sequence_length(dataset, example_length_fn, bucket_boundaries, - bucket_batch_sizes, window_size): +def bucket_by_sequence_length(dataset, + example_length_fn, + bucket_boundaries, + bucket_batch_sizes, + window_size, + padded_shapes=None): """Bucket entries in dataset by length. Args: @@ -322,6 +338,8 @@ def bucket_by_sequence_length(dataset, example_length_fn, bucket_boundaries, bucket_boundaries: list, boundaries of the buckets. bucket_batch_sizes: list, batch size per bucket. window_size: an integer divisible by all elements of bucket_batch_sizes + padded_shapes: dict>, optional, shapes of the + features with None where feature should be padded to max in that dim. Returns: Dataset of padded and batched examples. @@ -351,12 +369,7 @@ def window_size_fn(bucket_id): def batching_fn(bucket_id, grouped_dataset): batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64) batch_size = batch_sizes[bucket_id] - - # Pad each dimension of each feature so that they match. - padded_shapes = dict( - [(name, [None] * len(shape)) - for name, shape in grouped_dataset.output_shapes.items()]) - return grouped_dataset.padded_batch(batch_size, padded_shapes) + return padded_batch(grouped_dataset, batch_size, padded_shapes) # TODO(rsepassi): Rm branch once the new group_by_window functionality is in # a stable TF release. 
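The `padded_shapes` plumbing above lets a problem pin feature dimensions instead of always padding every dimension to the batch maximum: `None` in a shape means "pad to the longest example in this batch", while an integer fixes the size. A toy usage sketch, assuming a TF release where `tf.data` is stable (earlier 1.x releases expose the same `Dataset` methods under `tf.contrib.data`):

```python
import tensorflow as tf

examples = [{"inputs": [1, 2, 3]}, {"inputs": [4]}, {"inputs": [5, 6]}]
dataset = tf.data.Dataset.from_generator(
    lambda: iter(examples), {"inputs": tf.int32}, {"inputs": [None]})
# Pad "inputs" to the longest sequence within each batch of 2:
batched = dataset.padded_batch(2, padded_shapes={"inputs": [None]})
batch = batched.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  print(sess.run(batch))  # {'inputs': [[1, 2, 3], [4, 0, 0]]}
```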
@@ -371,6 +384,13 @@ def batching_fn(bucket_id, grouped_dataset): return dataset +def padded_batch(dataset, batch_size, padded_shapes=None): + padded_shapes = padded_shapes or dict( + [(name, [None] * len(shape)) + for name, shape in dataset.output_shapes.items()]) + return dataset.padded_batch(batch_size, padded_shapes) + + def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1): """A default set of length-bucket boundaries.""" assert min_length <= max_length diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index aed2598c7..f03ce6da2 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -158,7 +158,7 @@ def testLengthFilter(self): max_len = 15 dataset = data_reader.read_examples(self.problem, self.filepatterns[0], 32) dataset = dataset.filter( - lambda ex: data_reader._example_too_big(ex, max_len)) + lambda ex: data_reader.example_valid_size(ex, max_len)) examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: ex_lens = [] diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 83db7c007..08359ea5c 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -75,6 +75,8 @@ "Save checkpoints every this many seconds. " "Default=0 means let tensorflow.contrib.learn.python.learn" " decide, which is currently set to 600 = 10 minutes.") +flags.DEFINE_bool("log_device_placement", False, + "Whether to log device placement.") # Distributed training flags flags.DEFINE_string("master", "", "Address of TensorFlow master.") @@ -369,7 +371,8 @@ def session_config(): config = tf.ConfigProto( allow_soft_placement=True, graph_options=graph_options, - gpu_options=gpu_options) + gpu_options=gpu_options, + log_device_placement=FLAGS.log_device_placement) return config From ad57b3b2a4bd401464010778a39784644a055c9f Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 22:18:07 -0700 Subject: [PATCH 21/32] correct transformer ranged hparams PiperOrigin-RevId: 167817267 --- tensor2tensor/models/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index d3a406a29..a2e76dd13 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -664,6 +664,6 @@ def transformer_base_range(rhp): rhp.set_discrete("learning_rate_warmup_steps", [1000, 2000, 4000, 8000, 16000]) rhp.set_float("initializer_gain", 0.5, 2.0) - rhp.set_float("optimizer_adam_beta2", 0.85, 0.95) + rhp.set_float("optimizer_adam_beta1", 0.85, 0.95) rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) rhp.set_float("weight_decay", 0.0, 2.0) From 772337a811579a32078228d43e9572ccad4a669a Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 7 Sep 2017 07:05:14 -0700 Subject: [PATCH 22/32] bug fix to link function for log_poisson loss PiperOrigin-RevId: 167855204 --- tensor2tensor/layers/modalities.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 1d606ec1d..8e76c8051 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -475,19 +475,16 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): @registry.register_real_modality("log_poisson_loss") class RealLogPoissonLossModality(RealL2LossModality): """Modality for real (i.e. 
float) vectors with log Poisson regression loss.
-
-  * Top is a linear projection to vocab size followed by a softplus
-    transform (log(exp(features) + 1)).
   """

-  def top(self, body_output, _):
-    with tf.variable_scope("real"):
-      return tf.nn.softplus(tf.layers.dense(body_output, self._vocab_size))
+  def bottom(self, x):
+    return x

   def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
     predictions = top_out
     with tf.name_scope("log_possion"):
       weights = weights_fn(targets)
+
       lp_loss = tf.nn.log_poisson_loss(targets, predictions)
       return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights)


From 73af26b1968efee44b41c0efaafa66bc393d2a29 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 7 Sep 2017 11:40:57 -0700
Subject: [PATCH 23/32] change default initializer in lstm to uniform_unit_scaling

PiperOrigin-RevId: 167888817
---
 tensor2tensor/models/lstm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py
index 9f909433e..d1c3101b4 100644
--- a/tensor2tensor/models/lstm.py
+++ b/tensor2tensor/models/lstm.py
@@ -272,6 +272,7 @@ def lstm_attention():
   hparams.batch_size = 1024
   hparams.hidden_size = 128
   hparams.num_hidden_layers = 2
+  hparams.initializer = "uniform_unit_scaling"

   # Attention
   hparams.add_hparam("attn_vec_size", hparams.hidden_size)

From 5fcc9bc8d813607bc2e93d680ea6a08cacf83a2c Mon Sep 17 00:00:00 2001
From: Etienne Pot
Date: Thu, 7 Sep 2017 15:33:38 -0700
Subject: [PATCH 24/32] Experts now process each sequence individually to
 reduce the attention matrix size.

PiperOrigin-RevId: 167921622
---
 tensor2tensor/layers/common_attention.py | 82 ++++++++++++++-------
 tensor2tensor/models/attention_lm_moe.py |  7 ++
 tensor2tensor/utils/expert_utils.py      | 91 ++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 26 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 84f8d2d9a..3f3885b10 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -441,7 +441,9 @@ def dot_product_attention(q,
     weights = tf.nn.softmax(logits, name="attention_weights")
     # dropping out the attention links for each of the heads
     weights = tf.nn.dropout(weights, 1.0 - dropout_rate)
-    if not tf.get_variable_scope().reuse:
+    if (not tf.get_variable_scope().reuse and
+        # Summaries don't work well within tf.while_loop()
+        "/while/" not in tf.contrib.framework.get_name_scope()):
       attention_image_summary(weights, image_shapes)
     return tf.matmul(weights, v)

@@ -1242,6 +1244,7 @@ def self_attention_expert(
     x,
     batch_coordinate,
     mask_right=True,
+    split_batch=False,
     attention_kq_size=None,
     attention_v_size=None,
 ):
@@ -1255,6 +1258,9 @@
       positions from different sequences don't attend to each other.
     mask_right: A bool. If true, we will not attend to positions on the right,
       just as decoder self attention.
+    split_batch (bool): If True, each sequence of the batch is processed
+      individually in a loop. If False, the sequences are processed all at
+      once and a mask is applied to isolate the sequences from each other.
     attention_kq_size (int): dimension used for the attention key, and query
     attention_v_size (int): dimension used for the attention value

@@ -1289,32 +1295,58 @@
   def length_not_null(x, batch_coordinate):
     """Branch of the graph only evaluated when length isn't null."""
+
+    # Mask between the sequences (not used if map_ids is used)
     with tf.name_scope("expert_mask"):
-      batch_coordinate = tf.squeeze(batch_coordinate, 1)
+      batch_coord_float = tf.squeeze(batch_coordinate, 1)
       # Convert to float first because of b/25387198
-      batch_coordinate = tf.to_float(batch_coordinate)
-      bc_v = tf.expand_dims(batch_coordinate, 1)
-      bc_h = tf.expand_dims(batch_coordinate, 0)
-      bias = bc_v - bc_h  # Broadcast to create [length, length] mask
-      bias = tf.minimum(1.0, tf.abs(bias))  # Theshold non zeros to 1.0
-      bias *= -1e9  # Set non zeros to -infinity
-
-      if mask_right:
-        bias += tf.reshape(
+      batch_coord_float = tf.to_float(batch_coord_float)
+      bc_v = tf.expand_dims(batch_coord_float, 1)
+      bc_h = tf.expand_dims(batch_coord_float, 0)
+      bias_batch = bc_v - bc_h  # Broadcast to create [length, length] mask
+      # Threshold non zeros to 1.0
+      bias_batch = tf.minimum(1.0, tf.abs(bias_batch))
+      bias_batch *= -1e9  # Set non zeros to -infinity
+
+    def add_or_set_if(prev_bias, new_bias, condition):
+      """Add the biases together while considering the None case."""
+      if not condition:
+        return prev_bias
+      elif prev_bias is None:
+        return new_bias
+      else:
+        return prev_bias + new_bias
+
+    def mask_and_call_attention(x):
+      """Function applied once for each sequence of the batch."""
+
+      # Mask to prevent sequences from attending to the future
+      length = tf.shape(x)[1]  # x has shape [1, length,...]
+      bias_past = tf.reshape(
           attention_bias_lower_triangle(length), [length, length])
-      # bias has shape [length, length]
-      bias = tf.reshape(bias, [1, 1, length, length])
-      x = tf.reshape(x, [1, length, depth])
-      out = multihead_attention(x,
-                                None,
-                                bias,
-                                total_key_depth=attention_kq_size,
-                                total_value_depth=attention_v_size,
-                                output_depth=depth,
-                                num_heads=1,
-                                dropout_rate=0.0)
-      out = tf.squeeze(out, 0)
-
+      # bias has shape [length, length]
+      bias_past = tf.reshape(bias_past, [1, 1, length, length])
+
+      bias = None
+      bias = add_or_set_if(bias, bias_past, mask_right)
+      bias = add_or_set_if(bias, bias_batch, not split_batch)
+
+      return multihead_attention(
+          x,
+          None,
+          bias,
+          total_key_depth=attention_kq_size,
+          total_value_depth=attention_v_size,
+          output_depth=depth,
+          num_heads=1,
+          dropout_rate=0.0)
+
+    if split_batch:
+      out = expert_utils.map_ids(x, batch_coordinate, mask_and_call_attention)
+    else:
+      x = tf.reshape(x, [1, length, depth])
+      out = mask_and_call_attention(x)
+      out = tf.squeeze(out, 0)
     return out

   # If the length is empty, just forward an empty tensor (avoid having to
@@ -1326,8 +1358,6 @@ def length_not_null(x, batch_coordinate):
   )
   return out

-# functools.partial(self_attention_expert, mask_right=, depth=)
-

 def local_expert_attention(
     x,
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 596d5b01d..87d456b7d 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -127,6 +127,8 @@ def print_shape(x, suffix, debug=False):
     x = dp_remove_pad(x)
     x = dp(print_shape, x, "in_flat")

+    assert hparams.batch_size >= hparams.max_length
+
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope(
@@ -161,6 +163,7 @@
               train=hparams.mode == ModeKeys.TRAIN,
               batch_coordinate=batch_coordinate,
               mask_right=True,
+              split_batch=bool(hparams.attention_split_batch),
               attention_kq_size=hparams.attention_kq_size,
               attention_v_size=hparams.attention_v_size)
           # TODO(avaswani, epot, noam): Do we need to divide by num shards ?
@@ -344,6 +347,7 @@ def attention_lm_moe_base():
   hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
   hparams.add_hparam("attention_moe_k", 2)
   hparams.add_hparam("attention_num_experts", 16)
+  hparams.add_hparam("attention_split_batch", int(False))
   # Key, query and value dimensions for the attention
   hparams.add_hparam("attention_kq_size", 128)
   hparams.add_hparam("attention_v_size", 256)
@@ -366,6 +370,9 @@ def attention_lm_moe_base_ae():
   hparams.min_length_bucket = 256  # Avoid cyclic problems for big batches
   hparams.learning_rate = 0.05
   hparams.learning_rate_warmup_steps = 10000
+  # According to Noam, ("n", "da") seems better for harder-to-learn models
+  # hparams.layer_preprocess_sequence = "n"
+  # hparams.layer_postprocess_sequence = "da"
   return hparams


diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py
index 16820ff37..8865b9271 100644
--- a/tensor2tensor/utils/expert_utils.py
+++ b/tensor2tensor/utils/expert_utils.py
@@ -23,6 +23,7 @@
 from __future__ import division
 from __future__ import print_function

+import functools
 import math

 # Dependency imports
@@ -60,6 +61,27 @@ def convert_gradient_to_tensor(x):
   return x


+def add_name_scope(scope):
+  """Return a decorator which adds a TF name scope to a function.
+
+  Args:
+    scope (str): name of the name scope
+
+  Returns:
+    fct: the add_name_scope decorator
+  """
+  def decorator(f):
+
+    @functools.wraps(f)
+    def decorated(*args, **kwargs):
+      with tf.name_scope(scope):
+        return f(*args, **kwargs)
+
+    return decorated
+
+  return decorator
+
+
 class Parallelism(object):
   """Helper class for creating sets of parallel function calls.

@@ -517,6 +539,75 @@ def restore(self, x):
     return x


+@add_name_scope("map_ids")
+def map_ids(x, indices, map_fn):
+  """Apply a function to each coordinate id of a multidimensional tensor.
+
+  This allows each sequence of a batch to be processed independently. It is
+  similar to tf.map_fn, but over a tensor whose batch dim has been flattened.
+
+  Warning: The indices have to be contiguous and ordered in memory, as the
+  output vectors for each of the ids are simply concatenated after being
+  processed.
+  Ex: if your indices are [0,2,2,1,2,0], the output will contain the processed
+  rows in the following order: [0,0,1,2,2,2]
+
+  Args:
+    x (Tensor): The tensor to be dispatched, of shape [length,...]
+    indices (Tensor): An int32 tensor of size [length, 1] containing the batch
+      coordinate of x
+    map_fn (fct): Function called for every id of the original tensor. Takes
+      as input a tensor of the same rank as x, of shape [length_id,...] with
+      length_id <= length. Isn't called if length_id == 0
+
+  Returns:
+    a tensor of the same shape as x, where each element has been processed
+  """
+  indices = tf.reshape(indices, [-1])
+
+  t_i = tf.constant(0)
+  # batch_coordinates start at 0
+  t_batch_size = tf.reduce_max(indices) + 1
+
+  # ta_stack_out will store the intermediate results for each individual id
+  # As an alternative to tf.TensorArray, scatter_update could potentially be
+  # used but that would require an additional mutable tensor.
+  ta_stack_out = tf.TensorArray(
+      x.dtype,
+      size=t_batch_size,
+  )
+
+  # Then we iterate over each sequence individually and compute the
+  # transformation for each id
+  while_condition = lambda t_i, *args: tf.less(t_i, t_batch_size)
+  def body(t_i, ta_stack_out):
+    """Loop body."""
+    # Gather the ids
+    current_ids = tf.to_int32(tf.where(tf.equal(indices, t_i)))
+    t_row = tf.gather_nd(x, indices=current_ids)
+
+    # TODO(epot): Should not call map_fn if t_row size is 0
+
+    # Apply transformation to each id
+    # Restore batch_dim=1 as most functions expect [batch_dim, length, ...] as
+    # input
+    t_row = tf.expand_dims(t_row, axis=0)
+    t_row = map_fn(t_row)
+    t_row = tf.squeeze(t_row, axis=0)  # Squeeze for concatenation
+    ta_stack_out = ta_stack_out.write(t_i, t_row)
+
+    return [tf.add(t_i, 1), ta_stack_out]  # ++i
+
+  # Run the loop, equivalent to:
+  # stack_out = []
+  # while i < batch_size:
+  #   stack_out.extend(map_fn(x[indices==i]))
+  _, ta_stack_out = tf.while_loop(while_condition, body, [t_i, ta_stack_out])
+
+  # Merge all results
+  return ta_stack_out.concat()
+
+
 class SparseDispatcher(object):
   """Helper for implementing a mixture of experts.


From 327c8d23999048596c1e9a7a59abc369ffd1ee4e Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 7 Sep 2017 17:08:25 -0700
Subject: [PATCH 25/32] Merge PRs #274 #282

PiperOrigin-RevId: 167933701
---
 README.md                                |  4 ++--
 docs/new_problem.md                      | 38 +++++++++++++++++++-----
 tensor2tensor/visualization/attention.py |  2 +-
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 4e56d7855..bec411f1e 100644
--- a/README.md
+++ b/README.md
@@ -214,8 +214,8 @@ on the task (e.g.
 fed through a final linear transform to produce logits for a
 softmax over classes). All models are imported in
 [`models.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models/models.py),
 inherit from `T2TModel` - defined in
-[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py)
-- and are registered with
+[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py) -
+and are registered with
 [`@registry.register_model`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/registry.py).

 ### Hyperparameter Sets
diff --git a/docs/new_problem.md b/docs/new_problem.md
index d581a3a1b..ab5dd5e26 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -15,9 +15,17 @@ Let's add a new dataset together and train the transformer model. We'll be learn

 For each problem we want to tackle we create a new problem class and register it. Let's call our problem `Word2def`.

-Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`).
-
-For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `Text2TextProblem`. Let's also register it while we're at it so we can specify the problem through flags.
+Since many text2text problems share similar methods, there's already a class
+called `Text2TextProblem` that extends the base problem class, `Problem`
+(both found in
+[`problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)).
+
+For our problem, we can go ahead and create the file `word2def.py` in the
+[`data_generators`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/)
+folder and add our new problem, `Word2def`, which extends
+[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py).
+Let's also register it while we're at it so we can specify the problem through
+flags.

 ```python
 @registry.register_problem
@@ -28,7 +36,9 @@ class Word2def(problem.Text2TextProblem):
   ...
 ```

-We need to implement the following methods from `Text2TextProblem` in our new class:
+We need to implement the following methods from
+[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+in our new class:
 * is_character_level
 * targeted_vocab_size
 * generator
@@ -42,7 +52,12 @@ Let's tackle them one by one:

 **input_space_id, target_space_id, is_character_level, targeted_vocab_size, use_subword_tokenizer**:

-SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like, EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at `data_generators/problem.py` in the class `SpaceID`.
+SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are
+in. These are things like EN_CHR (English character), EN_TOK (English token),
+AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be
+found at
+[`data_generators/problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+in the class `SpaceID`.
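The registration decorator shown above is also what wires a problem into the command-line flags: the class name is converted to a snake_case registry key. A hedged sketch of the lookup side (assuming the `Word2def` class from this doc has been imported somewhere; the misspelled name is only there to show the failure mode):

```python
from tensor2tensor.utils import registry

# Class Word2def registers under the key "word2def" (snake_case of the
# class name, via registry._default_name), so the trainer can select it
# with --problems=word2def.
word2def_problem = registry.problem("word2def")

# Unknown names fail fast; after patch 26 below this raises a LookupError
# whose message lists the registered problems.
registry.problem("wrod2def")  # raises
```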
Since we're generating definitions and feeding in words at the character level, we set `is_character_level` to true, and use the same SpaceID, EN_CHR, for both input and target. Additionally, since we aren't using tokens, we don't need to give a `targeted_vocab_size` or define `use_subword_tokenizer`. @@ -86,7 +101,15 @@ class Word2def(problem.Text2TextProblem): **generator**: -We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator`, and `token_generator` are already written in the file `wmt.py`. We will import `character_generator` and write: +We're almost done. `generator` generates the training and evaluation data and +stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully +several commonly used methods like `character_generator`, and `token_generator` +are already written in the file +[`wmt.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py). +We will import `character_generator` and +[`text_encoder`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/text_encoder.py) +to write: + ```python def generator(self, data_dir, tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() @@ -151,7 +174,8 @@ _WORD2DEF_TEST_DATASETS = [ ## Putting it all together -Now our `word2def.py` file looks like: (with the correct imports) +Now our `word2def.py` file looks like: + ```python """ Problem definition for word to dictionary definition. """ diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py index bc4238081..6109f9cc6 100644 --- a/tensor2tensor/visualization/attention.py +++ b/tensor2tensor/visualization/attention.py @@ -15,7 +15,7 @@ """Module for postprocessing and displaying tranformer attentions. -This module is deigned to be called from an ipython notebook. +This module is designed to be called from an ipython notebook. """ import json From 0c0016a81424088e96df9fc6d712ce9b6ad90226 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 7 Sep 2017 17:17:12 -0700 Subject: [PATCH 26/32] Fix error message when problem is misspecified PiperOrigin-RevId: 167934726 --- .../data_generators/problem_hparams.py | 5 +--- tensor2tensor/utils/registry.py | 30 +++++++++---------- tensor2tensor/utils/registry_test.py | 12 ++++---- tensor2tensor/utils/trainer_utils.py | 18 +++++++++-- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index e002329bc..f4880e4d9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -40,9 +40,6 @@ def problem_hparams(problem_name, model_hparams): Returns: a tf.contrib.training.HParams - - Raises: - ValueError: if problem_name is unknown. 
""" base_name, was_reversed, was_copy = parse_problem_name(problem_name) p = _lookup_problem_hparams_fn(base_name)(model_hparams) @@ -78,7 +75,7 @@ def _lookup_problem_hparams_fn(name): if name not in PROBLEM_HPARAMS_MAP: map_str = "* " + "\n* ".join(sorted(PROBLEM_HPARAMS_MAP.keys())) error_msg = "%s not in the supported set of problems:\n%s" % (name, map_str) - raise ValueError(error_msg) + raise LookupError(error_msg) return PROBLEM_HPARAMS_MAP.get(name) diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index f1db2f36c..2b708b4ce 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -123,7 +123,7 @@ def decorator(model_cls, registration_name=None): """Registers & returns model_cls with registration_name or default name.""" model_name = registration_name or _default_name(model_cls) if model_name in _MODELS: - raise ValueError("Model %s already registered." % model_name) + raise LookupError("Model %s already registered." % model_name) _MODELS[model_name] = model_cls return model_cls @@ -137,7 +137,7 @@ def decorator(model_cls, registration_name=None): def model(name): if name not in _MODELS: - raise ValueError("Model %s never registered." % name) + raise LookupError("Model %s never registered." % name) return _MODELS[name] @@ -152,7 +152,7 @@ def decorator(hp_fn, registration_name=None): """Registers & returns hp_fn with registration_name or default name.""" hp_name = registration_name or _default_name(hp_fn) if hp_name in _HPARAMS: - raise ValueError("HParams set %s already registered." % hp_name) + raise LookupError("HParams set %s already registered." % hp_name) _HPARAMS[hp_name] = hp_fn return hp_fn @@ -166,7 +166,7 @@ def decorator(hp_fn, registration_name=None): def hparams(name): if name not in _HPARAMS: - raise ValueError("HParams set %s never registered." % name) + raise LookupError("HParams set %s never registered." % name) return _HPARAMS[name] @@ -181,7 +181,7 @@ def decorator(rhp_fn, registration_name=None): """Registers & returns hp_fn with registration_name or default name.""" rhp_name = registration_name or _default_name(rhp_fn) if rhp_name in _RANGED_HPARAMS: - raise ValueError("RangedHParams set %s already registered." % rhp_name) + raise LookupError("RangedHParams set %s already registered." % rhp_name) # Check that the fn takes a single argument args, varargs, keywords, _ = inspect.getargspec(rhp_fn) if len(args) != 1 or varargs is not None or keywords is not None: @@ -201,7 +201,7 @@ def decorator(rhp_fn, registration_name=None): def ranged_hparams(name): if name not in _RANGED_HPARAMS: - raise ValueError("RangedHParams set %s never registered." % name) + raise LookupError("RangedHParams set %s never registered." % name) return _RANGED_HPARAMS[name] @@ -216,7 +216,7 @@ def decorator(p_cls, registration_name=None): """Registers & returns p_cls with registration_name or default name.""" p_name = registration_name or _default_name(p_cls) if p_name in _PROBLEMS: - raise ValueError("Problem %s already registered." % p_name) + raise LookupError("Problem %s already registered." % p_name) _PROBLEMS[p_name] = p_cls p_cls.name = p_name @@ -258,7 +258,7 @@ def parse_problem_name(problem_name): base_name, was_reversed, was_copy = parse_problem_name(name) if base_name not in _PROBLEMS: - raise ValueError("Problem %s never registered." % name) + raise LookupError("Problem %s never registered." 
% name) return _PROBLEMS[base_name](was_reversed, was_copy) @@ -270,8 +270,8 @@ def _internal_get_modality(name, mod_collection, collection_str): if name is None: name = "default" if name not in mod_collection: - raise ValueError("%s modality %s never registered." % (collection_str, - name)) + raise LookupError("%s modality %s never registered." % (collection_str, + name)) return mod_collection[name] @@ -312,8 +312,8 @@ def decorator(mod_cls, registration_name=None): """Registers & returns mod_cls with registration_name or default name.""" mod_name = registration_name or _default_name(mod_cls) if mod_name in mod_collection: - raise ValueError("%s modality %s already registered." % (collection_str, - mod_name)) + raise LookupError("%s modality %s already registered." % (collection_str, + mod_name)) mod_collection[mod_name] = mod_cls return mod_cls @@ -391,7 +391,7 @@ def create_modality(modality_spec, model_hparams): Modality instance. Raises: - ValueError: if modality_type is not recognized. See Modalities class for + LookupError: if modality_type is not recognized. See Modalities class for accepted types. """ retrieval_fns = { @@ -406,8 +406,8 @@ def create_modality(modality_spec, model_hparams): modality_full_name, vocab_size = modality_spec modality_type, modality_name = parse_modality_name(modality_full_name) if modality_type not in retrieval_fns: - raise ValueError("Modality type %s not recognized. Options are: %s" % - (modality_type, list(_MODALITIES))) + raise LookupError("Modality type %s not recognized. Options are: %s" % + (modality_type, list(_MODALITIES))) return retrieval_fns[modality_type](modality_name)(model_hparams, vocab_size) diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 62c24b054..d97dc6bdc 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -63,7 +63,7 @@ def model_fn(): self.assertTrue(model is model_fn) def testUnknownModel(self): - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.model("not_registered") def testDuplicateRegistration(self): @@ -72,7 +72,7 @@ def testDuplicateRegistration(self): def m1(): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_model("m1") def m2(): @@ -137,9 +137,9 @@ def my_hparams_range(_): self.assertTrue(registry.ranged_hparams("a") is my_hparams_range) def testUnknownHparams(self): - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.hparams("not_registered") - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.ranged_hparams("not_registered") def testDuplicateRegistration(self): @@ -148,7 +148,7 @@ def testDuplicateRegistration(self): def hp1(): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_hparams("hp1") def hp2(): @@ -158,7 +158,7 @@ def hp2(): def rhp1(_): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_ranged_hparams("rhp1") def rhp2(_): diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 08359ea5c..be5e5530f 100644 --- 
a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -149,7 +149,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, """Create Experiment.""" hparams = create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir, passed_hparams=FLAGS.hparams) - if FLAGS.worker_id == 0: + if FLAGS.worker_id == 0 and FLAGS.schedule in ["local_run", "train"]: save_metadata(output_dir, hparams) estimator, input_fns = create_experiment_components( hparams=hparams, @@ -226,11 +226,23 @@ def add_problem_hparams(hparams, problems): for problem_name in problems.split("-"): try: problem = registry.problem(problem_name) - except ValueError: + except LookupError: problem = None if problem is None: - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + try: + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + except LookupError: + # The problem is not in the set of registered Problems nor in the old + # set of problem_hparams. + all_problem_names = sorted( + list(problem_hparams.PROBLEM_HPARAMS_MAP) + + registry.list_problems()) + error_lines = [ + "%s not in the set of supported problems:" % problem_name + ] + all_problem_names + error_msg = "\n * ".join(error_lines) + raise LookupError(error_msg) else: p_hparams = problem.get_hparams(hparams) From 6d004bdc853e2fc7fe6aa341dfefbb89d6b17963 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Thu, 7 Sep 2017 22:56:03 -0700 Subject: [PATCH 27/32] Edit 2d scope name PiperOrigin-RevId: 167958304 --- tensor2tensor/layers/common_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 3f3885b10..c5a0c60cb 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1066,7 +1066,7 @@ def multihead_attention_2d(query_antecedent, "attention heads (%d)." % (total_value_depth, num_heads)) with tf.variable_scope( name, - default_name="multihead_attention", + default_name="multihead_attention_2d", values=[query_antecedent, memory_antecedent]): q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth) From c99d5b5d350feb33ecb99f1bbbc74a2660e8a46b Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Fri, 8 Sep 2017 09:33:02 -0700 Subject: [PATCH 28/32] log diet variables properly PiperOrigin-RevId: 168006293 --- tensor2tensor/utils/model_builder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 21ef96b28..1540c0f88 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -38,6 +38,7 @@ from tensor2tensor.utils import yellowfin import tensorflow as tf +from tensorflow.python.framework import dtypes from tensorflow.python.ops import init_ops # TODO(rsepassi): Rm dep on FLAGS here @@ -363,7 +364,9 @@ def nth_model(n): total_loss *= small_batch_multiplier total_loss = tf.identity(total_loss, name="total_loss") log_variable_sizes(tf.trainable_variables(), "Trainable Variables") - diet_vars = [v for v in tf.global_variables() if hasattr(v, "optimizer")] + diet_vars = [ + v for v in tf.global_variables() if v.dtype == dtypes.float16_ref + ] log_variable_sizes(diet_vars, "Diet Varaibles") # Define the train_op for the TRAIN mode. 
    opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp)

From 8f5fcc2d0ef416bbf06b2e2e777100da071292a1 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Fri, 8 Sep 2017 13:27:18 -0700
Subject: [PATCH 29/32] add wiki-scramble dataset.

PiperOrigin-RevId: 168037859
---
 tensor2tensor/data_generators/wiki.py    | 117 +++++++++++++++++++++++
 tensor2tensor/models/attention_lm_moe.py |  35 +++++--
 2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py
index 9610cb1d8..6f6c97686 100644
--- a/tensor2tensor/data_generators/wiki.py
+++ b/tensor2tensor/data_generators/wiki.py
@@ -25,6 +25,8 @@

 import bz2file

+import numpy as np
+
 import six
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
@@ -130,3 +132,118 @@ def generator(self, data_dir, tmp_dir, _):
       encoded = encoder.encode(page) + [EOS]
       encoded_title = encoder.encode(title) + [EOS]
       yield {"inputs": encoded_title, "targets": encoded}
+
+
+class LanguagemodelWikiScramble(problem.Text2TextProblem):
+  """Language modeling on English Wikipedia.
+
+  "targets" is a sequence of sequence_length tokens - a fragment of an article.
+  "inputs" is a copy of "targets", but with a random scramble_fraction of the
+  tokens randomly permuted.
+
+  This dataset is intended to test parallel (non-autoregressive) prediction
+  of the target sequence given the input sequence.
+  """
+
+  @property
+  def sequence_length(self):
+    raise NotImplementedError()
+
+  @property
+  def scramble_fraction(self):
+    raise NotImplementedError()
+
+  @property
+  def is_character_level(self):
+    return False
+
+  @property
+  def has_inputs(self):
+    return True
+
+  @property
+  def input_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def target_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def num_shards(self):
+    return 1000
+
+  @property
+  def vocab_name(self):
+    return "vocab.wiki"
+
+  @property
+  def use_subword_tokenizer(self):
+    return True
+
+  @property
+  def targeted_vocab_size(self):
+    return 2**13  # 8192
+
+  @property
+  def use_train_shards_for_dev(self):
+    return True
+
+  @property
+  def max_cases(self):
+    return (2 ** 30) / self.sequence_length
+
+  def scramble(self, seq):
+    seq = np.array(seq)
+    num_permute = int(self.sequence_length * self.scramble_fraction)
+    full_permutation = np.random.permutation(self.sequence_length)
+    inverse_full_permutation = np.argsort(full_permutation)
+    partial_permutation = np.random.permutation(num_permute)
+    seq = seq[full_permutation]
+    seq = np.concatenate(
+        (seq[:num_permute][partial_permutation], seq[num_permute:]))
+    seq = seq[inverse_full_permutation]
+    seq = list(seq)
+    return seq
+
+  def generator(self, data_dir, tmp_dir, _):
+    encoder = generator_utils.get_or_generate_vocab_inner(
+        data_dir, self.vocab_file, self.targeted_vocab_size,
+        lambda: page_generator(tmp_dir, max_docs=1000))
+    case_num = 0
+    for page in page_generator(tmp_dir):
+      encoded = encoder.encode(page)
+      for i in xrange(len(encoded) // self.sequence_length):
+        case_num += 1
+        if self.max_cases and case_num > self.max_cases:
+          return
+        targets = encoded[
+            i * self.sequence_length:(i + 1) * self.sequence_length]
+        inputs = self.scramble(targets)
+        yield {"inputs": inputs, "targets": targets}
+
+
+@registry.register_problem
+class LanguagemodelWikiScramble1k50(LanguagemodelWikiScramble):
+  """Sequence length 1024, 50% scrambled."""

+  @property
+  def sequence_length(self):
+    return 1024
+
+  @property
+  def scramble_fraction(self):
+    return 0.5
+
+
+@registry.register_problem
+class LanguagemodelWikiScramble8k50(LanguagemodelWikiScramble):
+  """Sequence length 8192, 50% scrambled."""
+
+  @property
+  def sequence_length(self):
+    return 8192
+
+  @property
+  def scramble_fraction(self):
+    return 0.5
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 87d456b7d..cd54ce64e 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -68,8 +68,14 @@ def model_fn_body_sharded(self, sharded_features):
     # Remove dropout if not training
     hparams = self._hparams
     dp = self._data_parallelism
-    targets = sharded_features["targets"]
-    targets = dp(tf.squeeze, targets, 2)
+    if hparams.use_inputs:
+      decoder_input = dp(tf.squeeze, sharded_features["inputs"], 2)
+      decoder_self_attention_bias = None
+    else:
+      targets = sharded_features["targets"]
+      targets = dp(tf.squeeze, targets, 2)
+      (decoder_input, decoder_self_attention_bias, pad_remover) = dp(
+          attention_lm_moe_prepare_decoder, targets, hparams)

     def preprocess(x):
       return dp(common_layers.layer_preprocess, x, hparams)
@@ -77,9 +83,6 @@ def preprocess(x):
     def postprocess(x, y):
       return dp(common_layers.layer_postprocess, x, y, hparams)

-    (decoder_input, decoder_self_attention_bias, pad_remover) = dp(
-        attention_lm_moe_prepare_decoder, targets, hparams)
-
     x = dp(tf.nn.dropout, decoder_input,
            1.0 - hparams.layer_prepostprocess_dropout)
     extra_loss = 0.0
@@ -95,7 +98,8 @@ def _diet_expert(x):
     expert_fn = expert_utils.ffn_expert_fn(
         hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size)

-    if hparams.attention_type == AttentionType.LOCAL_EXPERTS:
+    if (hparams.attention_type == AttentionType.LOCAL_EXPERTS
+        and not hparams.use_inputs):
       # As preprocess and postprocess are called with batch of size one (all
       # batches concatenated), we just make sure that batch_norm is not use (
       # should not either way)
@@ -162,7 +166,7 @@ def print_shape(x, suffix, debug=False):
             attention_num_experts=hparams.attention_num_experts,
             train=hparams.mode == ModeKeys.TRAIN,
             batch_coordinate=batch_coordinate,
-            mask_right=True,
+            mask_right=not hparams.use_inputs,
             split_batch=bool(hparams.attention_split_batch),
             attention_kq_size=hparams.attention_kq_size,
             attention_v_size=hparams.attention_v_size)
@@ -356,6 +360,9 @@ def attention_lm_moe_base():
   hparams.add_hparam("use_sepconv", int(False))
   hparams.add_hparam("diet_experts", int(False))
   hparams.add_hparam("memory_efficient_ffn", int(False))
+  # if True, we learn a non-autoregressive model from "inputs" to "targets".
+ # if False, we learn an autoregressive model to generate "targets" + hparams.add_hparam("use_inputs", int(False)) return hparams @@ -526,3 +533,17 @@ def attention_lm_moe_translation(): hparams.moe_layers = "0,1,2,3,4,5" hparams.shared_embedding_and_softmax_weights = int(True) return hparams + + +@registry.register_hparams +def attention_lm_moe_unscramble_base(): + """Version to use with languagemodel_wiki_scramble1k50.""" + hparams = attention_lm_no_moe_small() + hparams.use_inputs = True + hparams.min_length_bucket = 1024 + hparams.max_length = 1024 + hparams.batch_size = 5000 + hparams.layer_prepostprocess_dropout = 0.0 + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + return hparams From 1991f7b8addb657abe41bb633e1d909edade56ce Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Fri, 8 Sep 2017 13:51:17 -0700 Subject: [PATCH 30/32] Add option for local attention in attention_lm_moe. PiperOrigin-RevId: 168041046 --- tensor2tensor/models/attention_lm_moe.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index cd54ce64e..adbb871b5 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -148,6 +148,8 @@ def print_shape(x, suffix, debug=False): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, + attention_type=("local_mask_right" if hparams.attention_local + else "dot_product"), name="decoder_self_attention") elif hparams.attention_type == AttentionType.MEMORY_EFFICIENT: assert hparams.layer_preprocess_sequence == "n" @@ -349,6 +351,7 @@ def attention_lm_moe_base(): hparams.add_hparam("moe_layers", "2") # comma separated list of layer numbers # moe params. local attention moe. hparams.add_hparam("attention_type", AttentionType.MULTIHEAD) + hparams.add_hparam("attention_local", int(False)) hparams.add_hparam("attention_moe_k", 2) hparams.add_hparam("attention_num_experts", 16) hparams.add_hparam("attention_split_batch", int(False)) @@ -383,6 +386,18 @@ def attention_lm_moe_base_ae(): return hparams +@registry.register_hparams +def attention_lm_moe_base_local(): + """Base model with attention expert.""" + hparams = attention_lm_moe_base() + hparams.attention_local = int(True) + hparams.use_sepconv = int(True) + hparams.max_length = 0 # max_length == batch_size + hparams.eval_drop_long_sequences = int(True) + hparams.min_length_bucket = 256 # Avoid cyclic problems for big batches + return hparams + + @registry.register_hparams def attention_lm_moe_small(): """Cheap model for single-gpu training. 
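Worth unpacking from patch 29 above: `scramble` permutes the whole sequence, shuffles only the first `num_permute` slots, then applies the inverse permutation, so only a randomly chosen `scramble_fraction` of positions can end up displaced. A standalone numpy rendering of the same steps (illustrative; the patch's version reads `sequence_length` from the problem):

```python
import numpy as np

def scramble(seq, scramble_fraction):
  """Randomly permute roughly a scramble_fraction of the positions in seq."""
  seq = np.array(seq)
  num_permute = int(len(seq) * scramble_fraction)
  full_permutation = np.random.permutation(len(seq))
  inverse_full_permutation = np.argsort(full_permutation)
  # Shuffle the first num_permute entries of the permuted sequence...
  partial_permutation = np.random.permutation(num_permute)
  seq = seq[full_permutation]
  seq = np.concatenate(
      (seq[:num_permute][partial_permutation], seq[num_permute:]))
  # ...then undo the outer permutation: only those entries have moved.
  return list(seq[inverse_full_permutation])

print(scramble(list(range(8)), 0.5))  # e.g. [6, 1, 2, 0, 4, 5, 3, 7]
```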
From 1d769553d3e9e4942229a705a526080626c6d16d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 8 Sep 2017 14:45:11 -0700 Subject: [PATCH 31/32] v1.2.2 PiperOrigin-RevId: 168048958 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b51070c77..119eeea7e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.2.1', + version='1.2.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From b8e59e746919a80f0ccd30dbf87426928c856218 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 8 Sep 2017 14:46:56 -0700 Subject: [PATCH 32/32] open source fixes PiperOrigin-RevId: 168049257 --- tensor2tensor/utils/trainer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index be5e5530f..5ab3db70c 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -265,7 +265,7 @@ def save_metadata(output_dir, hparams): else: flags_dict = FLAGS.__dict__["__flags"] flags_str = "\n".join( - ["--%s=%s" % (name, str(f.value)) for (name, f) in flags_dict.items()]) + ["--%s=%s" % (name, str(f)) for (name, f) in flags_dict.items()]) t2t_flags_str = None flags_txt = os.path.join(output_dir, "flags.txt")
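One pattern worth lifting out of patch 26: registry lookups now raise `LookupError` rather than `ValueError`, which is what lets `add_problem_hparams` try the registered `Problem`s first and only then fall back to the legacy `problem_hparams` map. A minimal sketch of that contract (the wrapper function below is illustrative, not part of the library):

```python
from tensor2tensor.utils import registry

def lookup_problem(problem_name):
  """Resolve a problem by name, mirroring add_problem_hparams' fallback."""
  try:
    return registry.problem(problem_name)
  except LookupError:
    # Not a registered Problem; report the supported names, as the
    # patched trainer_utils does before giving up.
    candidates = sorted(registry.list_problems())
    raise LookupError(
        "%s not in the set of supported problems:\n * %s"
        % (problem_name, "\n * ".join(candidates)))
```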