From 594da6340fc814743a2b7b8dd545a9965e036241 Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Wed, 30 Aug 2017 07:21:14 -0700
Subject: [PATCH 01/32] internal.

PiperOrigin-RevId: 166990178
---
 README.md                                |  3 +-
 docs/new_problem.md                      | 58 ++++++++++++------------
 tensor2tensor/utils/data_reader.py       |  1 -
 tensor2tensor/visualization/attention.py |  2 +-
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 58a58aa17..4e56d7855 100644
--- a/README.md
+++ b/README.md
@@ -214,7 +214,8 @@ on the task (e.g. fed through a final linear transform to produce logits for a
 softmax over classes). All models are imported in
 [`models.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models/models.py),
 inherit from `T2TModel` - defined in
-[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py) - and are registered with
+[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py)
+- and are registered with
 [`@registry.register_model`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/registry.py).
 
 ### Hyperparameter Sets
diff --git a/docs/new_problem.md b/docs/new_problem.md
index e69a7dfdb..c859c6eba 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -15,20 +15,18 @@ Let's add a new dataset together and train the transformer model. We'll be learn
 
 For each problem we want to tackle we create a new problem class and register it. Let's call our problem `Word2def`.
 
-Since many text2text problems share similar methods, there's already a class called [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py#L354) that extends the base problem class, `Problem` (both found in [`problem.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py)).
+Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`).
 
-For our problem, we can go ahead and create the file `word2def.py` in the [`data_generators`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/) folder and add our new problem, `Word2def`, which extends [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/24071ba07d5a14c170044c5e60a24bda8179fb7a/tensor2tensor/data_generators/problem.py#L354). Let's also register it while we're at it so we can specify the problem through flags.
+For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `TranslateProblem`. Let's also register it while we're at it so we can specify the problem through flags.
 
 ```python
-@registry.register_problem
+@registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
-
-  @property
-  def is_character_level(self):
-    ...
+  raise NotImplementedError()
 ```
 
-We need to implement the following methods from [`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py#L354) in our new class:
+We need to implement the following methods from `Text2TextProblem` in our new class:
 * is_character_level
 * targeted_vocab_size
 * generator
@@ -42,7 +40,7 @@ Let's tackle them one by one:
 
 **input_space_id, target_space_id, is_character_level, targeted_vocab_size, use_subword_tokenizer**:
 
-SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like, EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at [`data_generators/problem.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/problem.py) in the class `SpaceID`.
+SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at `data_generators/problem.py` in the class `SpaceID`.
 
 Since we're generating definitions and feeding in words at the character level, we set `is_character_level` to true, and use the same SpaceID, EN_CHR, for both input and target. Additionally, since we aren't using tokens, we don't need to give a `targeted_vocab_size` or define `use_subword_tokenizer`.
 
 **num_shards**:
@@ -58,8 +56,6 @@ The number of shards to break data files into.
 @registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
-
-  @property
   def is_character_level(self):
     return True
 
@@ -86,11 +82,12 @@ class Word2def(problem.Text2TextProblem):
 
 **generator**:
 
-We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator`, and `token_generator` are already written in the file [`wmt.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/wmt.py). We will import `character_generator` and [`text_encoder`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py) to write:
+We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator` and `token_generator` are already written in the file `wmt.py`. We will import `character_generator` and write:
 ```python
 def generator(self, data_dir, tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
   datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+  tag = "train" if train else "dev"
   return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 ```
@@ -111,6 +108,7 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -139,31 +137,42 @@ I've gone ahead and split all words into a train and test set and saved them in
 ```python
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
-  LOCATION_OF_DATA + 'words_train.txt',
-  LOCATION_OF_DATA + 'definitions_train.txt'
+  [
+    "LOCATION_OF_DATA/", ("words_train.txt", "definitions_train.txt")
+  ]
 ]
-
 _WORD2DEF_TEST_DATASETS = [
-  LOCATION_OF_DATA + 'words_test.txt',
-  LOCATION_OF_DATA + 'definitions_test.txt'
+  [
+    "LOCATION_OF_DATA", ("words_test.txt", "definitions_test.txt")
+  ]
 ]
 ```
 
 ## Putting it all together
 
-Now our `word2def.py` file looks like:
+Now our `word2def.py` file looks like: (with the correct imports)
 ```python
 """ Problem definition for word to dictionary definition. """
 
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import os
+import tarfile  # do we need this import
 
+from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.wmt import character_generator
 from tensor2tensor.utils import registry
 
+import tensorflow as tf
+
+FLAGS = tf.flags.FLAGS
+
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
   LOCATION_OF_DATA+'words_train.txt',
   LOCATION_OF_DATA+'definitions_train.txt'
@@ -189,6 +198,7 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
+    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -210,17 +220,7 @@ class Word2def(problem.Text2TextProblem):
 ```
 
 # Hyperparameters
-All hyperparamters inherit from `_default_hparams()` in `problem.py.` If you would like to customize your hyperparameters, register a new hyperparameter set in `word2def.py` like the example provided in the walkthrough. For example:
-
-```python
-from tensor2tensor.models import transformer
-
-@registry.register_hparams
-def word2def_hparams(self):
-  hparams = transformer.transformer_base_single_gpu()  # Or whatever you'd like to build off.
-  hparams.batch_size = 1024
-  return hparams
-```
+All hyperparameters inherit from `_default_hparams()` in `problem.py`. If you would like to customize your hyperparameters, add another method to the file `problem_hparams.py`.
 
 # Run the problem
 Now that we've gotten our problem set up, let's train a model and generate definitions.
 
 We specify our problem name, the model, and hparams.
 ```bash
 PROBLEM=word2def
 MODEL=transformer
-HPARAMS=word2def_hparams
+HPARAMS=transformer_base_single_gpu
 ```
 
 The rest of the steps are as given in the [walkthrough](walkthrough.md).
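
For reference, a registered hyperparameter set like the one this patch removes from the tutorial would look roughly like the sketch below. It builds off `transformer_base_single_gpu`; the `word2def_hparams` name and the `batch_size` override are illustrative, and note that a function registered with `@registry.register_hparams` takes no arguments:

```python
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry


@registry.register_hparams
def word2def_hparams():
  # Start from the single-GPU transformer baseline and override as needed.
  hparams = transformer.transformer_base_single_gpu()
  hparams.batch_size = 1024
  return hparams
```

With such a set registered, the run configuration above could use `HPARAMS=word2def_hparams` instead of `transformer_base_single_gpu`.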
diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py
index cde91cc7b..d55911f19 100644
--- a/tensor2tensor/utils/data_reader.py
+++ b/tensor2tensor/utils/data_reader.py
@@ -27,7 +27,6 @@
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import problem_hparams
 from tensor2tensor.data_generators.problem import preprocess_examples_common
diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py
index 6109f9cc6..bc4238081 100644
--- a/tensor2tensor/visualization/attention.py
+++ b/tensor2tensor/visualization/attention.py
@@ -15,7 +15,7 @@
 
 """Module for postprocessing and displaying tranformer attentions.
 
-This module is deigned to be called from an ipython notebook.
+This module is designed to be called from an ipython notebook.
 """
 
 import json

From 98f55734aa8f49aa00aec5cb27a90887e96b5682 Mon Sep 17 00:00:00 2001
From: Etienne Pot
Date: Wed, 30 Aug 2017 19:48:53 -0700
Subject: [PATCH 02/32] Add some logging/debug messages. Remove padding for
 all layers when local experts (both attention and fc)

PiperOrigin-RevId: 167086679
---
 tensor2tensor/layers/common_attention.py | 34 +++++++--
 tensor2tensor/models/attention_lm_moe.py | 94 ++++++++++++++++++++++--
 tensor2tensor/utils/expert_utils.py      | 15 ----
 3 files changed, 117 insertions(+), 26 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 253e9bee5..975ed94ae 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -33,6 +33,9 @@
 from tensorflow.python.framework import function
 
 
+_expert_count = 0
+
+
 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
   """Adds a bunch of sinusoids of different frequencies to a Tensor.
 
@@ -1007,9 +1010,22 @@ def self_attention_expert(
       expert_fn=functools.partial(self_attention_expert, mask_right=)
       )
   """
+  depth = x.get_shape().as_list()[-1]
   length = tf.shape(batch_coordinate)[0]
 
+  # Print a warning message if one of the experts isn't used (useful at
+  # inference where summaries aren't used and the gating function doesn't add
+  # noise)
+  global _expert_count  # Hack to make each expert have a unique id
+  _expert_count += 1
+  length = tf.cond(
+      tf.equal(length, 0),
+      lambda: tf.Print(  # pylint: disable=g-long-lambda
+          length, [length], "Expert {} empty: ".format(_expert_count)),
+      lambda: length,
+  )
+
   tf.summary.scalar("batch_size", length, family="experts_stats_batch_size")
 
   attention_kq_size = attention_kq_size or depth
@@ -1063,7 +1079,7 @@ def local_expert_attention(
     loss_coef,
     attention_num_experts,
     train=True,
-    pad_remover=None,
+    batch_coordinate=None,
    **kwargs
 ):
   """Attention using a mixture of experts.
 
   The mixture of experts is "local" in that it is replicated on each
   datashard.
 
+  local_moe flattens all batches, so to avoid problems with padding (ex: all
+  padding going to the same expert, self attention attending to non null
+  padding tokens,...), the padding should be removed before.
+
   Args:
-    x: a Tensor with shape [batch, length, depth]
+    x: a Tensor with shape [batch, length, depth] or [1, batch*length, depth]
     k: The number of experts to dispatch each example to
     loss_coef: a scalar. A multiplier for the expert loss
     attention_num_experts: The number of experts to use
     train: a boolean for the current mode
-    pad_remover (PadRemover): A util object containing the padding position
+    batch_coordinate (tf.Tensor): int32 tensor of shape [1, batch*length, 1]
+      containing the batch ids. If None, deduced from first dim of x.
     **kwargs: Arguments to forward to self_attention_expert
 
   Returns:
     y: a Tensor with shape [batch, length, depth]
     loss: a Scalar
   """
+  if batch_coordinate is None:
+    batch_coordinate = tf.expand_dims(
+        coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
   with tf.variable_scope("local_expert_attention"):
     additional_dispatch_params = {
-        "batch_coordinate": tf.expand_dims(
-            coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
+        "batch_coordinate": batch_coordinate
     }
     return expert_utils.local_moe(
         x,
@@ -1100,7 +1123,6 @@
         pass_x=True,
         pass_gates=False,
         additional_dispatch_params=additional_dispatch_params,
-        pad_remover=pad_remover
     )
 
 
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 3b72ea9c2..191d4aa04 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -25,6 +25,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 # Dependency imports
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -40,6 +42,9 @@
 import tensorflow as tf
 
 
+ModeKeys = tf.contrib.learn.ModeKeys  # pylint: disable=invalid-name
+
+
 class AttentionType(object):
   MULTIHEAD = "multihead"
   LOCAL_EXPERTS = "local_experts"
@@ -90,6 +95,37 @@ def _diet_expert(x):
       expert_fn = expert_utils.ffn_expert_fn(
           hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size)
 
+    if hparams.attention_type == AttentionType.LOCAL_EXPERTS:
+      # As preprocess and postprocess are called with batch of size one (all
+      # batches concatenated), we just make sure that batch_norm is not used
+      # (it should not be either way)
+      assert hparams.norm_type != "batch"
+
+      dp_remove_pad = functools.partial(
+          dp, remove_pad, pad_remover=pad_remover, mode=hparams.mode)
+      dp_restore_pad = functools.partial(
+          dp, restore_pad, ref_x=x, pad_remover=pad_remover, mode=hparams.mode)
+    elif (hparams.attention_type == AttentionType.MULTIHEAD or
+          hparams.attention_type == AttentionType.MEMORY_EFFICIENT):
+      # Using identity function: No effect
+      dp_remove_pad = lambda x: (x, None)
+      dp_restore_pad = lambda x: x
+    else:
+      raise ValueError("Only {} supported for now.".format(
+          AttentionType.get_choices()))
+
+    def print_shape(x, suffix):
+      # To help debugging, print the input/output shapes at inference and eval
+      # Inference for long sequences can take a long time, so this helps to
+      # see the progression of the generation
+      if hparams.mode == ModeKeys.TRAIN:
+        return x
+      return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix))
+
+    x = dp(print_shape, x, "in")
+    x, batch_coordinate = dp_remove_pad(x)
+    x = dp(print_shape, x, "in_flat")
+
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope(
@@ -118,11 +154,11 @@ def _diet_expert(x):
           y, loss = dp(
               common_attention.local_expert_attention,
               preprocess(x),
-              k=2,
+              k=hparams.attention_moe_k,
               loss_coef=hparams.attention_load_balance,
               attention_num_experts=hparams.attention_num_experts,
-              train=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
-              pad_remover=pad_remover,
+              train=hparams.mode == ModeKeys.TRAIN,
+              batch_coordinate=batch_coordinate,
              mask_right=True,
              attention_kq_size=hparams.attention_kq_size,
              attention_v_size=hparams.attention_v_size)
@@ -138,7 +174,7 @@ def _diet_expert(x):
              dp,
              self._ps_devices,
              preprocess(x),
-             hparams.mode == tf.contrib.learn.ModeKeys.TRAIN,
+             hparams.mode == ModeKeys.TRAIN,
              input_size=hparams.hidden_size,
              expert_fn=expert_fn,
              num_experts=hparams.moe_num_experts,
@@ -160,6 +196,9 @@ def _diet_expert(x):
               dropout=hparams.relu_dropout)
           x = postprocess(x, y)
     x = preprocess(x)
+
+    x = dp_restore_pad(x)
+
     decoder_output = dp(tf.expand_dims, x, 2)
     return decoder_output, extra_loss
 
@@ -187,12 +226,56 @@ def attention_lm_moe_prepare_decoder(targets, hparams):
   else:
     decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
+  # TODO(epot): The padding remover should take into account that the input is
+  # shifted.
   decoder_input = common_layers.shift_left_3d(targets)
   if hparams.pos == "timing":
     decoder_input = common_attention.add_timing_signal_1d(decoder_input)
   return (decoder_input, decoder_self_attention_bias, pad_remover)
 
 
+def remove_pad(x, pad_remover, mode):
+  """Remove padding by concatenating all dimensions into one.
+
+  Args:
+    x (tf.Tensor): input of shape [batch_size, length, depth]
+    pad_remover (obj): a PadRemover object
+    mode (ModeKeys): infer, train or eval. If inference, the padding remover is
+      not applied
+
+  Returns:
+    tf.Tensor of shape [1,length_nonpad,depth] where
+      length_nonpad <= batch_size*length
+  """
+  # Compute the batch coordinate before flattening all batches
+  batch_coordinate = tf.expand_dims(
+      common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1)
+  batch_coordinate = expert_utils.flatten_all_but_last(batch_coordinate)
+
+  # Concatenate all tokens (without padding)
+  x = expert_utils.flatten_all_but_last(x)
+
+  # Remove padding for training and eval
+  if mode != ModeKeys.INFER:
+    # This is a hack to allow inference when the <go> token
+    # is detected as padding and removed. This works for now because there is
+    # no padding at inference.
+    batch_coordinate = pad_remover.remove(batch_coordinate)
+    x = pad_remover.remove(x)
+
+  batch_coordinate = tf.expand_dims(batch_coordinate, axis=0)
+  x = tf.expand_dims(x, axis=0)  # Now batch_size=1
+  return x, batch_coordinate
+
+
+def restore_pad(x, ref_x, pad_remover, mode):
+  x = tf.squeeze(x, axis=0)
+  if mode != ModeKeys.INFER:
+    x = pad_remover.restore(x)
+  x = expert_utils.reshape_like(x, ref_x)
+  return x
+
+
 @registry.register_hparams
 def attention_lm_moe_base():
   """Set of hyperparameters.
@@ -238,6 +321,7 @@ def attention_lm_moe_base():
   hparams.add_hparam("moe_layers", "2")  # comma separated list of layer numbers
 
   # moe params. local attention moe.
   hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
+  hparams.add_hparam("attention_moe_k", 2)
   hparams.add_hparam("attention_num_experts", 16)
   # Key, query and value dimensions for the attention
   hparams.add_hparam("attention_kq_size", 128)
@@ -256,7 +340,7 @@ def attention_lm_moe_base_ae():
   hparams.attention_type = AttentionType.LOCAL_EXPERTS
   hparams.max_length = hparams.batch_size
   hparams.eval_drop_long_sequences = int(True)
-  hparams.batching_mantissa_bits = 2  # More buckets
+  hparams.min_length_bucket = 256  # Avoid cyclic problems for big batches
   hparams.learning_rate = 0.05
   hparams.learning_rate_warmup_steps = 10000
   return hparams
diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py
index fb1d1fac0..16820ff37 100644
--- a/tensor2tensor/utils/expert_utils.py
+++ b/tensor2tensor/utils/expert_utils.py
@@ -847,7 +847,6 @@ def local_moe(x,
               pass_x=True,
               pass_gates=False,
               additional_dispatch_params=None,
-              pad_remover=None,
               name=None):
   """Call a local mixture of experts.
 
@@ -864,8 +863,6 @@ def local_moe(x,
     additional_dispatch_params: The extra tensors that need to be sent to each
       expert. Examples include batch batch coordinates (see
      common_attention.local_expert_attention)
-    pad_remover (PadRemover): If given, the padding is removed/restored before
-      sending to the experts
     name: a string
 
   Returns:
@@ -879,14 +876,6 @@ def local_moe(x,
   with tf.variable_scope(name, default_name="local_moe"):
     x_flat = flatten_all_but_last(x)
 
-    # Remove the padding tokens
-    if pad_remover:
-      x_flat = pad_remover.remove(x_flat)
-      tf.summary.scalar(  # Should match the targets_nonpadding_tokens
-          "nonpadding_tokens",
-          tf.shape(x_flat)[0],
-          family="experts_stats")
-
     # The gates indicate which batch elements go to which tensors.
     # load is a measure of approximately how many examples go to each expert
     gates, load = noisy_top_k_gating(
@@ -908,16 +897,12 @@ def local_moe(x,
     expert_kwargs["gates"] = dispatcher.expert_to_gates()
     for k, v in six.iteritems(additional_dispatch_params or {}):
       v = flatten_all_but_last(v)
-      if pad_remover:
-        v = pad_remover.remove(v)
       expert_kwargs[k] = dispatcher.dispatch(v)
 
     ep = Parallelism([DEFAULT_DEV_STRING] * num_experts)
     expert_outputs = ep(expert_fn, **expert_kwargs)
 
     y_flat = dispatcher.combine(expert_outputs)
-    if pad_remover:
-      y_flat = pad_remover.restore(y_flat)
     y = reshape_like(y_flat, x)
 
     importance = tf.reduce_sum(gates, 0)

From 473089b113e7644aa9b3a7f8794f237d3f41e24f Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Wed, 30 Aug 2017 20:20:07 -0700
Subject: [PATCH 03/32] Correct CNN+DailyMail generator, make TransformerAE
 work with 2d input.

PiperOrigin-RevId: 167088556
---
 .../data_generators/cnn_dailymail.py    |  4 +-
 tensor2tensor/layers/modalities.py      | 45 ++++---------
 tensor2tensor/models/cycle_gan.py       | 66 +------------------
 tensor2tensor/models/transformer_vae.py | 46 +++++++++----
 4 files changed, 48 insertions(+), 113 deletions(-)

diff --git a/tensor2tensor/data_generators/cnn_dailymail.py b/tensor2tensor/data_generators/cnn_dailymail.py
index db4deae4e..93e846a0b 100644
--- a/tensor2tensor/data_generators/cnn_dailymail.py
+++ b/tensor2tensor/data_generators/cnn_dailymail.py
@@ -53,8 +53,8 @@ def _maybe_download_corpora(tmp_dir):
     filepath of the downloaded corpus file.
""" cnn_filename = "cnn_stories.tgz" - dailymail_filename = "dailymail_stories.tgz" cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/") + dailymail_filename = "dailymail_stories.tgz" dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/") if not tf.gfile.Exists(cnn_finalpath): cnn_file = generator_utils.maybe_download_from_drive( @@ -63,7 +63,7 @@ def _maybe_download_corpora(tmp_dir): cnn_tar.extractall(tmp_dir) if not tf.gfile.Exists(dailymail_finalpath): dailymail_file = generator_utils.maybe_download_from_drive( - tmp_dir, dailymail_filename, _CNN_STORIES_DRIVE_URL) + tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL) with tarfile.open(dailymail_file, "r:gz") as dailymail_tar: dailymail_tar.extractall(tmp_dir) return [cnn_finalpath, dailymail_finalpath] diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index e03e6835e..c93a05433 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -141,17 +141,11 @@ def top_dimensionality(self): def bottom(self, inputs): with tf.variable_scope(self.name): inputs = common_layers.standardize_images(inputs) - # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet. - # tf.summary.image("inputs", inputs, max_outputs=2) - if self._model_hparams.compress_steps > 0: - strides = (2, 2) - else: - strides = (1, 1) + tf.summary.image("inputs", inputs, max_outputs=2) return common_layers.conv_block( inputs, self._body_input_depth, [((1, 1), (3, 3))], first_relu=False, - strides=strides, padding="SAME", force2d=True, name="small_image_conv") @@ -159,43 +153,26 @@ def bottom(self, inputs): def targets_bottom(self, inputs): with tf.variable_scope(self.name): # Reshape inputs to 2-d tensor and embed the RGB pixel values. 
+      shape = tf.shape(inputs)
       inputs = common_layers.flatten4d3d(inputs)
       ret = common_layers.embedding(
-          inputs,
+          tf.to_int32(inputs),
          self.top_dimensionality,
          self._body_input_depth,
          name="input_rgb_embedding")
      if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
        ret *= self._body_input_depth**0.5
-      return ret
+      ret = tf.reshape(ret, [shape[0], shape[1], shape[2],
+                             self._body_input_depth * 3])
+      return tf.layers.dense(ret, self._body_input_depth)
 
   def top(self, body_output, _):
     with tf.variable_scope("rgb_softmax"):
-      # separate embedding for each channel
-      # assuming the body output returns a tensor of shape
-      # [batch_size, rows, cols, channels, self._body_input_depth]
-      body_output_split = tf.split(body_output, self._channels, axis=3)
-      output_rgb_embedding_var = tf.get_variable(
-          "output_rgb_embedding",
-          [self._channels, self.top_dimensionality, self._body_input_depth],
-          initializer=tf.random_normal_initializer(0.0, self._body_input_depth
-                                                   **-0.5))
-      # compute logits separately for each channel
-      rgb_channel_logits = []
-      for i in self._channels:
-        shape = tf.shape(body_output_split[i])[:-1]
-        body_output = tf.reshape(body_output_split[i],
-                                 [-1, self._body_input_depth])
-        channel_logits = tf.matmul(
-            body_output, output_rgb_embedding_var[i], transpose_b=True)
-        rgb_channel_logits.append(
-            tf.reshape(channel_logits,
-                       tf.concat([shape, [self.top_dimensionality]], 0)))
-
-      logits = tf.concat(rgb_channel_logits, axis=3)
-      # Reshape logits to conform to CIFAR image shapes (32 by 32 by 3)
-
-      return logits
+      shape = tf.shape(body_output)
+      dim = body_output.get_shape().as_list()[-1] // 3
+      out = tf.reshape(body_output, [shape[0], shape[1], shape[2],
+                                     self._channels, dim])
+      return tf.layers.dense(out, self.top_dimensionality)
 
   def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
     # Call the default implementation, but weight 1.0 on 0s by default.
diff --git a/tensor2tensor/models/cycle_gan.py b/tensor2tensor/models/cycle_gan.py
index c17becbbe..4cf1a5871 100644
--- a/tensor2tensor/models/cycle_gan.py
+++ b/tensor2tensor/models/cycle_gan.py
@@ -124,74 +124,10 @@ def model_fn_body(self, features):
         self._hparams)
 
 
-def cycle_vae_gan_internal(inputs, targets, _, hparams):
-  """Cycle GAN, main step used for training."""
-  with tf.variable_scope("cycle_vae_gan"):
-    # Embed inputs and targets.
-    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
-    k = 2**hparams.num_compress_steps
-    inputs_orig, targets_orig = common_layers.pad_to_same_length(
-        inputs_orig, targets_orig, final_length_divisible_by=k)
-    inputs = common_layers.embedding(
-        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
-    targets = common_layers.embedding(
-        targets_orig, hparams.vocab_size, hparams.hidden_size,
-        "embed", reuse=True)
-
-    # Split the batch into input-input and target-target parts.
-    inputs1, _ = split_on_batch(inputs)
-    _, targets2 = split_on_batch(targets)
-
-    # Input-input part.
-    inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress(
-        inputs1, None, hparams, "inp2hyp", "hyp2inp")
-    inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3)
-
-    # Target-target part.
-    tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress(
-        targets2, None, hparams, "tgt2hyp", "hyp2tgt")
-    tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3)
-
-    # Reconstruction losses.
-    inp1_orig, _ = split_on_batch(inputs_orig)
-    _, tgt2_orig = split_on_batch(targets_orig)
-    inp1_loss = reconstruct_loss(
-        inp1_back, tf.squeeze(inp1_orig, axis=3), hparams)
-    tgt2_loss = reconstruct_loss(
-        tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True)
-
-    # Discriminator loss.
-    dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss")
-
-    # Reconstruct targets from inputs.
-    tgt, _, _, _ = transformer_vae.vae_compress(
-        inputs, None, hparams, "inp2hyp", "hyp2tgt", reuse=True)
-    tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True)
-    # We use the reconstruction only for tracking progress, no gradients here!
-    tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))
-
-    kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps)
-    losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss,
-              "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
-              "input_kl": kl_loss1 * kl_rev_decay * 15.0,
-              "target_kl": kl_loss2 * kl_rev_decay * 15.0,
-              "discriminator": dloss}
-    return tgt, losses
-
-
-@registry.register_model
-class CycleVaeGAN(t2t_model.T2TModel):
-
-  def model_fn_body(self, features):
-    return cycle_vae_gan_internal(
-        features["inputs"], features["targets"], features["target_space_id"],
-        self._hparams)
-
-
 @registry.register_hparams
 def cycle_gan_small():
   """Set of hyperparameters."""
-  hparams = transformer_vae.transformer_vae_small()
+  hparams = transformer_vae.transformer_ae_small()
   hparams.batch_size = 2048
   hparams.input_modalities = "inputs:symbol:identity"
   hparams.target_modality = "symbol:identity"
diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py
index 1c566e996..025f8d631 100644
--- a/tensor2tensor/models/transformer_vae.py
+++ b/tensor2tensor/models/transformer_vae.py
@@ -253,18 +253,25 @@ def ae_decompress(z, ae, x, is_2d, hparams, name, reuse=None):
 
   # Decompress.
   d = z
+  k = (3, 3) if is_2d else (3, 1)
   for i in xrange(hparams.num_compress_steps):
     j = hparams.num_compress_steps - i - 1
-    d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j)
+    d = residual_conv(d, 1, k, hparams, "decompress_rc_%d" % j)
     d = decompress_step(d, None, hparams, i > 0, is_2d, "decompress_%d" % j)
 
-  k = 2**hparams.num_compress_steps
-  z_batch = tf.reshape(z, [-1, 1, 1, hparams.hidden_size])
-  x_batch = tf.reshape(x, [-1, k, 1, hparams.hidden_size])
-  d_batch = tf.reshape(d, [-1, k, 1, hparams.hidden_size])
-  dec_batch = decode(z_batch, d_batch, x_batch, None, None, hparams)
-  z = tf.reshape(dec_batch, [-1, tf.shape(x)[1], 1, hparams.hidden_size])
-
+  # Autoregressive part.
+  if not is_2d:  # Currently we don't do it autoregressively for 2d problems.
+    k = 2**(hparams.num_compress_steps * (2 if is_2d else 1))
+    z_batch = tf.reshape(z, [-1, 1, 1, hparams.hidden_size])
+    x_batch = tf.reshape(x, [-1, k, 1, hparams.hidden_size])
+    d_batch = tf.reshape(d, [-1, k, 1, hparams.hidden_size])
+    dec_batch = decode(z_batch, d_batch, x_batch, None, None, hparams)
+  else:  # For non-autoregressive.
+    dec_batch = d
+  z = tf.reshape(dec_batch, [-1, tf.shape(x)[1], tf.shape(x)[2],
+                             hparams.hidden_size])
+  if is_2d:
+    z = tf.layers.dense(z, hparams.hidden_size * 3)
   return z
 
 
@@ -286,11 +293,14 @@ def ae_transformer_internal(inputs, targets, target_space, hparams):
     inputs, ed = encode(inputs, target_space, hparams, "input_enc")
 
     # Compress and ae.
-    ae, hot, kl = ae_compress(targets, False, hparams, "ae")
+    ae, hot, kl = ae_compress(targets, hparams.is_2d, hparams, "ae")
+    tf.summary.histogram("hot", tf.reshape(tf.argmax(hot, axis=-1), [-1]))
     emb = ae_embed(hot, hparams, "ae", reuse=True)
 
     # Compress context and run autoregressive decoder on emb-hot.
-    dec_c = decode(None, None, emb, inputs, ed, hparams)
+    emb_flat = tf.expand_dims(common_layers.flatten4d3d(emb), axis=2)
+    dec_c = decode(None, None, emb_flat, inputs, ed, hparams)
+    dec_c = tf.reshape(dec_c, tf.shape(emb))
     c_z = tf.layers.dense(dec_c, hparams.v_size, name="mask_context")
     reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=hot, logits=c_z)
@@ -299,8 +309,8 @@ def ae_transformer_internal(inputs, targets, target_space, hparams):
       hot = tf.one_hot(tf.argmax(c_z, axis=-1), hparams.v_size)
 
     # Decompress, pass for ae loss.
-    z = ae_decompress(emb, ae, targets, False, hparams, "ae")
-    kl *= common_layers.inverse_exp_decay(int(hparams.startup_steps * 0.5))
+    z = ae_decompress(emb, ae, targets, hparams.is_2d, hparams, "ae")
+    kl *= common_layers.inverse_exp_decay(int(hparams.startup_steps * 0.8))
     reconstruct_loss *= common_layers.inverse_exp_decay(hparams.startup_steps)
     losses = {"kl": kl, "reconstruction": reconstruct_loss}
     return z, losses
@@ -365,6 +375,18 @@ def transformer_ae_small():
   hparams.add_hparam("startup_steps", 30000)
   hparams.add_hparam("kmeans_lr_factor", 0.002)
   hparams.add_hparam("z_dropout", 0.1)
+  hparams.add_hparam("is_2d", 0)
+  return hparams
+
+
+@registry.register_hparams
+def transformer_ae_cifar():
+  hparams = transformer_ae_small()
+  hparams.batch_size = 1024 * 16
+  hparams.num_compress_steps = 2
+  hparams.v_size = 1024 * 16
+  hparams.startup_steps = 120000
+  hparams.is_2d = 1
   return hparams
 

From c7636a372e5575c040cfcb8a574bd0b0387da53e Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 31 Aug 2017 14:46:43 -0700
Subject: [PATCH 04/32] Transformer hparams fall back on `num_hidden_layers`

PiperOrigin-RevId: 167194460
---
 tensor2tensor/models/transformer.py      | 17 +++++++++-------
 tensor2tensor/models/transformer_test.py | 25 ------------------------
 2 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index 105d9eb32..41bfa5b7f 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -190,7 +190,8 @@ def transformer_encoder(encoder_input,
   """
   x = encoder_input
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_encoder_layers):
+    for layer in xrange(
+        hparams.num_encoder_layers or hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope("self_attention"):
           y = common_attention.multihead_attention(
@@ -233,7 +234,8 @@ def transformer_decoder(decoder_input,
   """
   x = decoder_input
   with tf.variable_scope(name):
-    for layer in xrange(hparams.num_decoder_layers):
+    for layer in xrange(
+        hparams.num_decoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
@@ -323,11 +325,12 @@ def transformer_base():
   hparams.label_smoothing = 0.1
   hparams.shared_embedding_and_softmax_weights = int(True)
 
-  hparams.add_hparam("filter_size", 2048)  # Add new ones like this.
-  # layer-related flags
-  hparams.add_hparam("num_encoder_layers", hparams.num_hidden_layers)
-  hparams.add_hparam("num_decoder_layers", hparams.num_hidden_layers)
-  # attention-related flags
+  # Add new ones like this.
+  hparams.add_hparam("filter_size", 2048)
+  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
+  hparams.add_hparam("num_encoder_layers", 0)
+  hparams.add_hparam("num_decoder_layers", 0)
+  # Attention-related flags.
   hparams.add_hparam("num_heads", 8)
   hparams.add_hparam("attention_key_channels", 0)
   hparams.add_hparam("attention_value_channels", 0)
diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py
index 391824524..6c0eee203 100644
--- a/tensor2tensor/models/transformer_test.py
+++ b/tensor2tensor/models/transformer_test.py
@@ -64,31 +64,6 @@ def testTransformer(self):
       res = session.run(logits)
     self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
 
-  def testBeamDecodeVsGreedy(self):
-    model, features = self.getModel()
-
-    decode_length = 20
-
-    greedy_result, _, _ = model._greedy_infer(
-        features, decode_length, last_position_only=True)
-    greedy_result = tf.squeeze(greedy_result, axis=[2, 3])
-
-    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
-      beam_res = model._beam_decode(
-          features,
-          decode_length,
-          beam_size=1,
-          top_beams=1,
-          last_position_only=True,
-          alpha=1.0)
-
-    with self.test_session() as session:
-      session.run(tf.global_variables_initializer())
-      greedy_res, beam_res = session.run([greedy_result, beam_res])
-
-    self.assertEqual(beam_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
-    self.assertAllClose(greedy_res, beam_res)
-
 
 if __name__ == "__main__":
   tf.test.main()

From a317801dd7594b8b60a846e974d31ed426a8eeba Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Thu, 31 Aug 2017 15:13:22 -0700
Subject: [PATCH 05/32] Speed up Transformer using PadRemover

PiperOrigin-RevId: 167198565
---
 tensor2tensor/layers/common_attention.py | 16 +++++++++++++
 tensor2tensor/models/transformer.py      | 29 ++++++++++++++++++++----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 975ed94ae..7ed7799d0 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -215,6 +215,22 @@ def attention_bias_ignore_padding(memory_padding):
   return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1)
 
 
+def attention_bias_to_padding(attention_bias):
+  """Inverse of attention_bias_ignore_padding().
+
+  Args:
+    attention_bias: a `Tensor` with shape [batch, 1, 1, memory_length], as
+      returned by attention_bias_ignore_padding().
+
+  Returns:
+    a Tensor with shape [batch, memory_length] with 1.0 in padding positions
+    and 0.0 in non-padding positions.
+  """
+  # `attention_bias` is a large negative number in padding positions and 0.0
+  # elsewhere.
+  return tf.squeeze(tf.to_float(tf.less(attention_bias, -1)), axis=[1, 2])
+
+
 def attention_bias_prepend_inputs_full_attention(padding):
   """Create a bias tensor for prepend_mode="prepend_inputs_full_attention".
diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
index 41bfa5b7f..86b920dc5 100644
--- a/tensor2tensor/models/transformer.py
+++ b/tensor2tensor/models/transformer.py
@@ -30,6 +30,7 @@
 from tensor2tensor.layers import common_attention
 from tensor2tensor.layers import common_hparams
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import expert_utils
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
@@ -50,8 +51,8 @@ def model_fn_body(self, features):
     targets = common_layers.flatten4d3d(targets)
 
     (encoder_input, encoder_self_attention_bias,
-     encoder_decoder_attention_bias) = (transformer_prepare_encoder(
-         inputs, target_space, hparams))
+     encoder_decoder_attention_bias) = transformer_prepare_encoder(
+         inputs, target_space, hparams)
 
     (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder(
         targets, hparams)
@@ -202,8 +203,11 @@ def transformer_encoder(encoder_input,
               hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
           x = common_layers.layer_postprocess(x, y, hparams)
         with tf.variable_scope("ffn"):
+          pad_remover = expert_utils.PadRemover(
+              common_attention.attention_bias_to_padding(
+                  encoder_self_attention_bias))
           y = transformer_ffn_layer(
-              common_layers.layer_preprocess(x, hparams), hparams)
+              common_layers.layer_preprocess(x, hparams), hparams, pad_remover)
           x = common_layers.layer_postprocess(x, y, hparams)
   # if normalization is done in layer_preprocess, then it should also be done
   # on the output, since the output can grow very large, being the sum of
@@ -265,22 +269,37 @@ def transformer_decoder(decoder_input,
   return common_layers.layer_preprocess(x, hparams)
 
 
-def transformer_ffn_layer(x, hparams):
+def transformer_ffn_layer(x, hparams, pad_remover=None):
   """Feed-forward layer in the transformer.
 
   Args:
     x: a Tensor of shape [batch_size, length, hparams.hidden_size]
     hparams: hyperparameters for model
+    pad_remover: an expert_utils.PadRemover object tracking the padding
+      positions. If provided, when using convolutional settings, the padding
+      is removed before applying the convolution, and restored afterward. This
+      can give a significant speedup.
 
   Returns:
     a Tensor of shape [batch_size, length, hparams.hidden_size]
   """
   if hparams.ffn_layer == "conv_hidden_relu":
-    return common_layers.conv_hidden_relu(
+    # In simple convolution mode, use `pad_remover` to speed up processing.
+    if pad_remover:
+      original_shape = tf.shape(x)
+      # Collapse `x` across examples, and remove padding positions.
+      x = tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0))
+      x = tf.expand_dims(pad_remover.remove(x), axis=0)
+    conv_output = common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout)
+    if pad_remover:
+      # Restore `conv_output` to the original shape of `x`, including padding.
+      conv_output = tf.reshape(
+          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
+    return conv_output
   elif hparams.ffn_layer == "parameter_attention":
     return common_attention.parameter_attention(
         x, hparams.parameter_attention_key_channels or hparams.hidden_size,

From 1a9bdacf2fc4f87faa4da74908487a626e06c2db Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Fri, 1 Sep 2017 12:55:56 -0700
Subject: [PATCH 06/32] Bug fix and better documentation for normalizer_fn.

PiperOrigin-RevId: 167312851
---
 tensor2tensor/layers/common_layers.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
index 4b09e70cb..264c11cf6 100644
--- a/tensor2tensor/layers/common_layers.py
+++ b/tensor2tensor/layers/common_layers.py
@@ -628,11 +628,22 @@ def conv_block_internal(conv_fn,
   Returns:
     a Tensor.
   """
+  name = kwargs.pop("name") if "name" in kwargs else None
   mask = kwargs.pop("mask") if "mask" in kwargs else None
-  norm = kwargs.pop("normalizer_fn") if "normalizer_fn" in kwargs else None
-  if norm is None and "normalizer_fn" not in kwargs:
+
+  # Usage for normalizer_fn kwarg:
+  # if not specified, use layer norm
+  # if given normalizer_fn=None, don't use any normalization
+  # if given normalizer_fn=norm, use the specified norm function
+
+  use_layer_norm = "normalizer_fn" not in kwargs
+  norm = kwargs.pop("normalizer_fn", None)
+  use_normalizer_fn = use_layer_norm or norm
+
+  if use_layer_norm:
     norm = lambda x, name: layer_norm(x, filters, name=name)
+
   with tf.variable_scope(name, "conv_block", [inputs]):
     cur, counter = inputs, -1
     for dilation_rate, kernel_size in dilation_rates_and_kernel_sizes:
@@ -660,7 +671,7 @@ def conv_block_internal(conv_fn,
             name="conv_block_%d" % counter,
             use_bias=norm is None,
             **kwargs)
-        if norm is not None:
+        if use_normalizer_fn:
           cur = norm(cur, name="conv_block_norm_%d" % counter)
   return cur
 

From 956e767af673be6292e9b2d06e5ce15688ba76d9 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Fri, 1 Sep 2017 12:59:51 -0700
Subject: [PATCH 07/32] Use new dynamic window size group_by_window
 functionality in an OSS-compatible way

PiperOrigin-RevId: 167313309
---
 tensor2tensor/utils/data_reader.py | 41 +++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py
index d55911f19..09ef159a4 100644
--- a/tensor2tensor/utils/data_reader.py
+++ b/tensor2tensor/utils/data_reader.py
@@ -267,12 +267,14 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams,
     dataset = dataset.filter(
         lambda ex: _example_too_big(ex, batching_scheme["max_length"]))
 
-    dataset = bucket_by_sequence_length(dataset, _example_length,
-                                        batching_scheme["boundaries"],
-                                        batching_scheme["batch_sizes"],
-                                        batching_scheme["window_size"])
+    dataset = bucket_by_sequence_length(
+        dataset, _example_length, batching_scheme["boundaries"],
+        batching_scheme["batch_sizes"], batching_scheme["window_size"])
     # We reshuffle the batches to prevent many long-sequence batches at once.
-    if batching_scheme["shuffle_queue_size"] is not None:
+    # TODO(rsepassi): Rm hasattr call once new dynamic window size functionality
+    # is in a stable TF release.
+    if (batching_scheme["shuffle_queue_size"] is not None and
+        not hasattr(dataset, "apply")):
       dataset = dataset.shuffle(batching_scheme["shuffle_queue_size"])
     batched_examples = dataset.make_one_shot_iterator().get_next()
   return batched_examples
@@ -338,6 +340,12 @@ def example_to_bucket_id(example):
 
     return bucket_id
 
+  def window_size_fn(bucket_id):
+    # window size = batch size
+    batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
+    window_size = batch_sizes[bucket_id]
+    return window_size
+
   def batching_fn(bucket_id, grouped_dataset):
     batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
     batch_size = batch_sizes[bucket_id]
@@ -348,8 +356,16 @@ def batching_fn(bucket_id, grouped_dataset):
         for name, shape in grouped_dataset.output_shapes.items()])
     return grouped_dataset.padded_batch(batch_size, padded_shapes)
 
-  dataset = dataset.group_by_window(example_to_bucket_id, batching_fn,
-                                    window_size)
+  # TODO(rsepassi): Rm branch once the new group_by_window functionality is in
+  # a stable TF release.
+  if hasattr(dataset, "apply"):
+    # If the Dataset supports dynamic window size, use it.
+    dataset = dataset.apply(
+        tf.contrib.data.group_by_window,
+        args=(example_to_bucket_id, batching_fn, None, window_size_fn))
+  else:
+    dataset = dataset.group_by_window(example_to_bucket_id, batching_fn,
+                                      window_size)
   return dataset
 
@@ -398,8 +414,8 @@ def _batching_scheme(batch_size,
       * max_length: int, maximum length of an example
   """
   max_length = max_length or batch_size
-  boundaries = _bucket_boundaries(
-      max_length, min_length_bucket, length_bucket_step)
+  boundaries = _bucket_boundaries(max_length, min_length_bucket,
+                                  length_bucket_step)
   boundaries = [boundary * length_multiplier for boundary in boundaries]
   max_length *= length_multiplier
   batch_sizes = [
@@ -417,9 +433,10 @@ def _batching_scheme(batch_size,
       83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280,
       720720, 1081080, 1441440, 2162160, 2882880, 3603600, 4324320, 6486480,
       7207200, 8648640, 10810800, 14414400, 17297280, 21621600, 32432400,
-      36756720, 43243200, 61261200, 73513440, 110270160]
-  window_size = max([
-      i for i in highly_composite_numbers if i <= 3 * max_batch_size])
+      36756720, 43243200, 61261200, 73513440, 110270160
+  ]
+  window_size = max(
+      [i for i in highly_composite_numbers if i <= 3 * max_batch_size])
   divisors = [i for i in xrange(1, window_size + 1) if window_size % i == 0]
   batch_sizes = [max([d for d in divisors if d <= bs]) for bs in batch_sizes]
   window_size *= shard_multiplier

From f76ea08833639613287b2c46fa079bf5ef88207e Mon Sep 17 00:00:00 2001
From: T2T Team
Date: Fri, 1 Sep 2017 13:12:17 -0700
Subject: [PATCH 08/32] Fixed typo.

PiperOrigin-RevId: 167314859
---
 tensor2tensor/utils/yellowfin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensor2tensor/utils/yellowfin.py b/tensor2tensor/utils/yellowfin.py
index 450875fa5..a3f6a18a1 100644
--- a/tensor2tensor/utils/yellowfin.py
+++ b/tensor2tensor/utils/yellowfin.py
@@ -602,7 +602,7 @@ def minimize(self,
     Raises:
       ValueError: if no gradients are provided for any variable.
""" - grads_and_vars = self._optimizer.compute_gradients( + grads_and_vars = self._momentum_optimizer.compute_gradients( loss, var_list=var_list, gate_gradients=gate_gradients, From 0f3d76cc266c6a96f8093cd2ddca6bfc6f3cd721 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Mon, 4 Sep 2017 04:38:00 -0700 Subject: [PATCH 09/32] Enable easy dataset construction from Problems with Problem.dataset PiperOrigin-RevId: 167485065 --- .../data_generators/gene_expression.py | 5 +- tensor2tensor/data_generators/ice_parsing.py | 2 +- tensor2tensor/data_generators/image.py | 14 +- tensor2tensor/data_generators/imdb.py | 2 +- tensor2tensor/data_generators/problem.py | 176 +++++++++++++++++- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/multimodel_test.py | 2 +- tensor2tensor/models/slicenet_test.py | 2 +- tensor2tensor/problems.py | 36 ++++ tensor2tensor/problems_test.py | 60 ++++++ tensor2tensor/utils/data_reader.py | 2 + tensor2tensor/utils/trainer_utils.py | 2 +- 12 files changed, 284 insertions(+), 21 deletions(-) create mode 100644 tensor2tensor/problems.py create mode 100644 tensor2tensor/problems_test.py diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 0607aad15..43d5a6702 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -142,7 +142,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Shuffle generator_utils.shuffle_dataset(all_filepaths) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} @@ -159,9 +159,8 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): del mode - del hparams # Reshape targets to contain num_output_predictions per output timestep examples["targets"] = tf.reshape(examples["targets"], diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index 4fb0424bb..2aa261cd4 100644 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -109,7 +109,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.targeted_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index fbe91d70e..03cea1d02 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -105,7 +105,7 @@ def resize(img, size): examples["targets"] = resize(inputs, 32) return examples - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults p.input_modality = {"inputs": ("image:identity_no_pad", None)} p.target_modality = ("image:identity_no_pad", None) @@ -229,7 +229,7 @@ def feature_encoders(self, data_dir): "targets": text_encoder.SubwordTextEncoder(vocab_filename) } - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = 
defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} vocab_size = self._encoders["targets"].vocab_size @@ -267,7 +267,7 @@ def dev_shards(self): def generator(self, data_dir, tmp_dir, is_training): raise NotImplementedError() - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults small_modality = "%s:small_image_modality" % registry.Modalities.IMAGE modality = small_modality if self.is_small else registry.Modalities.IMAGE @@ -349,7 +349,7 @@ def is_small(self): def num_classes(self): return 1000 - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): # Just resize with area. if self._was_reversed: examples["inputs"] = tf.to_int64( @@ -565,7 +565,7 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): @registry.register_problem class ImageCifar10Tune(ImageMnistTune): - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): if mode == tf.contrib.learn.ModeKeys.TRAIN: examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) @@ -591,7 +591,7 @@ def generator(self, data_dir, tmp_dir, is_training): @registry.register_problem class ImageCifar10Plain(ImageCifar10): - def preprocess_examples(self, examples, mode, hparams): + def preprocess_examples(self, examples, mode, unused_hparams): return examples @@ -730,7 +730,7 @@ def feature_encoders(self, data_dir): encoder = text_encoder.SubwordTextEncoder(vocab_filename) return {"targets": encoder} - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} encoder = self._encoders["targets"] diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 281a03bee..4216747c4 100644 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -97,7 +97,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.generator(data_dir, tmp_dir, True), train_paths, self.generator(data_dir, tmp_dir, False), dev_paths) - def hparams(self, defaults, model_hparams): + def hparams(self, defaults, unused_model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = { diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index e4424e73e..d0ed6ad2c 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -18,10 +18,14 @@ from __future__ import division from __future__ import print_function +import collections import os +import random # Dependency imports +import six + from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import metrics @@ -30,6 +34,7 @@ import tensorflow as tf + class SpaceID(object): """Input and target space ids. 
Add more as needed.""" # Generic / unknown output space (default) @@ -92,6 +97,14 @@ class SpaceID(object): CPP_TOK = 28 +def default_model_hparams(): + return tf.contrib.training.HParams( + max_input_seq_length=0, + max_target_seq_length=0, + prepend_mode="none", + data_dir=None) + + def preprocess_examples_common(examples, hparams): """Preprocessing steps common to all models.""" if hparams.max_input_seq_length > 0: @@ -232,14 +245,23 @@ def __init__(self, was_reversed=False, was_copy=False): self._was_reversed = was_reversed self._was_copy = was_copy self._encoders = None + self._hparams = None + self._feature_info = None - def internal_build_encoders(self, data_dir): - self._encoders = self.feature_encoders(data_dir) + def get_feature_encoders(self, data_dir=None): + if self._encoders is None: + self._encoders = self.feature_encoders(data_dir) + return self._encoders - def internal_hparams(self, model_hparams): + def get_hparams(self, model_hparams=None): """Returns problem_hparams.""" + if self._hparams is not None: + return self._hparams + + assert model_hparams is not None + if self._encoders is None: - self.internal_build_encoders(model_hparams.data_dir) + self.get_feature_encoders(model_hparams.data_dir) hp = _default_hparams() ret = self.hparams(hp, model_hparams) @@ -255,7 +277,9 @@ def internal_hparams(self, model_hparams): _reverse_problem_hparams(hp) if self._was_copy: _copy_problem_hparams(hp) - return hp + + self._hparams = hp + return self._hparams def maybe_reverse_features(self, feature_map): if not self._was_reversed: @@ -268,6 +292,148 @@ def maybe_copy_features(self, feature_map): return feature_map["targets"] = feature_map["inputs"] + def dataset(self, + mode, + data_dir=None, + num_threads=None, + output_buffer_size=None, + shuffle_files=None, + hparams=None): + """Build a Dataset for this problem. + + Args: + mode: tf.estimator.ModeKeys; determines which files to read from. + data_dir: directory that contains data files. + num_threads: int, number of threads to use for decode and preprocess + Dataset.map calls. + output_buffer_size: int, how many elements to prefetch in Dataset.map + calls. + shuffle_files: whether to shuffle input files. Default behavior (i.e. when + shuffle_files=None) is to shuffle if mode == TRAIN. + hparams: tf.contrib.training.HParams; hparams to be passed to + Problem.preprocess_examples and Problem.hparams. If None, will use a + default set that is a no-op. + + Returns: + Dataset containing dict. 
+    """
+    assert data_dir
+
+    if hparams is None:
+      hparams = default_model_hparams()
+
+    if not hasattr(hparams, "data_dir"):
+      hparams.add_hparam("data_dir", data_dir)
+    if not hparams.data_dir:
+      hparams.data_dir = data_dir
+    # Construct the Problem's hparams so that items within it are accessible
+    _ = self.get_hparams(hparams)
+
+    base_filename = self.dataset_filename()
+    path = os.path.join(data_dir, base_filename)
+
+    # TODO(rsepassi): handle ModeKeys.PREDICT with placeholders
+    is_training = mode == tf.estimator.ModeKeys.TRAIN
+    if is_training:
+      suffix = "train"
+    elif mode == tf.estimator.ModeKeys.EVAL:
+      suffix = "dev"
+    else:
+      assert mode == "test"
+      suffix = "test"
+
+    filepattern = "%s-%s*" % (path, suffix)
+    data_fields, data_items_to_decoders = self.example_reading_spec()
+    if data_items_to_decoders is None:
+      data_items_to_decoders = {
+          field: tf.contrib.slim.tfexample_decoder.Tensor(field)
+          for field in data_fields
+      }
+
+    data_files = tf.contrib.slim.parallel_reader.get_data_files(filepattern)
+    if shuffle_files or shuffle_files is None and is_training:
+      random.shuffle(data_files)
+    dataset = tf.contrib.data.TFRecordDataset(data_files)
+
+    def decode_record(record):
+      """Serialized Example to dict of <feature name, Tensor>."""
+      decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder(
+          data_fields, data_items_to_decoders)
+
+      decode_items = list(data_items_to_decoders)
+      decoded = decoder.decode(record, items=decode_items)
+      return dict(zip(decode_items, decoded))
+
+    def preprocess(example):
+      example = self.preprocess_examples(example, mode, hparams)
+      self.maybe_reverse_features(example)
+      self.maybe_copy_features(example)
+      return example
+
+    dataset = dataset.map(decode_record, num_threads=num_threads)
+    dataset = dataset.map(
+        preprocess,
+        num_threads=num_threads,
+        output_buffer_size=output_buffer_size)
+
+    return dataset
+
+  @property
+  def feature_info(self):
+    """Retrieve dict <feature name, FeatureInfo>.
+
+    Must first call Problem.get_hparams or Problem.dataset to have the problem's
+    internal hparams already constructed.
+
+    Returns:
+      dict<feature name, FeatureInfo>
+    """
+    if self._feature_info is not None:
+      return self._feature_info
+
+    assert self._hparams is not None
+
+    hp = self.get_hparams()
+    input_mods = hp.input_modality
+    target_mod = hp.target_modality
+    vocabs = hp.vocabulary
+    in_id = hp.input_space_id
+    out_id = hp.target_space_id
+
+    features = collections.defaultdict(FeatureInfo)
+
+    for name, mod_spec in six.iteritems(input_mods):
+      mod, vocab_size = mod_spec
+      finfo = features[name]
+      finfo.modality = mod
+      finfo.vocab_size = vocab_size
+
+    mod, vocab_size = target_mod
+    features["targets"].modality = mod
+    features["targets"].vocab_size = vocab_size
+
+    for name, encoder in six.iteritems(vocabs):
+      features[name].encoder = encoder
+
+    features["inputs"].space_id = in_id
+    features["targets"].space_id = out_id
+
+    self._feature_info = features
+    return features
+
+
+class FeatureInfo(object):
+
+  def __init__(self,
+               encoder=None,
+               modality=None,
+               vocab_size=None,
+               space_id=None):
+    self.encoder = encoder
+    self.modality = modality
+    self.vocab_size = vocab_size
+    self.space_id = space_id
+
 
 def _copy_problem_hparams(p_hparams):
   """Use input modality, vocab, and space id for target."""
diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py
index cc4cd1200..e46e81859 100644
--- a/tensor2tensor/models/gene_expression_test.py
+++ b/tensor2tensor/models/gene_expression_test.py
@@ -70,7 +70,7 @@ def testGeneExpressionModels(self):
                       gene_expression_conv_test())]
     for model_cls, hparams in models_hparams:
       hparams.add_hparam("data_dir", None)
-      p_hparams = gene_data.GenomicsExpressionCage10().internal_hparams(hparams)
+      p_hparams = gene_data.GenomicsExpressionCage10().get_hparams(hparams)
       hparams.problems = [p_hparams]
       self._testModel(hparams, model_cls)
 
diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py
index 73a8436cc..ab60bae97 100644
--- a/tensor2tensor/models/multimodel_test.py
+++ b/tensor2tensor/models/multimodel_test.py
@@ -38,7 +38,7 @@ def testMultiModel(self):
     hparams = multimodel.multimodel_tiny()
     hparams.add_hparam("data_dir", "")
     problem = registry.problem("image_cifar10")
-    p_hparams = problem.internal_hparams(hparams)
+    p_hparams = problem.get_hparams(hparams)
     hparams.problems = [p_hparams]
     with self.test_session() as session:
       features = {
diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py
index 388acde1b..c3a064a85 100644
--- a/tensor2tensor/models/slicenet_test.py
+++ b/tensor2tensor/models/slicenet_test.py
@@ -39,7 +39,7 @@ def testSliceNet(self):
     hparams = slicenet.slicenet_params1_tiny()
     hparams.add_hparam("data_dir", "")
     problem = registry.problem("image_cifar10")
-    p_hparams = problem.internal_hparams(hparams)
+    p_hparams = problem.get_hparams(hparams)
     hparams.problems = [p_hparams]
     with self.test_session() as session:
       features = {
diff --git a/tensor2tensor/problems.py b/tensor2tensor/problems.py
new file mode 100644
index 000000000..1e94c7bad
--- /dev/null
+++ b/tensor2tensor/problems.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Access T2T Problems. + +See problems_test.py for basic usage. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import +from tensor2tensor.utils import registry + + +def problem(name): + return registry.problem(name) + + +def available(): + return sorted(registry.list_problems()) diff --git a/tensor2tensor/problems_test.py b/tensor2tensor/problems_test.py new file mode 100644 index 000000000..de101e6e7 --- /dev/null +++ b/tensor2tensor/problems_test.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""tensor2tensor.problems test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor import problems + +import tensorflow as tf + +MODES = tf.estimator.ModeKeys + + +class ProblemsTest(tf.test.TestCase): + + def testBuildDataset(self): + # See all the available problems + self.assertTrue(len(problems.available()) > 10) + + # Retrieve a problem by name + problem = problems.problem("translate_ende_wmt8k") + + # Access train and dev datasets through Problem + train_dataset = problem.dataset(MODES.TRAIN) + dev_dataset = problem.dataset(MODES.EVAL) + + # Access vocab size and other info (e.g. the data encoders used to + # encode/decode data for the feature, used below) through feature_info. + feature_info = problem.feature_info + self.assertTrue(feature_info["inputs"].vocab_size > 0) + self.assertTrue(feature_info["targets"].vocab_size > 0) + + train_example = train_dataset.make_one_shot_iterator().get_next() + dev_example = dev_dataset.make_one_shot_iterator().get_next() + + with tf.Session() as sess: + train_ex_val, _ = sess.run([train_example, dev_example]) + _ = feature_info["inputs"].encoder.decode(train_ex_val["inputs"]) + _ = feature_info["targets"].encoder.decode(train_ex_val["targets"]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 09ef159a4..681f3598b 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -260,6 +260,8 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, num_threads = 4 if is_training else 1 with tf.name_scope("input_pipeline"): + # TODO(rsepassi): Once all problems use the Problem class, rm example + # reading, parsing, and preprocessing. 
Use Problem.dataset instead. dataset = read_examples(problem, data_file_pattern, capacity, mode=mode) dataset = dataset.map( lambda ex: _preprocess(ex, problem, data_file_pattern, hparams, mode), diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index a747b9a09..8539f4eb1 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -237,7 +237,7 @@ def add_problem_hparams(hparams, problems): if problem is None: p_hparams = problem_hparams.problem_hparams(problem_name, hparams) else: - p_hparams = problem.internal_hparams(hparams) + p_hparams = problem.get_hparams(hparams) hparams.problem_instances.append(problem) hparams.problems.append(p_hparams) From 636d2e1fd089290f28eaa45f2476cc00ce67d7a4 Mon Sep 17 00:00:00 2001 From: Katherine Lee Date: Mon, 4 Sep 2017 14:44:19 -0700 Subject: [PATCH 10/32] Add strokes SpaceID. PiperOrigin-RevId: 167518694 --- tensor2tensor/data_generators/all_problems.py | 1 - tensor2tensor/data_generators/problem.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py index f9afa895b..52354704d 100644 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -45,4 +45,3 @@ pass # pylint: enable=g-import-not-at-top # pylint: enable=unused-import - diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index d0ed6ad2c..302c51fa7 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -95,6 +95,8 @@ class SpaceID(object): PY_TOK = 27 # C++ CPP_TOK = 28 + # Strokes + STROKES = 29 def default_model_hparams(): From c25325be184bd555a1b0df0af021699996435f79 Mon Sep 17 00:00:00 2001 From: Katherine Lee Date: Mon, 4 Sep 2017 15:36:23 -0700 Subject: [PATCH 11/32] Merge from GitHub PiperOrigin-RevId: 167520632 --- docs/new_problem.md | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/new_problem.md b/docs/new_problem.md index c859c6eba..d581a3a1b 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -17,13 +17,15 @@ For each problem we want to tackle we create a new problem class and register it Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`). -For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `TranslateProblem`. Let's also register it while we're at it so we can specify the problem through flags. +For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `Text2TextProblem`. Let's also register it while we're at it so we can specify the problem through flags. ```python -@registry.register_problem() +@registry.register_problem class Word2def(problem.Text2TextProblem): """Problem spec for English word to dictionary definition.""" - return NotImplementedError() + @property + def is_character_level(self): + ... ``` We need to implement the following methods from `Text2TextProblem` in our new class: @@ -56,6 +58,8 @@ The number of shards to break data files into. 
 @registry.register_problem()
 class Word2def(problem.Text2TextProblem):
   """Problem spec for English word to dictionary definition."""
+
+  @property
   def is_character_level(self):
     return True
 
@@ -87,7 +91,6 @@ We're almost done. `generator` generates the training and evaluation data and st
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 ```
 
@@ -108,7 +111,6 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -137,14 +139,13 @@ I've gone ahead and split all words into a train and test set and saved them in
 ```python
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
-  [
-    "LOCATION_OF_DATA/", ("words_train.txt", "definitions_train.txt")
-  ]
+  LOCATION_OF_DATA + 'words_train.txt',
+  LOCATION_OF_DATA + 'definitions_train.txt'
 ]
+
 _WORD2DEF_TEST_DATASETS = [
-  [
-    "LOCATION_OF_DATA", ("words_test.txt", "definitions_test.txt")
-  ]
+  LOCATION_OF_DATA + 'words_test.txt',
+  LOCATION_OF_DATA + 'definitions_test.txt'
 ]
 ```
 
@@ -155,24 +156,14 @@ Now our `word2def.py` file looks like: (with the correct imports)
 """ Problem definition for word to dictionary definition. """
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import os
-import tarfile # do we need this import
 
-from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.wmt import character_generator
 from tensor2tensor.utils import registry
 
-import tensorflow as tf
-
-FLAGS = tf.flags.FLAGS
-
 # English Word2def datasets
 _WORD2DEF_TRAIN_DATASETS = [
     LOCATION_OF_DATA+'words_train.txt',
@@ -198,7 +189,6 @@ class Word2def(problem.Text2TextProblem):
   def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS
-    tag = "train" if train else "dev"
     return character_generator(datasets[0], datasets[1], character_vocab, EOS)
 
   @property
@@ -220,7 +210,17 @@ class Word2def(problem.Text2TextProblem):
 ```
 
 # Hyperparameters
-All hyperparamters inherit from `_default_hparams()` in `problem.py.` If you would like to customize your hyperparameters, add another method to the file `problem_hparams.py`.
+All hyperparameters inherit from `_default_hparams()` in `problem.py`. If you would like to customize your hyperparameters, register a new hyperparameter set in `word2def.py` like the example provided in the walkthrough. For example:
+
+```python
+from tensor2tensor.models import transformer
+
+@registry.register_hparams
+def word2def_hparams():
+  hparams = transformer.transformer_base_single_gpu()  # Or whatever you'd like to build off.
+  hparams.batch_size = 1024
+  return hparams
+```
 
 # Run the problem
 Now that we've gotten our problem set up, let's train a model and generate definitions.
 
 We specify our problem name, the model, and hparams.
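For reference, the lookup behind the `PROBLEM` flag can be sketched directly: `@registry.register_problem` keys a class under its snake_cased class name, so `Word2def` is retrievable as `word2def`. A minimal sketch, assuming `word2def.py` has been imported somewhere so its decorators have run:

```python
# Sketch: how PROBLEM=word2def is resolved through the registry.
# Assumes word2def.py is imported so @registry.register_problem has executed.
from tensor2tensor.utils import registry

problem = registry.problem("word2def")  # an instance of Word2def
print(type(problem).__name__)           # => "Word2def"
```

With that registration in place, the flags are: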
```bash PROBLEM=word2def MODEL=transformer -HPARAMS=transofmer_base_single_gpu +HPARAMS=word2def_hparams ``` The rest of the steps are as given in the [walkthrough](walkthrough.md). From 0de6f8c53204ebbce4cdabaaa32182d69571ad6c Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 5 Sep 2017 14:05:23 -0700 Subject: [PATCH 12/32] Save metadata (flags, hparams) on train PiperOrigin-RevId: 167628142 --- tensor2tensor/utils/trainer_utils.py | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 8539f4eb1..ee3445e26 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function +import os import sys # Dependency imports @@ -147,6 +148,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, """Create Experiment.""" hparams = create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir, passed_hparams=FLAGS.hparams) + if FLAGS.worker_id == 0: + save_metadata(output_dir, hparams) estimator, input_fns = create_experiment_components( hparams=hparams, output_dir=output_dir, @@ -245,6 +248,37 @@ def add_problem_hparams(hparams, problems): return hparams +def save_metadata(output_dir, hparams): + """Saves FLAGS and hparams to output_dir.""" + # Save FLAGS in txt file + if hasattr(FLAGS, "flags_into_string"): + flags_str = FLAGS.flags_into_string() + t2t_flags_str = "\n".join([ + "--%s=%s" % (f.name, f.value) + for f in FLAGS.flags_by_module_dict()[ + "tensor2tensor.utils.trainer_utils"] + ]) + else: + flags_dict = FLAGS.__dict__["__flags"] + flags_str = "\n".join( + ["--%s=%s" % (name, str(f.value)) for (name, f) in flags_dict.items()]) + t2t_flags_str = None + + flags_txt = os.path.join(output_dir, "flags.txt") + with tf.gfile.Open(flags_txt, "w") as f: + f.write(flags_str) + + if t2t_flags_str: + t2t_flags_txt = os.path.join(output_dir, "flags_t2t.txt") + with tf.gfile.Open(t2t_flags_txt, "w") as f: + f.write(t2t_flags_str) + + # Save hparams as hparams.json + hparams_fname = os.path.join(output_dir, "hparams.json") + with tf.gfile.Open(hparams_fname, "w") as f: + f.write(hparams.to_json()) + + def create_hparams(params_id, problems, data_dir, passed_hparams=None): """Returns hyperparameters, including any flag value overrides. From c46684f79620ae695e4c79708e3064ab2aea8b7d Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Wed, 6 Sep 2017 08:55:04 -0700 Subject: [PATCH 13/32] Attention experts uses local info for the FC. Fix long max_length size when batch_size is set through command line. 
Minor cleanup PiperOrigin-RevId: 167726943 --- tensor2tensor/models/attention_lm_moe.py | 63 ++++++++++++++++-------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 191d4aa04..eccf349c9 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -101,29 +101,30 @@ def _diet_expert(x): # should not either way) assert hparams.norm_type != "batch" + tf.logging.info("Applying Padding Remover for the attention experts") + dp_remove_pad = functools.partial( dp, remove_pad, pad_remover=pad_remover, mode=hparams.mode) dp_restore_pad = functools.partial( dp, restore_pad, ref_x=x, pad_remover=pad_remover, mode=hparams.mode) - elif (hparams.attention_type == AttentionType.MULTIHEAD or - hparams.attention_type == AttentionType.MEMORY_EFFICIENT): + else: # Using identity function: No effect - dp_remove_pad = lambda x: (x, None) + dp_remove_pad = lambda x: x dp_restore_pad = lambda x: x - else: - raise ValueError("Only {} supported for now.".format( - AttentionType.get_choices())) - def print_shape(x, suffix): + def print_shape(x, suffix, debug=False): # To help debugging, print the input/output shapes at inference and eval # Inference for long sequences can take a long time, so that's help to # see the progession of the generation - if hparams.mode == ModeKeys.TRAIN: + if not debug and hparams.mode == ModeKeys.TRAIN: return x return tf.Print(x, [tf.shape(x)], "shape_x_{}".format(suffix)) + batch_coordinate = dp(get_batch_coordinate, x) + batch_coordinate = dp_remove_pad(batch_coordinate) + x = dp(print_shape, x, "in") - x, batch_coordinate = dp_remove_pad(x) + x = dp_remove_pad(x) x = dp(print_shape, x, "in_flat") for layer in xrange(hparams.num_hidden_layers): @@ -188,12 +189,31 @@ def print_shape(x, suffix): x, hparams.filter_size) else: + x_in = preprocess(x) + additional_conv_params = dict() + if hparams.use_sepconv: + # Restore padding so sequences don't attend to each others + # restore_pad will apply a reshape like x_ref, to restore the + # original shape. Here this works because the last dimension is + # constant between the output of attention and the original input + # but it shouldn't necessarily be the case. + x_in = dp_restore_pad(x_in) + additional_conv_params = dict( + padding="LEFT", + # Parameters copied from the transformer model + kernel_size=(3, 1), + second_kernel_size=(31, 1), + ) y = dp( common_layers.conv_hidden_relu, - preprocess(x), + x_in, hparams.filter_size, hparams.hidden_size, - dropout=hparams.relu_dropout) + dropout=hparams.relu_dropout, + **additional_conv_params + ) + if hparams.use_sepconv: + y = dp_remove_pad(y) x = postprocess(x, y) x = preprocess(x) @@ -234,6 +254,14 @@ def attention_lm_moe_prepare_decoder(targets, hparams): return (decoder_input, decoder_self_attention_bias, pad_remover) +def get_batch_coordinate(x): + """Return a flat int32 tensor of shape [1, batch_size*length, 1].""" + # Compute the batch coordinate before flattening all batches + batch_coordinate = tf.expand_dims( + common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) + return batch_coordinate + + def remove_pad(x, pad_remover, mode): """Remove padding by concatenating all dimension into one. 
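For orientation before the next hunk: the trick that `remove_pad`/`restore_pad` implement can be sketched independently of the `PadRemover` utility. This is a minimal illustrative sketch assuming TF 1.x, and deliberately not the `expert_utils.PadRemover` API; the tensor names here are made up.

```python
# Illustrative sketch of padding removal (not the expert_utils.PadRemover API):
# flatten [batch, length, depth] to [batch*length, depth], drop the padded
# positions, then restore a leading axis so downstream code sees batch_size=1.
import tensorflow as tf

x = tf.random_normal([2, 4, 8])           # [batch, length, depth]
nonpad = tf.constant([[1., 1., 0., 0.],
                      [1., 1., 1., 0.]])  # 1.0 = real token, 0.0 = padding
flat_x = tf.reshape(x, [-1, 8])           # [batch*length, depth]
keep = tf.reshape(nonpad > 0., [-1])
x_nonpad = tf.boolean_mask(flat_x, keep)  # [length_nonpad, depth]
x_nonpad = tf.expand_dims(x_nonpad, 0)    # [1, length_nonpad, depth]
```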
@@ -247,11 +275,6 @@ def remove_pad(x, pad_remover, mode): tf.Tensor of shape [1,length_nonpad,depth] where length_nonpad <= batch_size*length """ - # Compute the batch coordinate before flattening all batches - batch_coordinate = tf.expand_dims( - common_attention.coordinate_tensor(tf.shape(x)[:-1], axis=0), axis=-1) - batch_coordinate = expert_utils.flatten_all_but_last(batch_coordinate) - # Concatenate all tokens (without padding) x = expert_utils.flatten_all_but_last(x) @@ -260,12 +283,10 @@ def remove_pad(x, pad_remover, mode): # This is a hack to allows inference when the token # is detected as padding and removed. This works for now because there is # no padding at inference. - batch_coordinate = pad_remover.remove(batch_coordinate) x = pad_remover.remove(x) - batch_coordinate = tf.expand_dims(batch_coordinate, axis=0) x = tf.expand_dims(x, axis=0) # Now batch_size=1 - return x, batch_coordinate + return x def restore_pad(x, ref_x, pad_remover, mode): @@ -328,6 +349,7 @@ def attention_lm_moe_base(): hparams.add_hparam("attention_v_size", 256) # Loss coef for load balancing hparams.add_hparam("attention_load_balance", 2e-2) + hparams.add_hparam("use_sepconv", int(False)) hparams.add_hparam("diet_experts", int(False)) hparams.add_hparam("memory_efficient_ffn", int(False)) return hparams @@ -338,7 +360,8 @@ def attention_lm_moe_base_ae(): """Base model with attention expert.""" hparams = attention_lm_moe_base() hparams.attention_type = AttentionType.LOCAL_EXPERTS - hparams.max_length = hparams.batch_size + hparams.use_sepconv = int(True) + hparams.max_length = 0 # max_length == batch_size hparams.eval_drop_long_sequences = int(True) hparams.min_length_bucket = 256 # Avoid cyclic problems for big batches hparams.learning_rate = 0.05 From 5767beceb71c56222f73cb41e70641c380636cb9 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 10:11:01 -0700 Subject: [PATCH 14/32] ClassLabelEncoder to map class ids to names PiperOrigin-RevId: 167736101 --- tensor2tensor/data_generators/image.py | 23 +++++++++++++++ tensor2tensor/data_generators/imdb.py | 2 +- tensor2tensor/data_generators/problem.py | 17 +++++++---- tensor2tensor/data_generators/text_encoder.py | 29 +++++++++++++++++++ 4 files changed, 64 insertions(+), 7 deletions(-) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index 03cea1d02..8d142d239 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -264,6 +264,17 @@ def train_shards(self): def dev_shards(self): return 1 + @property + def class_labels(self): + return ["ID_%d" % i for i in range(self.num_classes)] + + def feature_encoders(self, data_dir): + del data_dir + return { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.ClassLabelEncoder(self.class_labels) + } + def generator(self, data_dir, tmp_dir, is_training): raise NotImplementedError() @@ -491,6 +502,10 @@ def is_small(self): def num_classes(self): return 10 + @property + def class_labels(self): + return [str(c) for c in range(self.num_classes)] + @property def train_shards(self): return 10 @@ -564,6 +579,14 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): @registry.register_problem class ImageCifar10Tune(ImageMnistTune): + """Cifar-10 Tune.""" + + @property + def class_labels(self): + return [ + "airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", + "ship", "truck" + ] def preprocess_examples(self, examples, mode, unused_hparams): if mode == 
tf.contrib.learn.ModeKeys.TRAIN: diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 4216747c4..d7eadcd1d 100644 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -112,7 +112,7 @@ def feature_encoders(self, data_dir): encoder = text_encoder.SubwordTextEncoder(vocab_filename) return { "inputs": encoder, - "targets": text_encoder.TextEncoder(), + "targets": text_encoder.ClassLabelEncoder(["neg", "pos"]), } def example_reading_spec(self): diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 302c51fa7..4aa4862ef 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -300,7 +300,8 @@ def dataset(self, num_threads=None, output_buffer_size=None, shuffle_files=None, - hparams=None): + hparams=None, + preprocess=True): """Build a Dataset for this problem. Args: @@ -315,6 +316,8 @@ def dataset(self, hparams: tf.contrib.training.HParams; hparams to be passed to Problem.preprocess_examples and Problem.hparams. If None, will use a default set that is a no-op. + preprocess: bool, whether to map the Dataset through + Problem.preprocess_examples. Returns: Dataset containing dict. @@ -366,17 +369,19 @@ def decode_record(record): decoded = decoder.decode(record, items=decode_items) return dict(zip(decode_items, decoded)) - def preprocess(example): + def _preprocess(example): example = self.preprocess_examples(example, mode, hparams) self.maybe_reverse_features(example) self.maybe_copy_features(example) return example dataset = dataset.map(decode_record, num_threads=num_threads) - dataset = dataset.map( - preprocess, - num_threads=num_threads, - output_buffer_size=output_buffer_size) + + if preprocess: + dataset = dataset.map( + _preprocess, + num_threads=num_threads, + output_buffer_size=output_buffer_size) return dataset diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index ac9260cfa..97ab88402 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -154,6 +154,35 @@ def vocab_size(self): return 2**8 + self._num_reserved_ids +class ClassLabelEncoder(TextEncoder): + """Encoder for class labels.""" + + def __init__(self, class_labels=None, class_labels_fname=None): + super(ClassLabelEncoder, self).__init__(num_reserved_ids=0) + + assert class_labels or class_labels_fname + assert not (class_labels and class_labels_fname) + + if class_labels_fname: + with tf.gfile.Open(class_labels_fname) as f: + class_labels = [label.strip() for label in f.readlines()] + + self._class_labels = class_labels + + def encode(self, label_str): + return self._class_labels.index(label_str) + + def decode(self, label_id): + if isinstance(label_id, list): + assert len(label_id) == 1 + label_id, = label_id + return self._class_labels[label_id] + + @property + def vocab_size(self): + return len(self._class_labels) + + class TokenTextEncoder(TextEncoder): """Encoder based on a user-supplied vocabulary (file or list).""" From 78d8ddb349870400c89cd08c1c3e70bcc92f1f5f Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 6 Sep 2017 14:56:10 -0700 Subject: [PATCH 15/32] Add attention 2D functions over local_attention_2d PiperOrigin-RevId: 167777554 --- tensor2tensor/layers/common_attention.py | 119 +++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tensor2tensor/layers/common_attention.py 
b/tensor2tensor/layers/common_attention.py index 7ed7799d0..1053a69af 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -328,6 +328,19 @@ def split_heads(x, num_heads): return tf.transpose(split_last_dimension(x, num_heads), [0, 2, 1, 3]) +def split_heads_2d(x, num_heads): + """Split channels (dimension 4) into multiple heads (becomes dimension 1). + + Args: + x: a Tensor with shape [batch, height, width, channels] + num_heads: an integer + + Returns: + a Tensor with shape [batch, num_heads, height, width, channels / num_heads] + """ + return tf.transpose(split_last_dimension(x, num_heads), [0, 3, 1, 2, 4]) + + def combine_heads(x): """Inverse of split_heads. @@ -340,6 +353,18 @@ def combine_heads(x): return combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3])) +def combine_heads_2d(x): + """Inverse of split_heads_2d function. + + Args: + x: a Tensor with shape [batch, num_heads, height, width, channels/num_heads] + + Returns: + a Tensor with shape [batch, height, width, channels] + """ + return combine_last_two_dimensions(tf.transpose(x, [0, 2, 3, 1, 4])) + + def attention_image_summary(attn, image_shapes=None): """Compute color image summary. @@ -768,6 +793,43 @@ def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, return q, k, v +def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, + total_value_depth): + """Computes query, key and value of a 4D tensor. + + Args: + query_antecedent: a Tensor with shape [batch, h, w, depth_k] + memory_antecedent: a Tensor with shape [batch, h, w, depth_k] + total_key_depth: an integer + total_value_depth: and integer + + Returns: + q, k, v : [batch, h, w, depth_k] tensors + """ + # self attention with single position q, k, and v. + if memory_antecedent is None: + combined = tf.layers.conv2d( + query_antecedent, + total_key_depth * 2 + total_value_depth, (1, 1), + name="qkv_transform") + q, k, v = tf.split( + combined, [total_key_depth, total_key_depth, total_value_depth], + axis=-1) + return q, k, v + + # Encoder decoder attention. + q = common_layers.conv1d( + query_antecedent, total_key_depth, 1, name="q_transform") + combined = common_layers.conv1d( + memory_antecedent, + total_key_depth + total_value_depth, + 1, + name="kv_transform") + k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2) + + return q, k, v + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -849,6 +911,63 @@ def multihead_attention(query_antecedent, return x +def multihead_attention_2d(query_antecedent, + memory_antecedent, + total_key_depth, + total_value_depth, + output_depth, + num_heads, + attention_type="local_attention_2d", + block_length=128, + block_width=128, + name=None): + """2d Multihead scaled-dot-product attention with inp/output transformations. + + Args: + query_antecedent: a Tensor with shape [batch, h, w, depth_k] + memory_antecedent: a Tensor with shape [batch, h, w, depth_k] + total_key_depth: an integer + total_value_depth: an integer + output_depth: an integer + num_heads: an integer dividing total_key_depth and total_value_depth + attention_type: String, type of attention function to use. + block_length: an integer - relevant for "local_attention_2d" + block_width: an integer - relevant for "local_attention_2d" + name: an optional string + + Returns: + A Tensor of shape [batch, h, w, depth_k] + + Raises: + ValueError: if the key depth or value depth are not divisible by the + number of attention heads. 
+ """ + if total_key_depth % num_heads != 0: + raise ValueError("Key depth (%d) must be divisible by the number of " + "attention heads (%d)." % (total_key_depth, num_heads)) + if total_value_depth % num_heads != 0: + raise ValueError("Value depth (%d) must be divisible by the number of " + "attention heads (%d)." % (total_value_depth, num_heads)) + with tf.variable_scope( + name, + default_name="multihead_attention", + values=[query_antecedent, memory_antecedent]): + q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, + total_key_depth, total_value_depth) + + q = split_heads_2d(q, num_heads) + k = split_heads_2d(k, num_heads) + v = split_heads_2d(v, num_heads) + key_depth_per_head = total_key_depth // num_heads + q *= key_depth_per_head**-0.5 + if attention_type == "local_attention_2d": + x = local_attention_2d( + q, k, v, block_length=block_length, filter_flange=block_width) + x = tf.squeeze(combine_heads_2d(x), axis=-2) + x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + return x + + def ffn_self_attention_layer(x, filter_depth, output_depth, From 4794c20af3e0d104e38985a37cfa7244185cd13e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 16:13:02 -0700 Subject: [PATCH 16/32] GPU mem fraction default 0.95 to rm allocation error msg PiperOrigin-RevId: 167788682 --- tensor2tensor/utils/trainer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index ee3445e26..3248d9ca9 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -93,7 +93,7 @@ flags.DEFINE_integer("worker_gpu", 1, "How many GPUs to use.") flags.DEFINE_integer("worker_replicas", 1, "How many workers to use.") flags.DEFINE_integer("worker_id", 0, "Which worker task are we.") -flags.DEFINE_float("worker_gpu_memory_fraction", 1., +flags.DEFINE_float("worker_gpu_memory_fraction", 0.95, "Fraction of GPU memory to allocate.") flags.DEFINE_integer("ps_gpu", 0, "How many GPUs to use per ps.") flags.DEFINE_string("gpu_order", "", "Optional order for daisy-chaining gpus." 
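As a shape sanity check for the 2d attention helpers introduced above, `split_heads_2d` and `combine_heads_2d` round-trip a 4-D tensor through a 5-D per-head layout. A minimal sketch, assuming TF 1.x and that the new functions are importable from `common_attention`:

```python
# Shape round-trip for the 2d head-splitting helpers -- a sketch.
import tensorflow as tf
from tensor2tensor.layers import common_attention

x = tf.zeros([5, 32, 32, 16])              # [batch, height, width, channels]
h = common_attention.split_heads_2d(x, 4)  # [5, 4, 32, 32, 4]: channels split across 4 heads
y = common_attention.combine_heads_2d(h)   # [5, 32, 32, 16]: same shape as x
```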
From 74044ea9768fd41e90166305d041d67457955bfd Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 6 Sep 2017 16:32:29 -0700 Subject: [PATCH 17/32] Share one PadRemover across all Transformer encoder layers PiperOrigin-RevId: 167791186 --- tensor2tensor/models/transformer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 86b920dc5..38766ec19 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -191,6 +191,8 @@ def transformer_encoder(encoder_input, """ x = encoder_input with tf.variable_scope(name): + pad_remover = expert_utils.PadRemover( + common_attention.attention_bias_to_padding(encoder_self_attention_bias)) for layer in xrange( hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): @@ -203,9 +205,6 @@ def transformer_encoder(encoder_input, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): - pad_remover = expert_utils.PadRemover( - common_attention.attention_bias_to_padding( - encoder_self_attention_bias)) y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) From 665dbe8b92f827d68a7671fa15cbb6f0231de1ad Mon Sep 17 00:00:00 2001 From: Ashish Vaswani Date: Wed, 6 Sep 2017 16:43:15 -0700 Subject: [PATCH 18/32] 2d masked local attention. Each memory block can attend to a memory region top-left, top, and top-right. The mask ensures that we don't peek into the future. Refactored some functions out of local_attention_2d so that they could be shared. PiperOrigin-RevId: 167792489 --- tensor2tensor/layers/common_attention.py | 263 +++++++++++++----- tensor2tensor/layers/common_attention_test.py | 8 +- 2 files changed, 196 insertions(+), 75 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 1053a69af..84f8d2d9a 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -22,6 +22,7 @@ import math # Dependency imports +import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin @@ -354,10 +355,11 @@ def combine_heads(x): def combine_heads_2d(x): - """Inverse of split_heads_2d function. + """Inverse of split_heads_2d. Args: - x: a Tensor with shape [batch, num_heads, height, width, channels/num_heads] + x: a Tensor with shape + [batch, num_heads, height, width, channels / num_heads] Returns: a Tensor with shape [batch, height, width, channels] @@ -627,8 +629,8 @@ def pad_l_and_r(x, pad_length): def local_attention_2d(q, k, v, - block_length=128, - filter_flange=100, + query_shape=(8, 16), + memory_flange=(8, 16), name=None): """strided block local self-attention. @@ -636,8 +638,9 @@ def local_attention_2d(q, q: a Tensor with shape [batch, heads, h, w, depth_k] k: a Tensor with shape [batch, heads, h, w, depth_k] v: a Tensor with shape [batch, heads, h, w, depth_v] - block_length: an integer indicating the side length of each square block. - filter_flange: an integer indicating how much to look around each block. + query_shape: an tuple indicating the height and width of each query block. + memory_flange: an integer indicating how much to look in height and width + from each query block. 
name: an optional string Returns: @@ -651,68 +654,26 @@ def local_attention_2d(q, num_heads = tf.shape(q)[1] original_length = tf.shape(q)[2] * tf.shape(q)[3] - def reshape_range(tensor, i, j, shape): - """Reshapes a tensor between dimensions i and j.""" - target_shape = tf.concat( - [tf.shape(tensor)[:i], shape, tf.shape(tensor)[j:]], - axis=0) - return tf.reshape(tensor, target_shape) - - def pad_to_multiple(x, d): - """Making sure x is a multiple of d.""" - height_padding = -tf.shape(x)[1] % d - width_padding = -tf.shape(x)[2] % d - paddings = [[0, 0], [0, 0], [0, height_padding], - [0, width_padding], [0, 0]] - return tf.pad(x, paddings) - - def gather_indices(x, block_length, stride): - """Getting gather indices.""" - # making an identity matrix kernel - kernel = tf.eye(block_length ** 2) - kernel = reshape_range(kernel, 0, 1, [block_length, block_length, 1]) - # making indices [1, h, w, 1] to appy convs - indices = tf.range(0, tf.shape(x)[2] * tf.shape(x)[3], delta=1) - indices = tf.reshape(indices, [1, tf.shape(x)[2], tf.shape(x)[3], 1]) - indices = tf.nn.conv2d( - tf.cast(indices, tf.float32), - kernel, - strides=[1, stride, stride, 1], - padding="VALID") - # making indices [num_blocks, dim] to gather - num_blocks = tf.reduce_prod(tf.shape(indices)[:2]) - indices = tf.reshape(indices, [num_blocks, -1]) - return tf.cast(indices, tf.int32) - - def gather_blocks(x, indices): - """Gathers flattened blocks from x.""" - x_shape = tf.shape(x) - x = reshape_range(x, 2, 4, [tf.reduce_prod(x_shape[2:4])]) - # [length, batch, heads, dim] - x_t = tf.transpose(x, [2, 0, 1, 3]) - x_new = tf.gather(x_t, indices) - # returns [batch, heads, num_blocks, block_length ** 2, dim] - return tf.transpose(x_new, [2, 3, 0, 1, 4]) - - q = pad_to_multiple(q, block_length) - k = pad_to_multiple(k, block_length) - v = pad_to_multiple(v, block_length) + q = pad_to_multiple_2d(q, query_shape) + k = pad_to_multiple_2d(k, query_shape) + v = pad_to_multiple_2d(v, query_shape) # Setting up k and v values - paddings = [[0, 0], [0, 0], [filter_flange, filter_flange], - [filter_flange, filter_flange], [0, 0]] + paddings = [[0, 0], [0, 0], [memory_flange[0], memory_flange[1]], + [memory_flange[0], memory_flange[1]], [0, 0]] k = tf.pad(k, paddings) v = tf.pad(v, paddings) # Setting up q blocks - q_indices = gather_indices(q, block_length, block_length) - q_new = gather_blocks(q, q_indices) + q_indices = gather_indices_2d(q, query_shape, query_shape) + q_new = gather_blocks_2d(q, q_indices) # Setting up k and v blocks - full_filter_width = block_length + 2 * filter_flange - k_and_v_indices = gather_indices(k, full_filter_width, block_length) - k_new = gather_blocks(k, k_and_v_indices) - v_new = gather_blocks(v, k_and_v_indices) + memory_shape = (query_shape[0]+2*memory_flange[0], + query_shape[1]+2*memory_flange[1]) + k_and_v_indices = gather_indices_2d(k, memory_shape, query_shape) + k_new = gather_blocks_2d(k, k_and_v_indices) + v_new = gather_blocks_2d(v, k_and_v_indices) attention_bias = tf.expand_dims( tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) @@ -729,6 +690,159 @@ def gather_blocks(x, indices): return tf.reshape(output, v_shape) +def pad_to_multiple_2d(x, block_shape): + """Making sure x is a multiple of shape.""" + old_shape = x.get_shape().dims + last = old_shape[-1] + height_padding = -tf.shape(x)[1] % block_shape[0] + width_padding = -tf.shape(x)[2] % block_shape[1] + paddings = [[0, 0], [0, 0], [0, height_padding], + [0, width_padding], [0, 0]] + padded_x = tf.pad(x, paddings) + padded_shape = 
padded_x.get_shape().as_list() + padded_shape = padded_shape[:-1]+[last] + padded_x.set_shape(padded_shape) + return padded_x + + +def reshape_range(tensor, i, j, shape): + """Reshapes a tensor between dimensions i and j.""" + target_shape = tf.concat( + [tf.shape(tensor)[:i], shape, tf.shape(tensor)[j:]], + axis=0) + return tf.reshape(tensor, target_shape) + + +def gather_blocks_2d(x, indices): + """Gathers flattened blocks from x.""" + x_shape = tf.shape(x) + x = reshape_range(x, 2, 4, [tf.reduce_prod(x_shape[2:4])]) + # [length, batch, heads, dim] + x_t = tf.transpose(x, [2, 0, 1, 3]) + x_new = tf.gather(x_t, indices) + # returns [batch, heads, num_blocks, block_length ** 2, dim] + return tf.transpose(x_new, [2, 3, 0, 1, 4]) + + +def gather_indices_2d(x, block_shape, block_stride): + """Getting gather indices.""" + # making an identity matrix kernel + kernel = tf.eye(block_shape[0]*block_shape[1]) + kernel = reshape_range(kernel, 0, 1, [block_shape[0], block_shape[1], 1]) + # making indices [1, h, w, 1] to appy convs + indices = tf.range(0, tf.shape(x)[2] * tf.shape(x)[3], delta=1) + indices = tf.reshape(indices, [1, tf.shape(x)[2], tf.shape(x)[3], 1]) + indices = tf.nn.conv2d( + tf.cast(indices, tf.float32), + kernel, + strides=[1, block_stride[0], block_stride[1], 1], + padding="VALID") + # making indices [num_blocks, dim] to gather + num_blocks = tf.reduce_prod(tf.shape(indices)[:3]) + indices = tf.reshape(indices, [num_blocks, -1]) + return tf.cast(indices, tf.int32) + + +def masked_local_attention_2d(q, + k, + v, + query_shape=(8, 16), + memory_flange=(8, 16), + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, h, w, depth_k] + k: a Tensor with shape [batch, heads, h, w, depth_k] + v: a Tensor with shape [batch, heads, h, w, depth_v] + query_shape: an tuple indicating the height and width of each query block. + query_shape = block_shape + memory_flange: an integer indicating how much to look in height and width + from each query block. + memory shape = query_shape + (block_flange[0], 2*block_flange[1]) + name: an optional string + + Returns: + a Tensor of shape [batch, heads, h, w, depth_v] + """ + with tf.variable_scope( + name, default_name="local_masked_self_attention_2d", values=[q, k, v]): + v_shape = tf.shape(v) + depth_v = tf.shape(v)[4] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] * tf.shape(q)[3] + def make_mask(query_shape, memory_flange): + """creates a mask. + + The query mask can look to the left, top left, top, and top right, but + not the right. Inside the query, we have the standard raster scan + masking. + Args: + query_shape: A tuple of ints (query_height, query_width) + memory_flange: A tuple of ints + (memory_flange_height, memory_flange_width) + + Returns: + A tensor of shape query_size, memory_size + """ + + query_triangle = tf.matrix_band_part( + tf.ones([np.prod(query_shape), np.prod(query_shape)]), -1, 0) + split_query_masks = tf.split(query_triangle, query_shape[0], axis=1) + mask_pieces = [ + tf.concat( + [tf.ones([np.prod(query_shape), memory_flange[1]]), + split_query_masks[i], + tf.zeros([np.prod(query_shape), memory_flange[1]]) + ], axis=1) for i in range(query_shape[0])] + + final_mask = tf.concat( + [tf.ones( + [np.prod(query_shape), + (query_shape[1]+2*memory_flange[1])*memory_flange[0]]), + tf.concat(mask_pieces, axis=1) + ], axis=1) + # 0. is visible location, 1.0 is masked. + return 1. 
- final_mask + q = pad_to_multiple_2d(q, query_shape) + k = pad_to_multiple_2d(k, query_shape) + v = pad_to_multiple_2d(v, query_shape) + # Setting up k and v values. Padding top, left, and right + paddings = [[0, 0], [0, 0], [memory_flange[0], 0], + [memory_flange[1], memory_flange[1]], [0, 0]] + k = tf.pad(k, paddings) + v = tf.pad(v, paddings) + # Setting up q blocks + q_indices = gather_indices_2d(q, query_shape, query_shape) + q_new = gather_blocks_2d(q, q_indices) + # Setting up k and v blocks + memory_shape = (query_shape[0]+memory_flange[0], + query_shape[1]+memory_flange[1]*2) + k_and_v_indices = gather_indices_2d(k, memory_shape, query_shape) + k_new = gather_blocks_2d(k, k_and_v_indices) + v_new = gather_blocks_2d(v, k_and_v_indices) + logits = tf.matmul(q_new, k_new, transpose_b=True) + # Combining the mask for padding and visible region + attention_mask_shape = [np.prod(query_shape), + (query_shape[0]+memory_flange[0])* + (query_shape[1]+2*memory_flange[1])] + attention_mask = tf.cast(make_mask(query_shape, memory_flange), tf.bool) + # reshaping attention mask to have same dims as logits + attention_mask = tf.reshape(attention_mask, [1, 1, 1]+attention_mask_shape) + padding_mask = tf.expand_dims( + tf.cast(embedding_to_padding(k_new), tf.bool), axis=-2) + attention_bias = ( + tf.to_float(tf.logical_or(attention_mask, padding_mask)) *-1e9) + attention = tf.nn.softmax(logits + attention_bias) + output = tf.matmul(attention, v_new) + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + # [batch, heads, h, w, depth_v] + return tf.reshape(output, v_shape) + + def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, total_value_depth, q_filter_width=1, kv_filter_width=1, q_padding="VALID", kv_padding="VALID"): @@ -795,7 +909,7 @@ def compute_qkv(query_antecedent, memory_antecedent, total_key_depth, def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth): - """Computes query, key and value of a 4D tensor. + """Computes query, key and value. Args: query_antecedent: a Tensor with shape [batch, h, w, depth_k] @@ -806,7 +920,7 @@ def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, Returns: q, k, v : [batch, h, w, depth_k] tensors """ - # self attention with single position q, k, and v. + # self attention with single position q, k, and v if memory_antecedent is None: combined = tf.layers.conv2d( query_antecedent, @@ -817,7 +931,7 @@ def compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, axis=-1) return q, k, v - # Encoder decoder attention. + # Encoder decoder attention q = common_layers.conv1d( query_antecedent, total_key_depth, 1, name="q_transform") combined = common_layers.conv1d( @@ -918,8 +1032,8 @@ def multihead_attention_2d(query_antecedent, output_depth, num_heads, attention_type="local_attention_2d", - block_length=128, - block_width=128, + query_shape=(8, 16), + memory_flange=(8, 16), name=None): """2d Multihead scaled-dot-product attention with inp/output transformations. @@ -931,8 +1045,8 @@ def multihead_attention_2d(query_antecedent, output_depth: an integer num_heads: an integer dividing total_key_depth and total_value_depth attention_type: String, type of attention function to use. 
- block_length: an integer - relevant for "local_attention_2d" - block_width: an integer - relevant for "local_attention_2d" + query_shape: an tuple indicating the height and width of each query block. + memory_flange: an integer indicating how much to look in height and width name: an optional string Returns: @@ -954,7 +1068,7 @@ def multihead_attention_2d(query_antecedent, values=[query_antecedent, memory_antecedent]): q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth) - + # after splitting, shape is [batch, heads, h, w, depth] q = split_heads_2d(q, num_heads) k = split_heads_2d(k, num_heads) v = split_heads_2d(v, num_heads) @@ -962,9 +1076,16 @@ def multihead_attention_2d(query_antecedent, q *= key_depth_per_head**-0.5 if attention_type == "local_attention_2d": x = local_attention_2d( - q, k, v, block_length=block_length, filter_flange=block_width) - x = tf.squeeze(combine_heads_2d(x), axis=-2) - x = common_layers.conv1d(x, output_depth, 1, name="output_transform") + q, k, v, query_shape=query_shape, memory_flange=memory_flange) + else: + x = masked_local_attention_2d(q, k, v, query_shape=query_shape, + memory_flange=memory_flange) + x = combine_heads_2d(x) + x = tf.layers.conv2d( + x, + output_depth, + (1, 1), + name="output_transform") return x diff --git a/tensor2tensor/layers/common_attention_test.py b/tensor2tensor/layers/common_attention_test.py index 6664bcc2d..d8f6f2b39 100644 --- a/tensor2tensor/layers/common_attention_test.py +++ b/tensor2tensor/layers/common_attention_test.py @@ -98,8 +98,8 @@ def testLocalUnmaskedAttention2D(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=4, - filter_flange=3) + query_shape=(4, 4), + memory_flange=(3, 3)) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 25, 16)) @@ -112,8 +112,8 @@ def testLocalUnmaskedAttention2DMatchingBlockLength(self): tf.constant(x, dtype=tf.float32), tf.constant(y, dtype=tf.float32), tf.constant(y, dtype=tf.float32), - block_length=5, - filter_flange=3) + query_shape=(5, 5), + memory_flange=(3, 3)) session.run(tf.global_variables_initializer()) res = session.run(a) self.assertEqual(res.shape, (5, 4, 25, 25, 16)) From 2ebead2f451d30107c43f6f061998496978f5279 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 18:14:18 -0700 Subject: [PATCH 19/32] Move to core Estimator and improve decoding PiperOrigin-RevId: 167802133 --- tensor2tensor/data_generators/image.py | 4 +- tensor2tensor/layers/modalities.py | 2 +- tensor2tensor/layers/modalities_test.py | 4 +- tensor2tensor/models/attention_lm_moe.py | 6 +- tensor2tensor/models/bluenet_test.py | 2 +- tensor2tensor/models/bytenet_test.py | 2 +- tensor2tensor/models/gene_expression_test.py | 2 +- tensor2tensor/models/lstm.py | 4 +- tensor2tensor/models/lstm_test.py | 4 +- tensor2tensor/models/multimodel.py | 4 +- tensor2tensor/models/multimodel_test.py | 2 +- tensor2tensor/models/neural_gpu_test.py | 2 +- tensor2tensor/models/shake_shake.py | 2 +- tensor2tensor/models/slicenet_test.py | 2 +- tensor2tensor/models/transformer_moe.py | 4 +- tensor2tensor/models/transformer_revnet.py | 4 +- .../models/transformer_revnet_test.py | 2 +- tensor2tensor/models/transformer_test.py | 2 +- tensor2tensor/models/transformer_vae.py | 4 +- tensor2tensor/models/xception_test.py | 2 +- tensor2tensor/utils/data_reader.py | 10 +- tensor2tensor/utils/data_reader_test.py | 6 +- 
tensor2tensor/utils/decoding.py | 326 ++++++++++++------ tensor2tensor/utils/input_fn_builder.py | 19 +- tensor2tensor/utils/metrics.py | 13 +- tensor2tensor/utils/model_builder.py | 286 ++++++--------- tensor2tensor/utils/t2t_model.py | 4 +- tensor2tensor/utils/trainer_utils.py | 40 +-- tensor2tensor/utils/trainer_utils_test.py | 9 +- .../TransformerVisualization.ipynb | 16 +- 30 files changed, 427 insertions(+), 362 deletions(-) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index 8d142d239..06942ed3f 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -313,7 +313,7 @@ def resize(img): return tf.to_int64(tf.image.resize_images(img, [299, 299])) inputs = tf.cast(examples["inputs"], tf.int64) - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: examples["inputs"] = tf.cond( # Preprocess 90% of the time. tf.less(tf.random_uniform([]), 0.9), lambda img=inputs: preprocess(img), @@ -589,7 +589,7 @@ def class_labels(self): ] def preprocess_examples(self, examples, mode, unused_hparams): - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: examples["inputs"] = common_layers.cifar_image_augmentation( examples["inputs"]) return examples diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index c93a05433..1d606ec1d 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -113,7 +113,7 @@ def top(self, body_output, _): with tf.variable_scope(scope_name, reuse=reuse): var = self._get_weights() if (self._model_hparams.factored_logits and - self._model_hparams.mode == tf.contrib.learn.ModeKeys.TRAIN): + self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN): # insert channels dimension body_output = tf.expand_dims(body_output, 3) logits = common_layers.FactoredTensor(body_output, var) diff --git a/tensor2tensor/layers/modalities_test.py b/tensor2tensor/layers/modalities_test.py index 5813422ab..93dda6d09 100644 --- a/tensor2tensor/layers/modalities_test.py +++ b/tensor2tensor/layers/modalities_test.py @@ -67,7 +67,7 @@ def testSymbolModalityTargets(self): label_smoothing=0.2, shared_embedding_and_softmax_weights=0, factored_logits=0, - mode=tf.contrib.learn.ModeKeys.TRAIN) + mode=tf.estimator.ModeKeys.TRAIN) body_output = -1 + np.random.random_integers( 100, size=(batch_size, length, height, hidden_size)) targets = -1 + np.random.random_integers( @@ -101,7 +101,7 @@ def testSymbolModalityTargetsFactored(self): label_smoothing=0.2, shared_embedding_and_softmax_weights=0, factored_logits=1, - mode=tf.contrib.learn.ModeKeys.TRAIN) + mode=tf.estimator.ModeKeys.TRAIN) body_output = -1 + np.random.random_integers( 100, size=(batch_size, length, height, hidden_size)) targets = -1 + np.random.random_integers( diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index eccf349c9..596d5b01d 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -42,7 +42,7 @@ import tensorflow as tf -ModeKeys = tf.contrib.learn.ModeKeys # pylint: disable=invalid-name +ModeKeys = tf.estimator.ModeKeys # pylint: disable=invalid-name class AttentionType(object): @@ -279,7 +279,7 @@ def remove_pad(x, pad_remover, mode): x = expert_utils.flatten_all_but_last(x) # Remove padding for training and eval - if mode != ModeKeys.INFER: + if mode != ModeKeys.PREDICT: # This is a hack to allows inference when the token # is 
detected as padding and removed. This works for now because there is # no padding at inference. @@ -291,7 +291,7 @@ def remove_pad(x, pad_remover, mode): def restore_pad(x, ref_x, pad_remover, mode): x = tf.squeeze(x, axis=0) - if mode != ModeKeys.INFER: + if mode != ModeKeys.PREDICT: x = pad_remover.restore(x) x = expert_utils.reshape_like(x, ref_x) return x diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index 70b8defe9..d559fd953 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -45,7 +45,7 @@ def testBlueNet(self): "targets": tf.constant(y, dtype=tf.int32), } model = bluenet.BlueNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 536d348e7..56f421153 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -44,7 +44,7 @@ def testByteNet(self): "targets": tf.constant(y, dtype=tf.int32), } model = bytenet.ByteNet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index e46e81859..ea02572d0 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -55,7 +55,7 @@ def _testModel(self, hparams, model_cls): "targets": tf.constant(targets, dtype=tf.float32), } p_hparams, = hparams.problems - sharded_logits, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + sharded_logits, _ = model_cls(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams).model_fn(features) logits = tf.concat(sharded_logits, 0) diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index d79b04494..9f909433e 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -251,7 +251,7 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): class LSTMSeq2seq(t2t_model.T2TModel): def model_fn_body(self, features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN return lstm_seq2seq_internal(features["inputs"], features["targets"], self._hparams, train) @@ -260,7 +260,7 @@ def model_fn_body(self, features): class LSTMSeq2seqAttention(t2t_model.T2TModel): def model_fn_body(self, features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN return lstm_seq2seq_internal_attention( features["inputs"], features["targets"], self._hparams, train) diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 7da3d2380..c1190d016 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -44,7 +44,7 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2seq(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = lstm.LSTMSeq2seq(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) @@ 
-69,7 +69,7 @@ def testLSTMSeq2SeqAttention(self): "targets": tf.constant(y, dtype=tf.int32), } model = lstm.LSTMSeq2seqAttention( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index c8d515c8d..5df8fcd3c 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -74,7 +74,7 @@ def residual_fn3(x, y, z, hparams): def conv_experts(xs, hparams, dp, ps, padding, mask, layer_id): """Convolutions + Mixture-of-Experts layer.""" del layer_id # Unused. - train = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + train = hparams.mode == tf.estimator.ModeKeys.TRAIN, conv_out = dp(conv_res_step, xs, hparams, padding, mask) loss = 0.0 moe_hidden_sizes = [hparams.filter_size] @@ -109,7 +109,7 @@ def prepare_decoder(targets, target_space_emb): class MultiModel(t2t_model.T2TModel): def model_fn_body_sharded(self, sharded_features): - train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN dp = self._data_parallelism hparams = self._hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index ab60bae97..3aff41029 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -47,7 +47,7 @@ def testMultiModel(self): "target_space_id": tf.constant(1, dtype=tf.int32), } model = multimodel.MultiModel( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index b7a1e98f7..164623699 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -50,7 +50,7 @@ def testNeuralGPU(self): "inputs": tf.constant(inputs, dtype=tf.int32), "targets": tf.constant(targets, dtype=tf.int32) } - model = neural_gpu.NeuralGPU(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = neural_gpu.NeuralGPU(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) diff --git a/tensor2tensor/models/shake_shake.py b/tensor2tensor/models/shake_shake.py index a7b379e11..a4dd2385a 100644 --- a/tensor2tensor/models/shake_shake.py +++ b/tensor2tensor/models/shake_shake.py @@ -64,7 +64,7 @@ def shake_shake_block(x, conv_filters, stride, hparams): skip = downsampling_residual_branch(x, conv_filters) # TODO(rshin): Use different alpha for each image in batch. 
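The recurring change across these hunks is mechanical: every reference to `tf.contrib.learn.ModeKeys` becomes `tf.estimator.ModeKeys`, and the old `INFER` key becomes `PREDICT`. A minimal sketch of the correspondence, assuming a TF 1.x release that ships `tf.estimator` (the underlying string values line up, which is what keeps the rename safe):

```python
import tensorflow as tf

# tf.contrib.learn.ModeKeys   ->  tf.estimator.ModeKeys
#   TRAIN ("train")           ->    TRAIN   ("train")
#   EVAL  ("eval")            ->    EVAL    ("eval")
#   INFER ("infer")           ->    PREDICT ("infer")

def mode_flags(mode):
  """Returns (is_training, is_predicting) for an estimator mode key."""
  return (mode == tf.estimator.ModeKeys.TRAIN,
          mode == tf.estimator.ModeKeys.PREDICT)
```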
- if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN: + if hparams.mode == tf.estimator.ModeKeys.TRAIN: if hparams.shakeshake_type == "batch": shaken = common_layers.shakeshake2(branch1, branch2) elif hparams.shakeshake_type == "image": diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index c3a064a85..faf028737 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -47,7 +47,7 @@ def testSliceNet(self): "targets": tf.constant(y, dtype=tf.int32), "target_space_id": tf.constant(1, dtype=tf.int32), } - model = slicenet.SliceNet(hparams, tf.contrib.learn.ModeKeys.TRAIN, + model = slicenet.SliceNet(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index 669b1842b..c8a32a667 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -91,7 +91,7 @@ def postprocess(x, y): dp, self._ps_devices, preprocess(x), - hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.mode == tf.estimator.ModeKeys.TRAIN, input_size=hparams.hidden_size, expert_fn=expert_fn, num_experts=hparams.moe_num_experts, @@ -140,7 +140,7 @@ def postprocess(x, y): dp, self._ps_devices, preprocess(x), - hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, + hparams.mode == tf.estimator.ModeKeys.TRAIN, input_size=hparams.hidden_size, expert_fn=expert_fn, num_experts=hparams.moe_num_experts, diff --git a/tensor2tensor/models/transformer_revnet.py b/tensor2tensor/models/transformer_revnet.py index 942a00660..7275c370a 100644 --- a/tensor2tensor/models/transformer_revnet.py +++ b/tensor2tensor/models/transformer_revnet.py @@ -131,7 +131,7 @@ def g(x): g, num_layers=hparams.num_hidden_layers, f_side_input=[encoder_self_attention_bias], - is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN) y = tf.concat([y1, y2], axis=-1) return common_layers.layer_preprocess(y, hparams) @@ -212,7 +212,7 @@ def g(x): decoder_self_attention_bias, encoder_decoder_attention_bias, encoder_output ], - is_training=hparams.mode == tf.contrib.learn.ModeKeys.TRAIN) + is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN) y = tf.concat([y1, y2], axis=-1) return common_layers.layer_preprocess(y, hparams) diff --git a/tensor2tensor/models/transformer_revnet_test.py b/tensor2tensor/models/transformer_revnet_test.py index 66b493b0b..f9bc8cfb2 100644 --- a/tensor2tensor/models/transformer_revnet_test.py +++ b/tensor2tensor/models/transformer_revnet_test.py @@ -59,7 +59,7 @@ def testTransformer(self): "target_space_id": tf.constant(1, dtype=tf.int32), } model = transformer_revnet.TransformerRevnet( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) grads = tf.gradients( diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index 6c0eee203..9e450a670 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -53,7 +53,7 @@ def getModel(self): } return transformer.Transformer( - hparams, tf.contrib.learn.ModeKeys.INFER, p_hparams), features + hparams, tf.estimator.ModeKeys.PREDICT, p_hparams), features def testTransformer(self): model, features = self.getModel() diff --git 
a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 025f8d631..e3279495a 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -244,7 +244,7 @@ def ae_decompress(z, ae, x, is_2d, hparams, name, reuse=None): # Leak at the beginning to help train. z = mix(z, ae, hparams.startup_steps) prob_z = common_layers.inverse_exp_decay(hparams.startup_steps) * 0.8 - prob_z = prob_z if hparams.mode == tf.contrib.learn.ModeKeys.TRAIN else 1.0 + prob_z = prob_z if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 z = tf.cond(tf.less(tf.random_uniform([]), prob_z), lambda: z, lambda: ae) @@ -305,7 +305,7 @@ def ae_transformer_internal(inputs, targets, target_space, hparams): reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits( labels=hot, logits=c_z) # If not training, use the predicted z instead of the autoregressive one. - if hparams.mode == tf.contrib.learn.ModeKeys.INFER: + if hparams.mode == tf.estimator.ModeKeys.PREDICT: hot = tf.one_hot(tf.argmax(c_z, axis=-1), hparams.v_size) # Decompress, pass for ae loss. diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index 776d1306a..eb4c6db20 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -44,7 +44,7 @@ def testXception(self): "targets": tf.constant(y, dtype=tf.int32), } model = xception.Xception( - hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 681f3598b..f48665078 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -215,7 +215,7 @@ def default_example_reading_spec(data_file_pattern): def read_examples(problem, data_file_pattern, capacity, - mode=tf.contrib.learn.ModeKeys.TRAIN): + mode=tf.estimator.ModeKeys.TRAIN): """Create Dataset of Example for problem and data_file_pattern.""" if problem is None: data_fields, data_items_to_decoders = default_example_reading_spec( @@ -227,7 +227,7 @@ def read_examples(problem, # Create placeholders for input, rather than reading data from disk. return feature_placeholders(data_fields) - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN dataset = examples_reader( [data_file_pattern], data_fields, @@ -245,7 +245,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, problem: Problem instance for which to build the input pipeline. data_file_pattern: file pattern for input files. capacity: int, data pipeline buffer capacity. - mode: tf.contrib.learn.ModeKeys entry. + mode: tf.estimator.ModeKeys entry. hparams: an HParams object. 
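The `image.py` hunk earlier in this diff and the `transformer_vae.py` hunk above share a pattern: a scalar `tf.random_uniform` is compared against a probability and `tf.cond` picks between two branches, but only when the mode is `TRAIN`. A minimal sketch of the pattern (illustrative, not code from the patch):

```python
import tensorflow as tf

def maybe_apply(fn, x, prob, mode):
  """At TRAIN time, applies `fn` to `x` with probability `prob`;
  in every other mode, passes `x` through unchanged."""
  if mode != tf.estimator.ModeKeys.TRAIN:
    return x
  return tf.cond(
      tf.less(tf.random_uniform([]), prob),
      lambda: fn(x),
      lambda: x)
```

`ae_decompress` above expresses the same idea by forcing `prob_z` to 1.0 outside of training, so its `tf.cond` deterministically takes the `z` branch at eval and inference time.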
batching_scheme: a dictionary containing "boundaries": a list of integers for the boundaries that will be @@ -256,7 +256,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, Returns: dict """ - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN num_threads = 4 if is_training else 1 with tf.name_scope("input_pipeline"): @@ -505,7 +505,7 @@ def get_data_filepatterns(problems, data_dir, mode): except ValueError: problem, _, _ = problem_hparams.parse_problem_name(problem) path = os.path.join(data_dir, problem) - if mode == tf.contrib.learn.ModeKeys.TRAIN: + if mode == tf.estimator.ModeKeys.TRAIN: datasets.append("%s-train*" % path) else: datasets.append("%s-dev*" % path) diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 991669a99..aed2598c7 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -70,7 +70,7 @@ def preprocess_examples(self, examples, unused_mode, unused_hparams): def generate_test_data(problem, tmp_dir): problem.generate_data(tmp_dir, tmp_dir) filepatterns = data_reader.get_data_filepatterns( - problem.name, tmp_dir, tf.contrib.learn.ModeKeys.TRAIN) + problem.name, tmp_dir, tf.estimator.ModeKeys.TRAIN) assert tf.gfile.Glob(filepatterns[0]) return filepatterns @@ -115,7 +115,7 @@ def testTrainEvalBehavior(self): self.problem, self.filepatterns[0], 16, - mode=tf.contrib.learn.ModeKeys.EVAL) + mode=tf.estimator.ModeKeys.EVAL) eval_examples = eval_dataset.make_one_shot_iterator().get_next() eval_idxs = [] @@ -243,7 +243,7 @@ def example_len(ex): self.problem, self.filepatterns[0], 32, - mode=tf.contrib.learn.ModeKeys.EVAL) + mode=tf.estimator.ModeKeys.EVAL) dataset = data_reader.bucket_by_sequence_length( dataset, example_len, boundaries, batch_sizes, window_size) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 3f00c25a9..ea1a5fa01 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -36,17 +36,20 @@ FLAGS = tf.flags.FLAGS - -def _decode_from_dataset_log_results(inputs, - targets, - outputs, - problem_name, - prediction_idx, - inputs_vocab, - targets_vocab, - save_images=False, - model_dir=None, - identity_output=False): +# Number of samples to draw for an image input (in such cases as captioning) +IMAGE_DECODE_LENGTH = 100 + + +def log_decode_results(inputs, + outputs, + problem_name, + prediction_idx, + inputs_vocab, + targets_vocab, + targets=None, + save_images=False, + model_dir=None, + identity_output=False): """Log inference results.""" if "image" in problem_name and save_images: save_path = os.path.join(model_dir, "%s_prediction_%d.jpg" % @@ -56,17 +59,21 @@ def _decode_from_dataset_log_results(inputs, decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) tf.logging.info("Inference results INPUT: %s" % decoded_inputs) + decoded_targets = None if identity_output: decoded_outputs = "".join(map(str, outputs.flatten())) - decoded_targets = "".join(map(str, targets.flatten())) + if targets is not None: + decoded_targets = "".join(map(str, targets.flatten())) else: decoded_outputs = "".join( map(str, targets_vocab.decode(_save_until_eos(outputs.flatten())))) - decoded_targets = "".join( - map(str, targets_vocab.decode(_save_until_eos(targets.flatten())))) + if targets is not None: + decoded_targets = "".join( + map(str, targets_vocab.decode(_save_until_eos(targets.flatten())))) tf.logging.info("Inference results OUTPUT: 
%s" % decoded_outputs) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) + if targets is not None: + tf.logging.info("Inference results TARGET: %s" % decoded_targets) return decoded_outputs, decoded_targets @@ -80,22 +87,22 @@ def decode_from_dataset(estimator, identity_output=False): tf.logging.info("Performing local inference from dataset for %s.", str(problem_names)) - hparams = estimator.hparams + hparams = estimator.params for problem_idx, problem_name in enumerate(problem_names): # Build the inference input function infer_problems_data = data_reader.get_data_filepatterns( - problem_name, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + problem_name, hparams.data_dir, tf.estimator.ModeKeys.PREDICT) infer_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, + mode=tf.estimator.ModeKeys.PREDICT, hparams=hparams, data_file_patterns=infer_problems_data, num_datashards=devices.data_parallelism().n, fixed_problem=problem_idx) # Get the predictions as an iterable - predictions = estimator.predict(input_fn=infer_input_fn, as_iterable=True) + predictions = estimator.predict(infer_input_fn) # Prepare output file writers if decode_to_file passed if decode_to_file: @@ -119,16 +126,30 @@ def decode_from_dataset(estimator, output_beams = np.split(outputs, beam_size, axis=0) for i, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % i) - decoded = _decode_from_dataset_log_results( - inputs, targets, beam, problem_name, num_predictions, - inputs_vocab, targets_vocab, save_images, estimator.model_dir, - identity_output) + decoded = log_decode_results( + inputs, + beam, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=save_images, + model_dir=estimator.model_dir, + identity_output=identity_output, + targets=targets) decoded_outputs.append(decoded) else: - decoded = _decode_from_dataset_log_results( - inputs, targets, outputs, problem_name, num_predictions, - inputs_vocab, targets_vocab, save_images, estimator.model_dir, - identity_output) + decoded = log_decode_results( + inputs, + outputs, + problem_name, + num_predictions, + inputs_vocab, + targets_vocab, + save_images=save_images, + model_dir=estimator.model_dir, + identity_output=identity_output, + targets=targets) decoded_outputs.append(decoded) # Write out predictions if decode_to_file passed @@ -149,43 +170,40 @@ def decode_from_dataset(estimator, def decode_from_file(estimator, filename): """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams + hparams = estimator.params problem_id = FLAGS.decode_problem_id inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] targets_vocab = hparams.problems[problem_id].vocabulary["targets"] + problem_name = FLAGS.problems.split("-")[problem_id] tf.logging.info("Performing decoding from a file.") sorted_inputs, sorted_keys = _get_sorted_inputs(filename) num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - 
_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) + def input_fn(): + input_gen = _decode_batch_input_fn(problem_id, num_decode_batches, + sorted_inputs, inputs_vocab) + gen_fn = make_input_fn_from_generator(input_gen) + example = gen_fn() + return _decode_input_tensor_to_features_dict(example, hparams) - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) + decodes = [] + result_iter = estimator.predict(input_fn) + for result in result_iter: + if FLAGS.decode_return_beams: + beam_decodes = [] + output_beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(output_beams): + tf.logging.info("BEAM %d:" % k) + decoded_outputs, _ = log_decode_results(result["inputs"], beam, + problem_name, None, + inputs_vocab, targets_vocab) + beam_decodes.append(decoded_outputs) + decodes.append("\t".join(beam_decodes)) + else: + decoded_outputs, _ = log_decode_results(result["inputs"], + result["outputs"], problem_name, + None, inputs_vocab, targets_vocab) + decodes.append(decoded_outputs) # Reversing the decoded inputs and outputs because they were reversed in # _decode_batch_input_fn @@ -210,33 +228,63 @@ def log_fn(inputs, outputs): outfile.write("%s\n" % (decodes[sorted_keys[index]])) -def decode_interactively(estimator): - hparams = estimator.hparams +def make_input_fn_from_generator(gen): + """Use py_func to yield elements from the given generator.""" + first_ex = six.next(gen) + flattened = tf.contrib.framework.nest.flatten(first_ex) + types = [t.dtype for t in flattened] + shapes = [[None] * len(t.shape) for t in flattened] + first_ex_list = [first_ex] + + def py_func(): + if first_ex_list: + example = first_ex_list.pop() + else: + example = six.next(gen) + return tf.contrib.framework.nest.flatten(example) - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: + def input_fn(): + flat_example = tf.py_func(py_func, [], types) + _ = [t.set_shape(shape) for t, shape in zip(flat_example, shapes)] + example = tf.contrib.framework.nest.pack_sequence_as(first_ex, flat_example) + return example + + return input_fn + + +def decode_interactively(estimator): + """Interactive decoding.""" + hparams = estimator.params + + def input_fn(): + gen_fn = make_input_fn_from_generator(_interactive_input_fn(hparams)) + example = gen_fn() + example = _interactive_input_tensor_to_features_dict(example, hparams) + return example + + result_iter = estimator.predict(input_fn) + for result in result_iter: + problem_idx = result["problem_choice"] targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - 
tf.logging.info(beam_string) - else: - if FLAGS.identity_output: - tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + + if FLAGS.decode_return_beams: + beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) + scores = None + if "scores" in result: + scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) + for k, beam in enumerate(beams): + tf.logging.info("BEAM %d:" % k) + beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) + if scores is not None: + tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) else: - tf.logging.info( - targets_vocab.decode( - _save_until_eos(result["outputs"].flatten()))) + tf.logging.info(beam_string) + else: + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info( + targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -264,9 +312,10 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, assert len(input_ids) <= batch_length x = input_ids + [0] * (batch_length - len(input_ids)) final_batch_inputs.append(x) + yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) + "inputs": np.array(final_batch_inputs).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32), } @@ -277,7 +326,7 @@ def _interactive_input_fn(hparams): whole graph, then we are stuck encoding all of the input as one fixed-size numpy array. - We yield int64 arrays with shape [const_array_size]. The format is: + We yield int32 arrays with shape [const_array_size]. The format is: [num_samples, decode_length, len(input ids), , ] Args: @@ -288,7 +337,7 @@ def _interactive_input_fn(hparams): Raises: Exception: when `input_type` is invalid. 
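`make_input_fn_from_generator`, introduced a few hunks up, is what lets `tf.estimator.Estimator.predict` consume a plain Python generator: the first element is drawn eagerly to discover the structure, dtypes, and ranks, and a `tf.py_func` then feeds subsequent elements into the graph. A stripped-down sketch of the same idea for a generator of single numpy arrays, assuming TF 1.x:

```python
import tensorflow as tf

def input_fn_from_generator(gen, dtype=tf.int32):
  """Wraps a generator of numpy arrays as an estimator-style input_fn."""
  first = next(gen)
  buffered = [first]  # replay the element consumed for inspection

  def py_func():
    return buffered.pop() if buffered else next(gen)

  def input_fn():
    tensor = tf.py_func(py_func, [], dtype)
    # Rank is known from the first element; dimensions stay dynamic.
    tensor.set_shape([None] * len(first.shape))
    return tensor

  return input_fn
```

The `tf.contrib.framework.nest` calls in the real helper generalize this to arbitrarily nested dicts of features, which is why the patch's version flattens and re-packs each example.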
""" - num_samples = 3 + num_samples = 1 decode_length = 100 input_type = "text" problem_id = 0 @@ -304,12 +353,13 @@ def _interactive_input_fn(hparams): pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image' or 'label')\n" - " pr= (set the problem number)\n" + " it= ('text' or 'image' or 'label', default: " + "text)\n" + " pr= (set the problem number, default: 0)\n" " in= (set the input problem number)\n" " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode length)\n" + " ns= (changes number of samples, default: 1)\n" + " dl= (changes decode length, default: 100)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" @@ -344,23 +394,23 @@ def _interactive_input_fn(hparams): x = [num_samples, decode_length, len(input_ids)] + input_ids assert len(x) < const_array_size x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) + yield { + "inputs": np.array(x).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } elif input_type == "image": input_path = input_string img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) + yield { + "inputs": img.astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } elif input_type == "label": input_ids = [int(input_string)] x = [num_samples, decode_length, len(input_ids)] + input_ids - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) + yield { + "inputs": np.array(x).astype(np.int32), + "problem_choice": np.array(problem_id).astype(np.int32) } else: raise Exception("Unsupported input type.") @@ -423,3 +473,85 @@ def _save_until_eos(hyp): except ValueError: # No EOS_ID: return the array as-is. return hyp + + +def _interactive_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. + hparams: model hyperparameters + + Returns: + a features dictionary, as expected by the decoder. + """ + inputs = tf.convert_to_tensor(feature_map["inputs"]) + input_is_image = False if len(inputs.get_shape()) < 3 else True + + def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring + if input_is_image: + x = tf.image.resize_images(x, [299, 299]) + x = tf.reshape(x, [1, 299, 299, -1]) + x = tf.to_int32(x) + else: + # Remove the batch dimension. + num_samples = x[0] + length = x[2] + x = tf.slice(x, [3], tf.to_int32([length])) + x = tf.reshape(x, [1, -1, 1, 1]) + # Transform into a batch of size num_samples to get that many random + # decodes. 
+ x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) + + p_hparams = hparams.problems[problem_choice] + return (tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id), x) + + input_space_id, target_space_id, x = input_fn_builder.cond_on_index( + input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) + + features = {} + features["problem_choice"] = tf.convert_to_tensor( + feature_map["problem_choice"]) + features["input_space_id"] = input_space_id + features["target_space_id"] = target_space_id + features["decode_length"] = (IMAGE_DECODE_LENGTH + if input_is_image else inputs[1]) + features["inputs"] = x + return features + + +def _decode_input_tensor_to_features_dict(feature_map, hparams): + """Convert the interactive input format (see above) to a dictionary. + + Args: + feature_map: a dictionary with keys `problem_choice` and `input` containing + Tensors. + hparams: model hyperparameters + + Returns: + a features dictionary, as expected by the decoder. + """ + inputs = tf.convert_to_tensor(feature_map["inputs"]) + input_is_image = False + + def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring + p_hparams = hparams.problems[problem_choice] + # Add a third empty dimension dimension + x = tf.expand_dims(x, axis=[2]) + x = tf.to_int32(x) + return (tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id), x) + + input_space_id, target_space_id, x = input_fn_builder.cond_on_index( + input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) + + features = {} + features["problem_choice"] = feature_map["problem_choice"] + features["input_space_id"] = input_space_id + features["target_space_id"] = target_space_id + features["decode_length"] = (IMAGE_DECODE_LENGTH + if input_is_image else tf.shape(x)[1] + 50) + features["inputs"] = x + return features diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py index bef95d58f..abec8d4ad 100644 --- a/tensor2tensor/utils/input_fn_builder.py +++ b/tensor2tensor/utils/input_fn_builder.py @@ -47,7 +47,7 @@ def build_input_fn(mode, evaluation, and testing prediction. Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + mode: The execution mode, as defined in tf.estimator.ModeKeys. hparams: HParams object. data_file_patterns: The list of file patterns to use to read in data. Set to `None` if you want to create a placeholder for the input data. The @@ -98,7 +98,7 @@ def input_fn(): data_reader.hparams_to_batching_scheme( hparams, shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or hparams.eval_drop_long_sequences), length_multiplier=(p_hparams.batch_size_multiplier))) @@ -137,7 +137,7 @@ def input_fn(): trainable=False)) if fixed_problem is None: if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): + mode != tf.estimator.ModeKeys.TRAIN): problem_choice = tf.random_uniform( [], maxval=problem_count, dtype=tf.int32) elif hparams.problem_choice == "adaptive": @@ -169,7 +169,7 @@ def input_fn(): inp_id.set_shape([]) tgt_id.set_shape([]) # Forced shape obfuscation is necessary for inference. 
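Both `*_to_features_dict` helpers above dispatch on the runtime value of `problem_choice` through `input_fn_builder.cond_on_index`. Its implementation is not shown in this patch; judging from the call sites, it behaves like a chained `tf.cond` over the index range, roughly as follows (an assumption about the helper's behavior, not the library's code):

```python
import tensorflow as tf

def cond_on_index(fn, index_tensor, cur_idx, max_idx):
  """Evaluates fn(i) for the runtime value i of index_tensor by
  chaining tf.cond over cur_idx..max_idx (sketch of assumed behavior)."""
  if cur_idx == max_idx:
    return fn(cur_idx)
  return tf.cond(
      tf.equal(index_tensor, cur_idx),
      lambda: fn(cur_idx),
      lambda: cond_on_index(fn, index_tensor, cur_idx + 1, max_idx))
```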
- if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access @@ -180,15 +180,14 @@ def input_fn(): "input_space_id": inp_id, "target_space_id": tgt_id } - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: rand_feature_map["infer_targets"] = rand_target rand_target = None - # This is because of a bug in the tf.contrib.learn Estimator that - # short-circuits prediction if it doesn't see a QueueRunner. - # DummyQueueRunner implements the minimal expected interface but does - # nothing. - # TODO(rsepassi): Remove once we move to core Estimator. + # This is because of a bug in the Estimator that short-circuits prediction + # if it doesn't see a QueueRunner. DummyQueueRunner implements the + # minimal expected interface but does nothing. tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner()) + return rand_feature_map, rand_target return input_fn diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index baff66669..5bfad5338 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -20,8 +20,6 @@ # Dependency imports -import six - from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook from tensor2tensor.utils import rouge @@ -197,6 +195,8 @@ def create_evaluation_metrics(problems, model_hparams): model_hparams: a set of hparams. Returns: + Dict . The metric functions have signature + (predictions, labels, problem_choice) -> (metric Tensor, update op). A dictionary with keys that are strings naming the evaluation metrics and values that are functions taking arguments of (predictions, targets), returning a tuple of a tensor of the @@ -210,8 +210,7 @@ def create_evaluation_metrics(problems, model_hparams): def make_problem_specific_metric_fn(metric_fn, problem_idx, weights_fn): """Create a metric fn conditioned on problem_idx.""" - def problem_metric_fn(predictions, labels, weights): - problem_choice = weights + def problem_metric_fn(predictions, labels, problem_choice): (scores, weights) = tf.cond( tf.equal(problem_idx, problem_choice), lambda: metric_fn(predictions, labels, weights_fn=weights_fn), @@ -258,11 +257,7 @@ def problem_metric_fn(predictions, labels, weights): metric_fn, problem_idx, weights_fn) eval_metrics["metrics-%s/%s" % (problem_name, metric)] = problem_metric_fn - return { - k: tf.contrib.learn.MetricSpec( - v, prediction_key="predictions", weight_key="problem_choice") - for (k, v) in six.iteritems(eval_metrics) - } + return eval_metrics # Metrics are functions that take predictions and labels and return diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 34af6c827..21ef96b28 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -33,6 +33,7 @@ from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import devices from tensor2tensor.utils import input_fn_builder +from tensor2tensor.utils import metrics from tensor2tensor.utils import registry from tensor2tensor.utils import yellowfin @@ -42,9 +43,6 @@ # TODO(rsepassi): Rm dep on FLAGS here FLAGS = tf.flags.FLAGS -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 - def log_variable_sizes(var_list, tag): 
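The `metrics.py` hunk above drops the `tf.contrib.learn.MetricSpec` wrapping: under `tf.estimator`, the evaluation metrics are plain functions with signature `(predictions, labels, problem_choice) -> (metric Tensor, update op)`, and the per-problem gating happens inside via `tf.cond`. A minimal sketch of that conditioning pattern, with a hypothetical `metric_fn` returning `(scores, weights)`:

```python
import tensorflow as tf

def make_problem_metric_fn(metric_fn, problem_idx):
  """Only scores batches whose problem_choice equals problem_idx;
  other batches contribute zero weight to the running mean."""
  def problem_metric_fn(predictions, labels, problem_choice):
    scores, weights = tf.cond(
        tf.equal(problem_idx, problem_choice),
        lambda: metric_fn(predictions, labels),
        lambda: (tf.constant(0.0), tf.constant(0.0)))
    return tf.metrics.mean(scores, weights)
  return problem_metric_fn
```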
"""Log the sizes and shapes of variables, and the total size. @@ -64,90 +62,30 @@ def log_variable_sizes(var_list, tag): tf.logging.info("%s Total size: %d", tag, total_size) -def build_model_fn(model, hparams): +def build_model_fn(model): """Returns a function to build the model. Args: model: The name of the model to use. - hparams: The hyperparameters. Returns: A function to build the model's graph. This function is called by the Estimator object to construct the graph. """ - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - elif hparams.learning_rate_decay_scheme == "cyclelinear10x": - # Cycle the rate linearly by 10x every warmup_steps, up and down. - cycle_steps = hparams.learning_rate_warmup_steps - cycle_position = step % (2 * cycle_steps) - cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. - cycle_position - cycle_steps) / float(cycle_steps) - cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. - return (cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): + def model_fn(features, labels, mode, params): """Creates the prediction, loss, and train ops. Args: features: A dictionary of tensors keyed by the feature name. 
- targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + labels: A tensor representing the labels. + mode: The execution mode, as defined in tf.estimator.ModeKeys. + params: model HParams. Returns: - A tuple consisting of the prediction, loss, and train_op. + An EstimatorSpec. """ + hparams = params # Deep-copy the model hparams between modes to eliminate # side-effects caused by abuse of the linked problem_hparams # objects which are used to share modality objects between @@ -159,19 +97,76 @@ def model_fn(features, targets, mode): # could be created once per mode and passed to the constructor of # t2t_model. my_hp = copy.deepcopy(hparams) - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, my_hp) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, my_hp) - if targets is not None: - features["targets"] = targets + def initializer(): + if hparams.initializer == "orthogonal": + return tf.orthogonal_initializer(gain=hparams.initializer_gain) + elif hparams.initializer == "uniform": + max_val = 0.1 * hparams.initializer_gain + return tf.random_uniform_initializer(-max_val, max_val) + elif hparams.initializer == "normal_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="normal") + elif hparams.initializer == "uniform_unit_scaling": + return init_ops.variance_scaling_initializer( + hparams.initializer_gain, mode="fan_avg", distribution="uniform") + else: + raise ValueError("Unrecognized initializer: %s" % hparams.initializer) + + def learning_rate_decay(): + """Inverse-decay learning rate until warmup_steps, then decay.""" + warmup_steps = tf.to_float( + hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) + step = tf.to_float(tf.contrib.framework.get_global_step()) + if hparams.learning_rate_decay_scheme == "noam": + return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( + (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) + elif hparams.learning_rate_decay_scheme == "exp100k": + return 0.94**(step // 100000) + elif hparams.learning_rate_decay_scheme == "cosine": + cycle_steps = hparams.learning_rate_cosine_cycle_steps + return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) + elif hparams.learning_rate_decay_scheme == "cyclelinear10x": + # Cycle the rate linearly by 10x every warmup_steps, up and down. + cycle_steps = hparams.learning_rate_warmup_steps + cycle_position = step % (2 * cycle_steps) + cycle_position = tf.to_float( # Normalize to the interval [-1, 1]. + cycle_position - cycle_steps) / float(cycle_steps) + cycle_position = 1.0 - tf.abs(cycle_position) # 0 to 1 and back to 0. + return ( + cycle_position + 0.1) * 3.0 # 10x difference each cycle (0.3-3). 
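Of the decay schemes listed above, "noam" is the one the Transformer configs use: linear warmup, then decay as the inverse square root of the step. The same formula in NumPy, for inspection:

```python
import numpy as np

def noam_lr(step, hidden_size=512, warmup_steps=4000):
  """The "noam" schedule: linear warmup, then ~step**-0.5 decay."""
  return 5000.0 * hidden_size**-0.5 * min(
      (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5)

# The two branches meet at step + 1 == warmup_steps, the schedule's peak:
# noam_lr(3999) == 5000 / sqrt(512) / sqrt(4000) ~= 3.49
```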
+ + inv_base = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = inv_base**(warmup_steps - step) + if hparams.learning_rate_decay_scheme == "sqrt": + decay = _sqrt_decay(step - warmup_steps) + elif hparams.learning_rate_decay_scheme == "exp10k": + decay = _exp_decay_after(step - warmup_steps, 0.9995, + FLAGS.train_steps - warmup_steps - 10000) + elif hparams.learning_rate_decay_scheme == "exp50k": + decay = _exp_decay_after(step - warmup_steps, 0.99995, + FLAGS.train_steps - warmup_steps - 50000) + elif hparams.learning_rate_decay_scheme == "exp500k": + decay = _exp_decay_after(step - warmup_steps, 0.9999955, + FLAGS.train_steps - warmup_steps - 500000) + elif hparams.learning_rate_decay_scheme == "none": + decay = tf.constant(1.0) + else: + raise ValueError("Unrecognized learning rate decay scheme: %s" % + hparams.learning_rate_decay_scheme) + return tf.cond( + step < warmup_steps, + lambda: inv_decay, + lambda: decay, + name="learning_rate_decay_warump_cond") + + if labels is not None: + features["targets"] = labels dp = devices.data_parallelism() tf.get_variable_scope().set_initializer(initializer()) - is_training = mode == tf.contrib.learn.ModeKeys.TRAIN + is_training = mode == tf.estimator.ModeKeys.TRAIN # Add input statistics for incoming features. with tf.name_scope("input_stats"): @@ -218,7 +213,7 @@ def nth_model(n): n, dp, devices.ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: return model_class.infer( features, beam_size=FLAGS.decode_beam_size, @@ -235,7 +230,7 @@ def nth_model(n): # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) if (FLAGS.eval_run_autoregressive and - mode == tf.contrib.learn.ModeKeys.EVAL): + mode == tf.estimator.ModeKeys.EVAL): sharded_logits, losses_dict = model_class.eval_autoregressive(features) else: sharded_logits, losses_dict = model_class.model_fn( @@ -272,36 +267,50 @@ def nth_model(n): features["problem_choice"], 0, len(my_hp.problems) - 1) - if mode == tf.contrib.learn.ModeKeys.INFER: + if mode == tf.estimator.ModeKeys.PREDICT: # Beam search in sequence model returns both decodes withe key "outputs" # and scores with they key "scores". If return list is a dict, we expect # that it will have keys "outputs", a tensor of int32 and scores, a # tensor of floats. This is useful if we want to return scores from # estimator.predict if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None + predictions = {"outputs": result_list} else: - ret = { + predictions = { "outputs": result_list["outputs"], "scores": result_list["scores"] - }, None, None + } + if "inputs" in features: - ret[0]["inputs"] = features["inputs"] + predictions["inputs"] = features["inputs"] if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret + predictions["targets"] = features["infer_targets"] + predictions["problem_choice"] = (features["problem_choice"] * tf.ones( + (tf.shape(features["inputs"])[0],), dtype=tf.int32)) + + return tf.estimator.EstimatorSpec(mode, predictions=predictions) sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: + if mode == tf.estimator.ModeKeys.EVAL: # For evaluation, return the logits layer as our predictions. 
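The fallback branch above wraps every non-"noam" scheme in the same warmup: an exponential ramp from 1% of the base rate up to 100% over `warmup_steps`, after which the chosen decay takes over. The ramp in NumPy (same arithmetic as the `inv_base`/`inv_decay` lines above):

```python
import numpy as np

def warmup_then_decay(step, warmup_steps, decay_fn):
  """Exponential warmup from 0.01x to 1x, then hand off to decay_fn."""
  inv_base = np.exp(np.log(0.01) / warmup_steps)  # 0.01 ** (1 / warmup_steps)
  if step < warmup_steps:
    return inv_base ** (warmup_steps - step)  # 0.01 at step 0, 1.0 at warmup
  return decay_fn(step - warmup_steps)
```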
logits = tf.concat(sharded_logits, 0) - ret = { - "predictions": logits, - "problem_choice": features["problem_choice"], - } - return ret, total_loss, None - assert mode == tf.contrib.learn.ModeKeys.TRAIN + eval_metrics_fns = metrics.create_evaluation_metrics( + zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) + _check_autotune_metrics(eval_metrics_fns) + + eval_metrics = {} + for metric_name, metric_fn in six.iteritems(eval_metrics_fns): + eval_metrics[metric_name] = metric_fn(logits, labels, + features["problem_choice"]) + + return tf.estimator.EstimatorSpec( + mode, + predictions={"predictions": logits}, + eval_metric_ops=eval_metrics, + loss=total_loss) + + assert mode == tf.estimator.ModeKeys.TRAIN # Some training statistics. with tf.name_scope("training_stats"): @@ -381,7 +390,11 @@ def nth_model(n): del summaries[i] tf.logging.info("Global model_fn finished.") - return {"problem_choice": features["problem_choice"]}, total_loss, train_op + return tf.estimator.EstimatorSpec( + mode, + predictions={"problem_choice": features["problem_choice"]}, + loss=total_loss, + train_op=train_op) return model_fn @@ -431,81 +444,8 @@ def _exp_decay_after(step, rate, from_which_step): name="exponential_decay_step_cond") -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = input_fn_builder.cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. 
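The shape of the new contract, independent of T2T's sharding and multi-problem machinery: a `model_fn(features, labels, mode, params)` returns a single `tf.estimator.EstimatorSpec` whose fields depend on the mode. A minimal self-contained sketch (the `params["num_classes"]` key is illustrative, not a T2T hparam):

```python
import tensorflow as tf

def model_fn(features, labels, mode, params):
  logits = tf.layers.dense(tf.to_float(features["inputs"]),
                           params["num_classes"])
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {"outputs": tf.argmax(logits, axis=-1)}
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics = {"accuracy": tf.metrics.accuracy(
        labels=labels, predictions=tf.argmax(logits, axis=-1))}
    return tf.estimator.EstimatorSpec(mode, loss=loss,
                                      eval_metric_ops=eval_metrics)

  assert mode == tf.estimator.ModeKeys.TRAIN
  train_op = tf.train.AdamOptimizer().minimize(
      loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
```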
- """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = input_fn_builder.cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features +def _check_autotune_metrics(metrics_dict): + if (hasattr(FLAGS, "autotune") and FLAGS.autotune and + FLAGS.objective not in metrics_dict): + raise ValueError("Tuning objective %s not among evaluation metrics %s" % + (FLAGS.objective, metrics_dict.keys())) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index d3fc6dac1..32627f7e3 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -69,7 +69,7 @@ def __init__(self, Args: hparams: a hyperparameters object. - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. + mode: The execution mode, as defined in tf.estimator.ModeKeys. problem_hparams: a hyperparameters object. problem_idx: an integer. data_parallelism: a expert_utils.parallelism @@ -86,7 +86,7 @@ def __init__(self, hparams = copy.copy(hparams) hparams.add_hparam("mode", mode) # When not in training mode, set all forms of dropout to zero. 
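The dropout-zeroing comment above documents a naming convention rather than an API: any hparam whose name ends in "dropout" is forced to 0.0 outside of training. In plain Python, the rule is (a sketch over a dict stand-in for HParams):

```python
def zero_dropout(hparams_dict, is_training):
  """Returns a copy with every *dropout hparam set to 0.0 unless training."""
  if is_training:
    return dict(hparams_dict)
  return {k: (0.0 if k.endswith("dropout") else v)
          for k, v in hparams_dict.items()}

# e.g. zero_dropout({"attention_dropout": 0.1, "hidden_size": 512}, False)
#      -> {"attention_dropout": 0.0, "hidden_size": 512}
```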
- if mode != tf.contrib.learn.ModeKeys.TRAIN: + if mode != tf.estimator.ModeKeys.TRAIN: for key in hparams.values(): if key[-len("dropout"):] == "dropout": setattr(hparams, key, 0.0) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 3248d9ca9..83db7c007 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -30,7 +30,6 @@ from tensor2tensor.utils import data_reader from tensor2tensor.utils import devices from tensor2tensor.utils import input_fn_builder -from tensor2tensor.utils import metrics from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry @@ -155,12 +154,6 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, output_dir=output_dir, data_dir=data_dir, model_name=model_name) - eval_metrics = metrics.create_evaluation_metrics( - zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) - if (hasattr(FLAGS, "autotune") and FLAGS.autotune and - FLAGS.objective not in eval_metrics): - raise ValueError("Tuning objective %s not among evaluation metrics %s" % - (FLAGS.objective, eval_metrics.keys())) train_monitors = [] eval_hooks = [] if FLAGS.tfdbg: @@ -169,9 +162,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], - eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], - eval_metrics=eval_metrics, + train_input_fn=input_fns[tf.estimator.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.estimator.ModeKeys.EVAL], train_steps=train_steps, eval_steps=eval_steps, min_eval_frequency=FLAGS.local_eval_frequency, @@ -185,39 +177,37 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): num_datashards = devices.data_parallelism().n train_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.TRAIN, + mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, - tf.contrib.learn.ModeKeys.TRAIN), + tf.estimator.ModeKeys.TRAIN), num_datashards=num_datashards, worker_replicas=FLAGS.worker_replicas, worker_id=FLAGS.worker_id) eval_input_fn = input_fn_builder.build_input_fn( - mode=tf.contrib.learn.ModeKeys.EVAL, + mode=tf.estimator.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, - tf.contrib.learn.ModeKeys.EVAL), + tf.estimator.ModeKeys.EVAL), num_datashards=num_datashards, worker_replicas=FLAGS.worker_replicas, worker_id=FLAGS.worker_id) - estimator = tf.contrib.learn.Estimator( - model_fn=model_builder.build_model_fn(model_name, hparams), + estimator = tf.estimator.Estimator( + model_fn=model_builder.build_model_fn(model_name), model_dir=output_dir, + params=hparams, config=tf.contrib.learn.RunConfig( master=FLAGS.master, - model_dir=output_dir, gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction, session_config=session_config(), keep_checkpoint_max=FLAGS.keep_checkpoint_max, keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours, save_checkpoints_secs=FLAGS.save_checkpoints_secs)) - # Store the hparams in the estimator as well - estimator.hparams = hparams return estimator, { - tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, - tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + tf.estimator.ModeKeys.TRAIN: train_input_fn, + tf.estimator.ModeKeys.EVAL: eval_input_fn } @@ -330,9 +320,15 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if 
schedule == "local_run": # Run the local demo. exp = exp_fn(output_dir) - if exp.train_steps > 0 or exp.eval_steps > 0: + if exp.train_steps > 0 and exp.eval_steps > 0: tf.logging.info("Performing local training and evaluation.") exp.train_and_evaluate() + elif exp.train_steps > 0: + tf.logging.info("Performing local training.") + exp.train() + elif exp.eval_steps > 0: + tf.logging.info("Performing local evaluation.") + exp.evaluate(delay_secs=0) else: # Perform distributed training/evaluation. learn_runner.run( diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index 6cc654d26..e71fc16c2 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -106,7 +106,7 @@ def testSingleEvalStepRawSession(self): encoders = registry.problem(FLAGS.problems).feature_encoders(data_dir) hparams = trainer_utils.create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir) - model_fn = model_builder.build_model_fn(model_name, hparams) + model_fn = model_builder.build_model_fn(model_name) inputs_ph = tf.placeholder(dtype=tf.int32) # Just length dimension. batch_inputs = tf.reshape(inputs_ph, [1, -1, 1, 1]) # Make it 4D. targets_ph = tf.placeholder(dtype=tf.int32) # Just length dimension. @@ -117,9 +117,10 @@ def testSingleEvalStepRawSession(self): "target_space_id": hparams.problems[0].target_space_id} # Now set a mode and create the graph by invoking model_fn. - mode = tf.contrib.learn.ModeKeys.EVAL - predictions_dict, _, _ = model_fn( # In INFER mode targets can be None. - features, batch_targets, mode) + mode = tf.estimator.ModeKeys.EVAL + estimator_spec = model_fn( # In INFER mode targets can be None. + features, batch_targets, mode, hparams) + predictions_dict = estimator_spec.predictions predictions = tf.squeeze( # These are not images, axis=2,3 are not needed. 
predictions_dict["predictions"], axis=[2, 3]) diff --git a/tensor2tensor/visualization/TransformerVisualization.ipynb b/tensor2tensor/visualization/TransformerVisualization.ipynb index e3fb8f958..166e0c9c5 100644 --- a/tensor2tensor/visualization/TransformerVisualization.ipynb +++ b/tensor2tensor/visualization/TransformerVisualization.ipynb @@ -127,9 +127,9 @@ "num_datashards = utils.devices.data_parallelism().n\n", "\n", "problems_data = utils.get_data_filepatterns(\n", - " DATA_DIR, tf.contrib.learn.ModeKeys.EVAL)\n", + " DATA_DIR, tf.estimator.ModeKeys.EVAL)\n", "input_fn = utils.input_fn_builder.build_input_fn(\n", - " mode=tf.contrib.learn.ModeKeys.EVAL,\n", + " mode=tf.estimator.ModeKeys.EVAL,\n", " hparams=hparams,\n", " data_file_patterns=problems_data,\n", " num_datashards=num_datashards)\n", @@ -192,8 +192,9 @@ } ], "source": [ - "model_fn=utils.model_builder.build_model_fn(MODEL, hparams=hparams)\n", - "sharded_logits, training_loss, extra_loss = model_fn(features, target, tf.contrib.learn.ModeKeys.EVAL)" + "model_fn=utils.model_builder.build_model_fn(MODEL)\n", + "spec = model_fn(features, target, tf.estimator.ModeKeys.EVAL, hparams)\n", + "predictions_dict = spec.predictions", ] }, { @@ -215,7 +216,8 @@ ], "source": [ "with tf.variable_scope(tf.get_variable_scope(), reuse=True):\n", - " beam_out = model_fn(features, target, tf.contrib.learn.ModeKeys.INFER)" + " spec = model_fn(features, target, tf.estimator.ModeKeys.PREDICT, hparams)\n", + " beam_out = spec.predictions['outputs']", ] }, { @@ -324,7 +326,7 @@ } ], "source": [ - "inp, out, logits = sess.run([inputs['inputs'], target, sharded_logits['predictions']])\n", + "inp, out, logits = sess.run([inputs['inputs'], target, predictions_dict['predictions']])\n", "\n", "print(\"Input: \", decode(inp[0]))\n", "print(\"Gold: \", decode(out[0]))\n", @@ -366,7 +368,7 @@ ], "source": [ "inp_ids = encode(eng)\n", - "beam_decode = sess.run(beam_out[0]['outputs'], {\n", + "beam_decode = sess.run(beam_out, {\n", " inputs['inputs']: np.expand_dims(np.expand_dims(inp_ids, axis=2), axis=3),\n", "})\n", "trans = decode(beam_decode[0])\n", From cb181de23926052a042ee5e6fa9bda0d21dc8f23 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 18:55:59 -0700 Subject: [PATCH 20/32] Enable passing padded_shapes in padded_batch; log_device_placement FLAG PiperOrigin-RevId: 167805495 --- tensor2tensor/models/transformer.py | 17 ++++++---- tensor2tensor/utils/data_reader.py | 44 ++++++++++++++++++------- tensor2tensor/utils/data_reader_test.py | 2 +- tensor2tensor/utils/trainer_utils.py | 5 ++- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 38766ec19..d3a406a29 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -191,10 +191,13 @@ def transformer_encoder(encoder_input, """ x = encoder_input with tf.variable_scope(name): - pad_remover = expert_utils.PadRemover( - common_attention.attention_bias_to_padding(encoder_self_attention_bias)) - for layer in xrange( - hparams.num_encoder_layers or hparams.num_hidden_layers): + pad_remover = None + if hparams.use_pad_remover: + pad_remover = expert_utils.PadRemover( + common_attention.attention_bias_to_padding( + encoder_self_attention_bias)) + for layer in xrange(hparams.num_encoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -237,8 +240,8 
@@ def transformer_decoder(decoder_input, """ x = decoder_input with tf.variable_scope(name): - for layer in xrange( - hparams.num_decoder_layers or hparams.num_hidden_layers): + for layer in xrange(hparams.num_decoder_layers or + hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( @@ -362,6 +365,8 @@ def transformer_base(): hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", int(False)) + hparams.add_hparam("use_pad_remover", int(True)) + return hparams diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index f48665078..e89b9b808 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -267,11 +267,23 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams, lambda ex: _preprocess(ex, problem, data_file_pattern, hparams, mode), num_threads=num_threads) dataset = dataset.filter( - lambda ex: _example_too_big(ex, batching_scheme["max_length"])) + lambda ex: example_valid_size(ex, batching_scheme["max_length"])) + + bucket_id_fn = _example_length + if len(batching_scheme["boundaries"]) == 1: + bucket_id_fn = lambda _: tf.constant(0) + + if "padded_shapes" not in batching_scheme: + batching_scheme["padded_shapes"] = None dataset = bucket_by_sequence_length( - dataset, _example_length, batching_scheme["boundaries"], - batching_scheme["batch_sizes"], batching_scheme["window_size"]) + dataset, + bucket_id_fn, + batching_scheme["boundaries"], + batching_scheme["batch_sizes"], + batching_scheme["window_size"], + padded_shapes=batching_scheme["padded_shapes"]) + # We reshuffle the batches to prevent many long-sequence batches at once. # TODO(rsepassi): Rm hasattr call once new dynamic window size functionality # is in a stable TF release. @@ -307,12 +319,16 @@ def _example_length(example): return length -def _example_too_big(example, max_length): +def example_valid_size(example, max_length): return tf.less_equal(_example_length(example), max_length) -def bucket_by_sequence_length(dataset, example_length_fn, bucket_boundaries, - bucket_batch_sizes, window_size): +def bucket_by_sequence_length(dataset, + example_length_fn, + bucket_boundaries, + bucket_batch_sizes, + window_size, + padded_shapes=None): """Bucket entries in dataset by length. Args: @@ -322,6 +338,8 @@ def bucket_by_sequence_length(dataset, example_length_fn, bucket_boundaries, bucket_boundaries: list, boundaries of the buckets. bucket_batch_sizes: list, batch size per bucket. window_size: an integer divisible by all elements of bucket_batch_sizes + padded_shapes: dict>, optional, shapes of the + features with None where feature should be padded to max in that dim. Returns: Dataset of padded and batched examples. @@ -351,12 +369,7 @@ def window_size_fn(bucket_id): def batching_fn(bucket_id, grouped_dataset): batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64) batch_size = batch_sizes[bucket_id] - - # Pad each dimension of each feature so that they match. - padded_shapes = dict( - [(name, [None] * len(shape)) - for name, shape in grouped_dataset.output_shapes.items()]) - return grouped_dataset.padded_batch(batch_size, padded_shapes) + return padded_batch(grouped_dataset, batch_size, padded_shapes) # TODO(rsepassi): Rm branch once the new group_by_window functionality is in # a stable TF release. 
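The `padded_shapes` plumbing above lets a problem pin feature dimensions instead of always padding every dimension to the batch maximum: `None` in a shape means "pad to the longest example in this batch", while an integer fixes the size. A toy usage sketch, assuming a TF release where `tf.data` is stable (earlier 1.x releases expose the same `Dataset` methods under `tf.contrib.data`):

```python
import tensorflow as tf

examples = [{"inputs": [1, 2, 3]}, {"inputs": [4]}, {"inputs": [5, 6]}]
dataset = tf.data.Dataset.from_generator(
    lambda: iter(examples), {"inputs": tf.int32}, {"inputs": [None]})
# Pad "inputs" to the longest sequence within each batch of 2:
batched = dataset.padded_batch(2, padded_shapes={"inputs": [None]})
batch = batched.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  print(sess.run(batch))  # {'inputs': [[1, 2, 3], [4, 0, 0]]}
```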
@@ -371,6 +384,13 @@ def batching_fn(bucket_id, grouped_dataset): return dataset +def padded_batch(dataset, batch_size, padded_shapes=None): + padded_shapes = padded_shapes or dict( + [(name, [None] * len(shape)) + for name, shape in dataset.output_shapes.items()]) + return dataset.padded_batch(batch_size, padded_shapes) + + def _bucket_boundaries(max_length, min_length=8, length_bucket_step=1.1): """A default set of length-bucket boundaries.""" assert min_length <= max_length diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index aed2598c7..f03ce6da2 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -158,7 +158,7 @@ def testLengthFilter(self): max_len = 15 dataset = data_reader.read_examples(self.problem, self.filepatterns[0], 32) dataset = dataset.filter( - lambda ex: data_reader._example_too_big(ex, max_len)) + lambda ex: data_reader.example_valid_size(ex, max_len)) examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: ex_lens = [] diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 83db7c007..08359ea5c 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -75,6 +75,8 @@ "Save checkpoints every this many seconds. " "Default=0 means let tensorflow.contrib.learn.python.learn" " decide, which is currently set to 600 = 10 minutes.") +flags.DEFINE_bool("log_device_placement", False, + "Whether to log device placement.") # Distributed training flags flags.DEFINE_string("master", "", "Address of TensorFlow master.") @@ -369,7 +371,8 @@ def session_config(): config = tf.ConfigProto( allow_soft_placement=True, graph_options=graph_options, - gpu_options=gpu_options) + gpu_options=gpu_options, + log_device_placement=FLAGS.log_device_placement) return config From ad57b3b2a4bd401464010778a39784644a055c9f Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 6 Sep 2017 22:18:07 -0700 Subject: [PATCH 21/32] correct transformer ranged hparams PiperOrigin-RevId: 167817267 --- tensor2tensor/models/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index d3a406a29..a2e76dd13 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -664,6 +664,6 @@ def transformer_base_range(rhp): rhp.set_discrete("learning_rate_warmup_steps", [1000, 2000, 4000, 8000, 16000]) rhp.set_float("initializer_gain", 0.5, 2.0) - rhp.set_float("optimizer_adam_beta2", 0.85, 0.95) + rhp.set_float("optimizer_adam_beta1", 0.85, 0.95) rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) rhp.set_float("weight_decay", 0.0, 2.0) From 772337a811579a32078228d43e9572ccad4a669a Mon Sep 17 00:00:00 2001 From: T2T Team Date: Thu, 7 Sep 2017 07:05:14 -0700 Subject: [PATCH 22/32] bug fix to link function for log_poisson loss PiperOrigin-RevId: 167855204 --- tensor2tensor/layers/modalities.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 1d606ec1d..8e76c8051 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -475,19 +475,16 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): @registry.register_real_modality("log_poisson_loss") class RealLogPoissonLossModality(RealL2LossModality): """Modality for real (i.e. 
float) vectors with log Poisson regression loss.
-
-  * Top is a linear projection to vocab size followed by a softplus
-    transform (log(exp(features) + 1)).
   """

-  def top(self, body_output, _):
-    with tf.variable_scope("real"):
-      return tf.nn.softplus(tf.layers.dense(body_output, self._vocab_size))
+  def bottom(self, x):
+    return x

   def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
     predictions = top_out
     with tf.name_scope("log_possion"):
       weights = weights_fn(targets)
+
       lp_loss = tf.nn.log_poisson_loss(targets, predictions)
       return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights)


From 73af26b1968efee44b41c0efaafa66bc393d2a29 Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 7 Sep 2017 11:40:57 -0700
Subject: [PATCH 23/32] change default initializer in lstm to uniform_unit_scaling

PiperOrigin-RevId: 167888817
---
 tensor2tensor/models/lstm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py
index 9f909433e..d1c3101b4 100644
--- a/tensor2tensor/models/lstm.py
+++ b/tensor2tensor/models/lstm.py
@@ -272,6 +272,7 @@ def lstm_attention():
   hparams.batch_size = 1024
   hparams.hidden_size = 128
   hparams.num_hidden_layers = 2
+  hparams.initializer = "uniform_unit_scaling"

   # Attention
   hparams.add_hparam("attn_vec_size", hparams.hidden_size)

From 5fcc9bc8d813607bc2e93d680ea6a08cacf83a2c Mon Sep 17 00:00:00 2001
From: Etienne Pot
Date: Thu, 7 Sep 2017 15:33:38 -0700
Subject: [PATCH 24/32] Experts now process each sequence individually to
 reduce the attention matrix size.

PiperOrigin-RevId: 167921622
---
 tensor2tensor/layers/common_attention.py | 82 ++++++++++++++-------
 tensor2tensor/models/attention_lm_moe.py |  7 ++
 tensor2tensor/utils/expert_utils.py      | 91 ++++++++++++++++++++++++
 3 files changed, 154 insertions(+), 26 deletions(-)

diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py
index 84f8d2d9a..3f3885b10 100644
--- a/tensor2tensor/layers/common_attention.py
+++ b/tensor2tensor/layers/common_attention.py
@@ -441,7 +441,9 @@ def dot_product_attention(q,
     weights = tf.nn.softmax(logits, name="attention_weights")
     # dropping out the attention links for each of the heads
     weights = tf.nn.dropout(weights, 1.0 - dropout_rate)
-    if not tf.get_variable_scope().reuse:
+    if (not tf.get_variable_scope().reuse and
+        # Summaries don't work well within tf.while_loop()
+        "/while/" not in tf.contrib.framework.get_name_scope()):
       attention_image_summary(weights, image_shapes)
     return tf.matmul(weights, v)

@@ -1242,6 +1244,7 @@ def self_attention_expert(
     x,
     batch_coordinate,
     mask_right=True,
+    split_batch=False,
     attention_kq_size=None,
     attention_v_size=None,
 ):
@@ -1255,6 +1258,9 @@
       positions from different sequences don't attend to each other.
     mask_right: A bool. If true, we will not attend to positions on the right,
       just as decoder self attention.
+    split_batch (bool): If True, each sequence of the batch is processed
+      individually in a loop. If False, the sequences are processed all at
+      once and a mask is applied to isolate the sequences from each other.
     attention_kq_size (int): dimension used for the attention key, and query
     attention_v_size (int): dimension used for the attention value

@@ -1289,32 +1295,58 @@
   def length_not_null(x, batch_coordinate):
     """Branch of the graph only evaluated when length isn't null."""
+
+    # Mask between the sequences (not used if map_ids is used)
     with tf.name_scope("expert_mask"):
-      batch_coordinate = tf.squeeze(batch_coordinate, 1)
+      batch_coord_float = tf.squeeze(batch_coordinate, 1)
       # Convert to float first because of b/25387198
-      batch_coordinate = tf.to_float(batch_coordinate)
-      bc_v = tf.expand_dims(batch_coordinate, 1)
-      bc_h = tf.expand_dims(batch_coordinate, 0)
-      bias = bc_v - bc_h  # Broadcast to create [length, length] mask
-      bias = tf.minimum(1.0, tf.abs(bias))  # Theshold non zeros to 1.0
-      bias *= -1e9  # Set non zeros to -infinity
-
-      if mask_right:
-        bias += tf.reshape(
+      batch_coord_float = tf.to_float(batch_coord_float)
+      bc_v = tf.expand_dims(batch_coord_float, 1)
+      bc_h = tf.expand_dims(batch_coord_float, 0)
+      bias_batch = bc_v - bc_h  # Broadcast to create [length, length] mask
+      # Threshold non zeros to 1.0
+      bias_batch = tf.minimum(1.0, tf.abs(bias_batch))
+      bias_batch *= -1e9  # Set non zeros to -infinity
+
+    def add_or_set_if(prev_bias, new_bias, condition):
+      """Add the biases together while considering the None case."""
+      if not condition:
+        return prev_bias
+      elif prev_bias is None:
+        return new_bias
+      else:
+        return prev_bias + new_bias
+
+    def mask_and_call_attention(x):
+      """Function applied once for each sequence of the batch."""
+
+      # Mask to prevent sequences from attending to the future
+      length = tf.shape(x)[1]  # x has shape [1, length,...]
+      bias_past = tf.reshape(
           attention_bias_lower_triangle(length), [length, length])
-      # bias has shape [length, length]
-      bias = tf.reshape(bias, [1, 1, length, length])
-      x = tf.reshape(x, [1, length, depth])
-      out = multihead_attention(x,
-                                None,
-                                bias,
-                                total_key_depth=attention_kq_size,
-                                total_value_depth=attention_v_size,
-                                output_depth=depth,
-                                num_heads=1,
-                                dropout_rate=0.0)
-      out = tf.squeeze(out, 0)
-
+      # bias has shape [length, length]
+      bias_past = tf.reshape(bias_past, [1, 1, length, length])
+
+      bias = None
+      bias = add_or_set_if(bias, bias_past, mask_right)
+      bias = add_or_set_if(bias, bias_batch, not split_batch)
+
+      return multihead_attention(
+          x,
+          None,
+          bias,
+          total_key_depth=attention_kq_size,
+          total_value_depth=attention_v_size,
+          output_depth=depth,
+          num_heads=1,
+          dropout_rate=0.0)
+
+    if split_batch:
+      out = expert_utils.map_ids(x, batch_coordinate, mask_and_call_attention)
+    else:
+      x = tf.reshape(x, [1, length, depth])
+      out = mask_and_call_attention(x)
+      out = tf.squeeze(out, 0)
     return out

   # If the length is empty, just forward an empty tensor (avoid having to
@@ -1326,8 +1358,6 @@ def length_not_null(x, batch_coordinate):
   )
   return out

-# functools.partial(self_attention_expert, mask_right=, depth=)
-

 def local_expert_attention(
     x,
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 596d5b01d..87d456b7d 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -127,6 +127,8 @@ def print_shape(x, suffix, debug=False):
     x = dp_remove_pad(x)
     x = dp(print_shape, x, "in_flat")

+    assert hparams.batch_size >= hparams.max_length
+
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
         with tf.variable_scope(
@@ -161,6 +163,7 @@
               train=hparams.mode == ModeKeys.TRAIN,
               batch_coordinate=batch_coordinate,
               mask_right=True,
+              split_batch=bool(hparams.attention_split_batch),
               attention_kq_size=hparams.attention_kq_size,
               attention_v_size=hparams.attention_v_size)
           # TODO(avaswani, epot, noam): Do we need to divide by num shards ?
@@ -344,6 +347,7 @@ def attention_lm_moe_base():
   hparams.add_hparam("attention_type", AttentionType.MULTIHEAD)
   hparams.add_hparam("attention_moe_k", 2)
   hparams.add_hparam("attention_num_experts", 16)
+  hparams.add_hparam("attention_split_batch", int(False))
   # Key, query and value dimensions for the attention
   hparams.add_hparam("attention_kq_size", 128)
   hparams.add_hparam("attention_v_size", 256)
@@ -366,6 +370,9 @@ def attention_lm_moe_base_ae():
   hparams.min_length_bucket = 256  # Avoid cyclic problems for big batches
   hparams.learning_rate = 0.05
   hparams.learning_rate_warmup_steps = 10000
+  # According to Noam, ("n", "da") seems better for harder-to-learn models
+  # hparams.layer_preprocess_sequence = "n"
+  # hparams.layer_postprocess_sequence = "da"
   return hparams


diff --git a/tensor2tensor/utils/expert_utils.py b/tensor2tensor/utils/expert_utils.py
index 16820ff37..8865b9271 100644
--- a/tensor2tensor/utils/expert_utils.py
+++ b/tensor2tensor/utils/expert_utils.py
@@ -23,6 +23,7 @@
 from __future__ import division
 from __future__ import print_function

+import functools
 import math

 # Dependency imports
@@ -60,6 +61,27 @@ def convert_gradient_to_tensor(x):
   return x


+def add_name_scope(scope):
+  """Return a decorator which adds a TF name scope to a function.
+
+  Args:
+    scope (str): name of the name scope
+
+  Returns:
+    fct: the add_name_scope decorator
+  """
+  def decorator(f):
+
+    @functools.wraps(f)
+    def decorated(*args, **kwargs):
+      with tf.name_scope(scope):
+        return f(*args, **kwargs)
+
+    return decorated
+
+  return decorator
+
+
 class Parallelism(object):
   """Helper class for creating sets of parallel function calls.

@@ -517,6 +539,75 @@ def restore(self, x):
     return x


+@add_name_scope("map_ids")
+def map_ids(x, indices, map_fn):
+  """Apply a function to each coordinate id of a multidimensional tensor.
+
+  This allows each sequence of a batch to be processed independently. It is
+  similar to tf.map_fn, but over a tensor whose batch dim has been flattened.
+
+  Warning: The indices have to be contiguous and ordered in memory, as the
+  output vectors for each of the ids are simply concatenated after being
+  processed.
+  Ex: if your indices are [0,2,2,1,2,0], the output will contain the processed
+  rows in the following order: [0,0,1,2,2,2]
+
+  Args:
+    x (Tensor): The tensor to be dispatched, of shape [length,...]
+    indices (Tensor): An int32 tensor of size [length, 1] containing the batch
+      coordinate of x
+    map_fn (fct): Function called for every id of the original tensor. Takes
+      as input a tensor of the same rank as x, of shape [length_id,...] with
+      length_id <= length. Isn't called if length_id == 0
+
+  Returns:
+    a tensor of the same shape as x, where each element has been processed
+  """
+  indices = tf.reshape(indices, [-1])
+
+  t_i = tf.constant(0)
+  # batch_coordinates start at 0
+  t_batch_size = tf.reduce_max(indices) + 1
+
+  # ta_stack_out will store the intermediate results for each individual id
+  # As an alternative to tf.TensorArray, scatter_update could potentially be
+  # used but that would require an additional mutable tensor.
+  ta_stack_out = tf.TensorArray(
+      x.dtype,
+      size=t_batch_size,
+  )
+
+  # Then we iterate over each sequence individually and compute the
+  # transformation for each id
+  while_condition = lambda t_i, *args: tf.less(t_i, t_batch_size)
+  def body(t_i, ta_stack_out):
+    """Loop body."""
+    # Gather the ids
+    current_ids = tf.to_int32(tf.where(tf.equal(indices, t_i)))
+    t_row = tf.gather_nd(x, indices=current_ids)
+
+    # TODO(epot): Should not call map_fn if t_row size is 0
+
+    # Apply transformation to each id
+    # Restore batch_dim=1 as most functions expect [batch_dim, length, ...] as
+    # input
+    t_row = tf.expand_dims(t_row, axis=0)
+    t_row = map_fn(t_row)
+    t_row = tf.squeeze(t_row, axis=0)  # Squeeze for concatenation
+    ta_stack_out = ta_stack_out.write(t_i, t_row)
+
+    return [tf.add(t_i, 1), ta_stack_out]  # ++i
+
+  # Run the loop, equivalent to:
+  # stack_out = []
+  # while i < batch_size:
+  #   stack_out.extend(map_fn(x[indices==i]))
+  _, ta_stack_out = tf.while_loop(while_condition, body, [t_i, ta_stack_out])
+
+  # Merge all results
+  return ta_stack_out.concat()
+
+
 class SparseDispatcher(object):
   """Helper for implementing a mixture of experts.


From 327c8d23999048596c1e9a7a59abc369ffd1ee4e Mon Sep 17 00:00:00 2001
From: Ryan Sepassi
Date: Thu, 7 Sep 2017 17:08:25 -0700
Subject: [PATCH 25/32] Merge PRs #274 #282

PiperOrigin-RevId: 167933701
---
 README.md                                |  4 ++--
 docs/new_problem.md                      | 38 +++++++++++++++++++-----
 tensor2tensor/visualization/attention.py |  2 +-
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 4e56d7855..bec411f1e 100644
--- a/README.md
+++ b/README.md
@@ -214,8 +214,8 @@ on the task (e.g.
 fed through a final linear transform to produce logits for a
 softmax over classes). All models are imported in
 [`models.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models/models.py),
 inherit from `T2TModel` - defined in
-[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py)
-- and are registered with
+[`t2t_model.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/t2t_model.py) -
+and are registered with
 [`@registry.register_model`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/registry.py).

 ### Hyperparameter Sets
diff --git a/docs/new_problem.md b/docs/new_problem.md
index d581a3a1b..ab5dd5e26 100644
--- a/docs/new_problem.md
+++ b/docs/new_problem.md
@@ -15,9 +15,17 @@ Let's add a new dataset together and train the transformer model. We'll be learn

 For each problem we want to tackle we create a new problem class and register it. Let's call our problem `Word2def`.

-Since many text2text problems share similar methods, there's already a class called `Text2TextProblem` that extends the base problem class, `Problem` (both found in `problem.py`).
-
-For our problem, we can go ahead and create the file `word2def.py` in the `data_generators` folder and add our new problem, `Word2def`, which extends `Text2TextProblem`. Let's also register it while we're at it so we can specify the problem through flags.
+Since many text2text problems share similar methods, there's already a class
+called `Text2TextProblem` that extends the base problem class, `Problem`
+(both found in
+[`problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)).
+
+For our problem, we can go ahead and create the file `word2def.py` in the
+[`data_generators`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/)
+folder and add our new problem, `Word2def`, which extends
+[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py).
+Let's also register it while we're at it so we can specify the problem through
+flags.

 ```python
 @registry.register_problem
@@ -28,7 +36,9 @@ class Word2def(problem.Text2TextProblem):
   ...
 ```

-We need to implement the following methods from `Text2TextProblem` in our new class:
+We need to implement the following methods from
+[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+in our new class:
 * is_character_level
 * targeted_vocab_size
 * generator
@@ -42,7 +52,12 @@ Let's tackle them one by one:

 **input_space_id, target_space_id, is_character_level, targeted_vocab_size, use_subword_tokenizer**:

-SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are in. These are things like, EN_CHR (English character), EN_TOK (English token), AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be found at `data_generators/problem.py` in the class `SpaceID`.
+SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are
+in. These are things like EN_CHR (English character), EN_TOK (English token),
+AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be
+found at
+[`data_generators/problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+in the class `SpaceID`.
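The registration decorator shown above is also what wires a problem into the command-line flags: the class name is converted to a snake_case registry key. A hedged sketch of the lookup side (assuming the `Word2def` class from this doc has been imported somewhere; the misspelled name is only there to show the failure mode):

```python
from tensor2tensor.utils import registry

# Class Word2def registers under the key "word2def" (snake_case of the
# class name, via registry._default_name), so the trainer can select it
# with --problems=word2def.
word2def_problem = registry.problem("word2def")

# Unknown names fail fast; after patch 26 below this raises a LookupError
# whose message lists the registered problems.
registry.problem("wrod2def")  # raises
```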
Since we're generating definitions and feeding in words at the character level, we set `is_character_level` to true, and use the same SpaceID, EN_CHR, for both input and target. Additionally, since we aren't using tokens, we don't need to give a `targeted_vocab_size` or define `use_subword_tokenizer`. @@ -86,7 +101,15 @@ class Word2def(problem.Text2TextProblem): **generator**: -We're almost done. `generator` generates the training and evaluation data and stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully several commonly used methods like `character_generator`, and `token_generator` are already written in the file `wmt.py`. We will import `character_generator` and write: +We're almost done. `generator` generates the training and evaluation data and +stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully +several commonly used methods like `character_generator`, and `token_generator` +are already written in the file +[`wmt.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py). +We will import `character_generator` and +[`text_encoder`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/text_encoder.py) +to write: + ```python def generator(self, data_dir, tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() @@ -151,7 +174,8 @@ _WORD2DEF_TEST_DATASETS = [ ## Putting it all together -Now our `word2def.py` file looks like: (with the correct imports) +Now our `word2def.py` file looks like: + ```python """ Problem definition for word to dictionary definition. """ diff --git a/tensor2tensor/visualization/attention.py b/tensor2tensor/visualization/attention.py index bc4238081..6109f9cc6 100644 --- a/tensor2tensor/visualization/attention.py +++ b/tensor2tensor/visualization/attention.py @@ -15,7 +15,7 @@ """Module for postprocessing and displaying tranformer attentions. -This module is deigned to be called from an ipython notebook. +This module is designed to be called from an ipython notebook. """ import json From 0c0016a81424088e96df9fc6d712ce9b6ad90226 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 7 Sep 2017 17:17:12 -0700 Subject: [PATCH 26/32] Fix error message when problem is misspecified PiperOrigin-RevId: 167934726 --- .../data_generators/problem_hparams.py | 5 +--- tensor2tensor/utils/registry.py | 30 +++++++++---------- tensor2tensor/utils/registry_test.py | 12 ++++---- tensor2tensor/utils/trainer_utils.py | 18 +++++++++-- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index e002329bc..f4880e4d9 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -40,9 +40,6 @@ def problem_hparams(problem_name, model_hparams): Returns: a tf.contrib.training.HParams - - Raises: - ValueError: if problem_name is unknown. 
""" base_name, was_reversed, was_copy = parse_problem_name(problem_name) p = _lookup_problem_hparams_fn(base_name)(model_hparams) @@ -78,7 +75,7 @@ def _lookup_problem_hparams_fn(name): if name not in PROBLEM_HPARAMS_MAP: map_str = "* " + "\n* ".join(sorted(PROBLEM_HPARAMS_MAP.keys())) error_msg = "%s not in the supported set of problems:\n%s" % (name, map_str) - raise ValueError(error_msg) + raise LookupError(error_msg) return PROBLEM_HPARAMS_MAP.get(name) diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index f1db2f36c..2b708b4ce 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -123,7 +123,7 @@ def decorator(model_cls, registration_name=None): """Registers & returns model_cls with registration_name or default name.""" model_name = registration_name or _default_name(model_cls) if model_name in _MODELS: - raise ValueError("Model %s already registered." % model_name) + raise LookupError("Model %s already registered." % model_name) _MODELS[model_name] = model_cls return model_cls @@ -137,7 +137,7 @@ def decorator(model_cls, registration_name=None): def model(name): if name not in _MODELS: - raise ValueError("Model %s never registered." % name) + raise LookupError("Model %s never registered." % name) return _MODELS[name] @@ -152,7 +152,7 @@ def decorator(hp_fn, registration_name=None): """Registers & returns hp_fn with registration_name or default name.""" hp_name = registration_name or _default_name(hp_fn) if hp_name in _HPARAMS: - raise ValueError("HParams set %s already registered." % hp_name) + raise LookupError("HParams set %s already registered." % hp_name) _HPARAMS[hp_name] = hp_fn return hp_fn @@ -166,7 +166,7 @@ def decorator(hp_fn, registration_name=None): def hparams(name): if name not in _HPARAMS: - raise ValueError("HParams set %s never registered." % name) + raise LookupError("HParams set %s never registered." % name) return _HPARAMS[name] @@ -181,7 +181,7 @@ def decorator(rhp_fn, registration_name=None): """Registers & returns hp_fn with registration_name or default name.""" rhp_name = registration_name or _default_name(rhp_fn) if rhp_name in _RANGED_HPARAMS: - raise ValueError("RangedHParams set %s already registered." % rhp_name) + raise LookupError("RangedHParams set %s already registered." % rhp_name) # Check that the fn takes a single argument args, varargs, keywords, _ = inspect.getargspec(rhp_fn) if len(args) != 1 or varargs is not None or keywords is not None: @@ -201,7 +201,7 @@ def decorator(rhp_fn, registration_name=None): def ranged_hparams(name): if name not in _RANGED_HPARAMS: - raise ValueError("RangedHParams set %s never registered." % name) + raise LookupError("RangedHParams set %s never registered." % name) return _RANGED_HPARAMS[name] @@ -216,7 +216,7 @@ def decorator(p_cls, registration_name=None): """Registers & returns p_cls with registration_name or default name.""" p_name = registration_name or _default_name(p_cls) if p_name in _PROBLEMS: - raise ValueError("Problem %s already registered." % p_name) + raise LookupError("Problem %s already registered." % p_name) _PROBLEMS[p_name] = p_cls p_cls.name = p_name @@ -258,7 +258,7 @@ def parse_problem_name(problem_name): base_name, was_reversed, was_copy = parse_problem_name(name) if base_name not in _PROBLEMS: - raise ValueError("Problem %s never registered." % name) + raise LookupError("Problem %s never registered." 
% name) return _PROBLEMS[base_name](was_reversed, was_copy) @@ -270,8 +270,8 @@ def _internal_get_modality(name, mod_collection, collection_str): if name is None: name = "default" if name not in mod_collection: - raise ValueError("%s modality %s never registered." % (collection_str, - name)) + raise LookupError("%s modality %s never registered." % (collection_str, + name)) return mod_collection[name] @@ -312,8 +312,8 @@ def decorator(mod_cls, registration_name=None): """Registers & returns mod_cls with registration_name or default name.""" mod_name = registration_name or _default_name(mod_cls) if mod_name in mod_collection: - raise ValueError("%s modality %s already registered." % (collection_str, - mod_name)) + raise LookupError("%s modality %s already registered." % (collection_str, + mod_name)) mod_collection[mod_name] = mod_cls return mod_cls @@ -391,7 +391,7 @@ def create_modality(modality_spec, model_hparams): Modality instance. Raises: - ValueError: if modality_type is not recognized. See Modalities class for + LookupError: if modality_type is not recognized. See Modalities class for accepted types. """ retrieval_fns = { @@ -406,8 +406,8 @@ def create_modality(modality_spec, model_hparams): modality_full_name, vocab_size = modality_spec modality_type, modality_name = parse_modality_name(modality_full_name) if modality_type not in retrieval_fns: - raise ValueError("Modality type %s not recognized. Options are: %s" % - (modality_type, list(_MODALITIES))) + raise LookupError("Modality type %s not recognized. Options are: %s" % + (modality_type, list(_MODALITIES))) return retrieval_fns[modality_type](modality_name)(model_hparams, vocab_size) diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 62c24b054..d97dc6bdc 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -63,7 +63,7 @@ def model_fn(): self.assertTrue(model is model_fn) def testUnknownModel(self): - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.model("not_registered") def testDuplicateRegistration(self): @@ -72,7 +72,7 @@ def testDuplicateRegistration(self): def m1(): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_model("m1") def m2(): @@ -137,9 +137,9 @@ def my_hparams_range(_): self.assertTrue(registry.ranged_hparams("a") is my_hparams_range) def testUnknownHparams(self): - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.hparams("not_registered") - with self.assertRaisesRegexp(ValueError, "never registered"): + with self.assertRaisesRegexp(LookupError, "never registered"): registry.ranged_hparams("not_registered") def testDuplicateRegistration(self): @@ -148,7 +148,7 @@ def testDuplicateRegistration(self): def hp1(): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_hparams("hp1") def hp2(): @@ -158,7 +158,7 @@ def hp2(): def rhp1(_): pass - with self.assertRaisesRegexp(ValueError, "already registered"): + with self.assertRaisesRegexp(LookupError, "already registered"): @registry.register_ranged_hparams("rhp1") def rhp2(_): diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 08359ea5c..be5e5530f 100644 --- 
a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -149,7 +149,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, """Create Experiment.""" hparams = create_hparams( FLAGS.hparams_set, FLAGS.problems, data_dir, passed_hparams=FLAGS.hparams) - if FLAGS.worker_id == 0: + if FLAGS.worker_id == 0 and FLAGS.schedule in ["local_run", "train"]: save_metadata(output_dir, hparams) estimator, input_fns = create_experiment_components( hparams=hparams, @@ -226,11 +226,23 @@ def add_problem_hparams(hparams, problems): for problem_name in problems.split("-"): try: problem = registry.problem(problem_name) - except ValueError: + except LookupError: problem = None if problem is None: - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + try: + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + except LookupError: + # The problem is not in the set of registered Problems nor in the old + # set of problem_hparams. + all_problem_names = sorted( + list(problem_hparams.PROBLEM_HPARAMS_MAP) + + registry.list_problems()) + error_lines = [ + "%s not in the set of supported problems:" % problem_name + ] + all_problem_names + error_msg = "\n * ".join(error_lines) + raise LookupError(error_msg) else: p_hparams = problem.get_hparams(hparams) From 6d004bdc853e2fc7fe6aa341dfefbb89d6b17963 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Thu, 7 Sep 2017 22:56:03 -0700 Subject: [PATCH 27/32] Edit 2d scope name PiperOrigin-RevId: 167958304 --- tensor2tensor/layers/common_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 3f3885b10..c5a0c60cb 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1066,7 +1066,7 @@ def multihead_attention_2d(query_antecedent, "attention heads (%d)." % (total_value_depth, num_heads)) with tf.variable_scope( name, - default_name="multihead_attention", + default_name="multihead_attention_2d", values=[query_antecedent, memory_antecedent]): q, k, v = compute_qkv_2d(query_antecedent, memory_antecedent, total_key_depth, total_value_depth) From c99d5b5d350feb33ecb99f1bbbc74a2660e8a46b Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Fri, 8 Sep 2017 09:33:02 -0700 Subject: [PATCH 28/32] log diet variables properly PiperOrigin-RevId: 168006293 --- tensor2tensor/utils/model_builder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 21ef96b28..1540c0f88 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -38,6 +38,7 @@ from tensor2tensor.utils import yellowfin import tensorflow as tf +from tensorflow.python.framework import dtypes from tensorflow.python.ops import init_ops # TODO(rsepassi): Rm dep on FLAGS here @@ -363,7 +364,9 @@ def nth_model(n): total_loss *= small_batch_multiplier total_loss = tf.identity(total_loss, name="total_loss") log_variable_sizes(tf.trainable_variables(), "Trainable Variables") - diet_vars = [v for v in tf.global_variables() if hasattr(v, "optimizer")] + diet_vars = [ + v for v in tf.global_variables() if v.dtype == dtypes.float16_ref + ] log_variable_sizes(diet_vars, "Diet Varaibles") # Define the train_op for the TRAIN mode. 
    opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp)

From 8f5fcc2d0ef416bbf06b2e2e777100da071292a1 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Fri, 8 Sep 2017 13:27:18 -0700
Subject: [PATCH 29/32] add wiki-scramble dataset.

PiperOrigin-RevId: 168037859
---
 tensor2tensor/data_generators/wiki.py    | 117 +++++++++++++++++++++++
 tensor2tensor/models/attention_lm_moe.py |  35 +++++--
 2 files changed, 145 insertions(+), 7 deletions(-)

diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py
index 9610cb1d8..6f6c97686 100644
--- a/tensor2tensor/data_generators/wiki.py
+++ b/tensor2tensor/data_generators/wiki.py
@@ -25,6 +25,8 @@

 import bz2file

+import numpy as np
+
 import six
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
@@ -130,3 +132,118 @@ def generator(self, data_dir, tmp_dir, _):
       encoded = encoder.encode(page) + [EOS]
       encoded_title = encoder.encode(title) + [EOS]
       yield {"inputs": encoded_title, "targets": encoded}
+
+
+class LanguagemodelWikiScramble(problem.Text2TextProblem):
+  """Language modeling on English Wikipedia.
+
+  "targets" is a sequence of sequence_length tokens - a fragment of an article.
+  "inputs" is a copy of "targets", but with a random scramble_fraction of the
+  tokens randomly permuted.
+
+  This dataset is intended to test parallel (non-autoregressive) prediction
+  of the target sequence given the input sequence.
+  """
+
+  @property
+  def sequence_length(self):
+    raise NotImplementedError()
+
+  @property
+  def scramble_fraction(self):
+    raise NotImplementedError()
+
+  @property
+  def is_character_level(self):
+    return False
+
+  @property
+  def has_inputs(self):
+    return True
+
+  @property
+  def input_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def target_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def num_shards(self):
+    return 1000
+
+  @property
+  def vocab_name(self):
+    return "vocab.wiki"
+
+  @property
+  def use_subword_tokenizer(self):
+    return True
+
+  @property
+  def targeted_vocab_size(self):
+    return 2**13  # 8192
+
+  @property
+  def use_train_shards_for_dev(self):
+    return True
+
+  @property
+  def max_cases(self):
+    return (2 ** 30) / self.sequence_length
+
+  def scramble(self, seq):
+    seq = np.array(seq)
+    num_permute = int(self.sequence_length * self.scramble_fraction)
+    full_permutation = np.random.permutation(self.sequence_length)
+    inverse_full_permutation = np.argsort(full_permutation)
+    partial_permutation = np.random.permutation(num_permute)
+    seq = seq[full_permutation]
+    seq = np.concatenate(
+        (seq[:num_permute][partial_permutation], seq[num_permute:]))
+    seq = seq[inverse_full_permutation]
+    seq = list(seq)
+    return seq
+
+  def generator(self, data_dir, tmp_dir, _):
+    encoder = generator_utils.get_or_generate_vocab_inner(
+        data_dir, self.vocab_file, self.targeted_vocab_size,
+        lambda: page_generator(tmp_dir, max_docs=1000))
+    case_num = 0
+    for page in page_generator(tmp_dir):
+      encoded = encoder.encode(page)
+      for i in xrange(len(encoded) // self.sequence_length):
+        case_num += 1
+        if self.max_cases and case_num > self.max_cases:
+          return
+        targets = encoded[
+            i * self.sequence_length:(i + 1) * self.sequence_length]
+        inputs = self.scramble(targets)
+        yield {"inputs": inputs, "targets": targets}
+
+
+@registry.register_problem
+class LanguagemodelWikiScramble1k50(LanguagemodelWikiScramble):
+  """Sequence length 1024, 50% scrambled."""

+  @property
+  def sequence_length(self):
+    return 1024
+
+  @property
+  def scramble_fraction(self):
+    return 0.5
+
+
+@registry.register_problem
+class LanguagemodelWikiScramble8k50(LanguagemodelWikiScramble):
+  """Sequence length 8192, 50% scrambled."""
+
+  @property
+  def sequence_length(self):
+    return 8192
+
+  @property
+  def scramble_fraction(self):
+    return 0.5
diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py
index 87d456b7d..cd54ce64e 100644
--- a/tensor2tensor/models/attention_lm_moe.py
+++ b/tensor2tensor/models/attention_lm_moe.py
@@ -68,8 +68,14 @@ def model_fn_body_sharded(self, sharded_features):
     # Remove dropout if not training
     hparams = self._hparams
     dp = self._data_parallelism
-    targets = sharded_features["targets"]
-    targets = dp(tf.squeeze, targets, 2)
+    if hparams.use_inputs:
+      decoder_input = dp(tf.squeeze, sharded_features["inputs"], 2)
+      decoder_self_attention_bias = None
+    else:
+      targets = sharded_features["targets"]
+      targets = dp(tf.squeeze, targets, 2)
+      (decoder_input, decoder_self_attention_bias, pad_remover) = dp(
+          attention_lm_moe_prepare_decoder, targets, hparams)

     def preprocess(x):
       return dp(common_layers.layer_preprocess, x, hparams)
@@ -77,9 +83,6 @@ def preprocess(x):
     def postprocess(x, y):
       return dp(common_layers.layer_postprocess, x, y, hparams)

-    (decoder_input, decoder_self_attention_bias, pad_remover) = dp(
-        attention_lm_moe_prepare_decoder, targets, hparams)
-
     x = dp(tf.nn.dropout, decoder_input,
            1.0 - hparams.layer_prepostprocess_dropout)
     extra_loss = 0.0
@@ -95,7 +98,8 @@ def _diet_expert(x):
     expert_fn = expert_utils.ffn_expert_fn(
         hparams.hidden_size, moe_hidden_sizes, hparams.hidden_size)

-    if hparams.attention_type == AttentionType.LOCAL_EXPERTS:
+    if (hparams.attention_type == AttentionType.LOCAL_EXPERTS
+        and not hparams.use_inputs):
       # As preprocess and postprocess are called with batch of size one (all
       # batches concatenated), we just make sure that batch_norm is not use (
       # should not either way)
@@ -162,7 +166,7 @@ def print_shape(x, suffix, debug=False):
             attention_num_experts=hparams.attention_num_experts,
             train=hparams.mode == ModeKeys.TRAIN,
             batch_coordinate=batch_coordinate,
-            mask_right=True,
+            mask_right=not hparams.use_inputs,
             split_batch=bool(hparams.attention_split_batch),
             attention_kq_size=hparams.attention_kq_size,
             attention_v_size=hparams.attention_v_size)
@@ -356,6 +360,9 @@ def attention_lm_moe_base():
   hparams.add_hparam("use_sepconv", int(False))
   hparams.add_hparam("diet_experts", int(False))
   hparams.add_hparam("memory_efficient_ffn", int(False))
+  # if True, we learn a non-autoregressive model from "inputs" to "targets".
+ # if False, we learn an autoregressive model to generate "targets" + hparams.add_hparam("use_inputs", int(False)) return hparams @@ -526,3 +533,17 @@ def attention_lm_moe_translation(): hparams.moe_layers = "0,1,2,3,4,5" hparams.shared_embedding_and_softmax_weights = int(True) return hparams + + +@registry.register_hparams +def attention_lm_moe_unscramble_base(): + """Version to use with languagemodel_wiki_scramble1k50.""" + hparams = attention_lm_no_moe_small() + hparams.use_inputs = True + hparams.min_length_bucket = 1024 + hparams.max_length = 1024 + hparams.batch_size = 5000 + hparams.layer_prepostprocess_dropout = 0.0 + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + return hparams From 1991f7b8addb657abe41bb633e1d909edade56ce Mon Sep 17 00:00:00 2001 From: Etienne Pot Date: Fri, 8 Sep 2017 13:51:17 -0700 Subject: [PATCH 30/32] Add option for local attention in attention_lm_moe. PiperOrigin-RevId: 168041046 --- tensor2tensor/models/attention_lm_moe.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index cd54ce64e..adbb871b5 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -148,6 +148,8 @@ def print_shape(x, suffix, debug=False): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, + attention_type=("local_mask_right" if hparams.attention_local + else "dot_product"), name="decoder_self_attention") elif hparams.attention_type == AttentionType.MEMORY_EFFICIENT: assert hparams.layer_preprocess_sequence == "n" @@ -349,6 +351,7 @@ def attention_lm_moe_base(): hparams.add_hparam("moe_layers", "2") # comma separated list of layer numbers # moe params. local attention moe. hparams.add_hparam("attention_type", AttentionType.MULTIHEAD) + hparams.add_hparam("attention_local", int(False)) hparams.add_hparam("attention_moe_k", 2) hparams.add_hparam("attention_num_experts", 16) hparams.add_hparam("attention_split_batch", int(False)) @@ -383,6 +386,18 @@ def attention_lm_moe_base_ae(): return hparams +@registry.register_hparams +def attention_lm_moe_base_local(): + """Base model with attention expert.""" + hparams = attention_lm_moe_base() + hparams.attention_local = int(True) + hparams.use_sepconv = int(True) + hparams.max_length = 0 # max_length == batch_size + hparams.eval_drop_long_sequences = int(True) + hparams.min_length_bucket = 256 # Avoid cyclic problems for big batches + return hparams + + @registry.register_hparams def attention_lm_moe_small(): """Cheap model for single-gpu training. 
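Worth unpacking from patch 29 above: `scramble` permutes the whole sequence, shuffles only the first `num_permute` slots, then applies the inverse permutation, so only a randomly chosen `scramble_fraction` of positions can end up displaced. A standalone numpy rendering of the same steps (illustrative; the patch's version reads `sequence_length` from the problem):

```python
import numpy as np

def scramble(seq, scramble_fraction):
  """Randomly permute roughly a scramble_fraction of the positions in seq."""
  seq = np.array(seq)
  num_permute = int(len(seq) * scramble_fraction)
  full_permutation = np.random.permutation(len(seq))
  inverse_full_permutation = np.argsort(full_permutation)
  # Shuffle the first num_permute entries of the permuted sequence...
  partial_permutation = np.random.permutation(num_permute)
  seq = seq[full_permutation]
  seq = np.concatenate(
      (seq[:num_permute][partial_permutation], seq[num_permute:]))
  # ...then undo the outer permutation: only those entries have moved.
  return list(seq[inverse_full_permutation])

print(scramble(list(range(8)), 0.5))  # e.g. [6, 1, 2, 0, 4, 5, 3, 7]
```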
From 1d769553d3e9e4942229a705a526080626c6d16d Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 8 Sep 2017 14:45:11 -0700 Subject: [PATCH 31/32] v1.2.2 PiperOrigin-RevId: 168048958 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b51070c77..119eeea7e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.2.1', + version='1.2.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From b8e59e746919a80f0ccd30dbf87426928c856218 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 8 Sep 2017 14:46:56 -0700 Subject: [PATCH 32/32] open source fixes PiperOrigin-RevId: 168049257 --- tensor2tensor/utils/trainer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index be5e5530f..5ab3db70c 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -265,7 +265,7 @@ def save_metadata(output_dir, hparams): else: flags_dict = FLAGS.__dict__["__flags"] flags_str = "\n".join( - ["--%s=%s" % (name, str(f.value)) for (name, f) in flags_dict.items()]) + ["--%s=%s" % (name, str(f)) for (name, f) in flags_dict.items()]) t2t_flags_str = None flags_txt = os.path.join(output_dir, "flags.txt")
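One pattern worth lifting out of patch 26: registry lookups now raise `LookupError` rather than `ValueError`, which is what lets `add_problem_hparams` try the registered `Problem`s first and only then fall back to the legacy `problem_hparams` map. A minimal sketch of that contract (the wrapper function below is illustrative, not part of the library):

```python
from tensor2tensor.utils import registry

def lookup_problem(problem_name):
  """Resolve a problem by name, mirroring add_problem_hparams' fallback."""
  try:
    return registry.problem(problem_name)
  except LookupError:
    # Not a registered Problem; report the supported names, as the
    # patched trainer_utils does before giving up.
    candidates = sorted(registry.list_problems())
    raise LookupError(
        "%s not in the set of supported problems:\n * %s"
        % (problem_name, "\n * ".join(candidates)))
```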