From 2e55ec24d728be1323ba3b20b08facb4abf8004e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 12:30:18 -0700 Subject: [PATCH 1/6] Modality.loss PiperOrigin-RevId: 163376624 --- tensor2tensor/models/modalities.py | 68 ++++++----------------- tensor2tensor/models/modalities_test.py | 10 ++-- tensor2tensor/utils/modality.py | 73 +++++++++++++------------ tensor2tensor/utils/t2t_model.py | 13 +++-- 4 files changed, 69 insertions(+), 95 deletions(-) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 20464c0a2..c57a97905 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -96,12 +96,11 @@ def targets_bottom(self, x): else: return self.bottom_simple(x, "target_emb", reuse=None) - def top(self, body_output, targets): + def top(self, body_output, _): """Generate logits. Args: body_output: A Tensor with shape [batch, p0, p1, body_input_depth] - targets: A Tensor with shape [batch, p0, p1, 1] Returns: logits: A Tensor with shape [batch, p0, p1, ?, vocab_size]. """ @@ -192,18 +191,11 @@ def top(self, body_output, _): return logits - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. # (Since we're processing images and so have no padding and some pixel 0s.) - return super(SmallImageModality, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + return super(SmallImageModality, self).loss( + top_out, targets, weights_fn=weights_fn) @registry.register_image_modality("default") @@ -425,18 +417,11 @@ def top(self, body_output, _): res = common_layers.conv(x, self._vocab_size, (1, 1)) return tf.expand_dims(res, 3) - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. - # (Since we're processing images and so have no padding and some labels 0.) - return super(ClassLabelModality, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + # (Since we're processing images and so have no padding and some pixel 0s.) 
+ return super(ClassLabelModality, self).loss( + top_out, targets, weights_fn=weights_fn) @registry.register_class_label_modality("class_label_2d") @@ -479,24 +464,12 @@ def top(self, body_output, _): with tf.variable_scope("real"): return tf.layers.dense(body_output, self._vocab_size) - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_nonzero): - sharded_predictions = data_parallelism(self.top, sharded_body_output, - sharded_targets) - - def l2_loss(predictions, targets): - with tf.name_scope("l2"): - weights = weights_fn(targets) - l2 = tf.pow(predictions - targets, 2) - return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) - - loss_num, loss_den = data_parallelism(l2_loss, sharded_predictions, - sharded_targets) - loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) - return sharded_predictions, loss + def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + predictions = top_out + with tf.name_scope("l2"): + weights = weights_fn(targets) + l2 = tf.pow(predictions - targets, 2) + return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) @registry.register_image_modality("identity_no_pad") @@ -513,15 +486,8 @@ def bottom(self, x): def top(self, body_output, _): return body_output - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. # (Since we're processing images and so have no padding and some pixel 0s.) - return super(IdentityModalityNoPad, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + return super(IdentityModalityNoPad, self).loss( + top_out, targets, weights_fn=weights_fn) diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/models/modalities_test.py index 4254c6b04..9130613b9 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/models/modalities_test.py @@ -41,8 +41,8 @@ def testSymbolModalityInputs(self): hidden_size=hidden_size, multiply_embedding_mode="sqrt_depth", shared_embedding_and_softmax_weights=0) - x = -1 + np.random.random_integers(vocab_size, size=( - batch_size, length, 1, 1)) + x = -1 + np.random.random_integers( + vocab_size, size=(batch_size, length, 1, 1)) m = modalities.SymbolModality(model_hparams, vocab_size) data_parallelism = expert_utils.Parallelism( ["/device:CPU:0"] * num_datashards, reuse=True) @@ -76,8 +76,10 @@ def testSymbolModalityTargets(self): with self.test_session() as session: sharded_body_output = tf.split(tf.to_float(body_output), num_datashards) sharded_targets = tf.split(targets, num_datashards) - sharded_logits, train_loss = m.top_sharded( - sharded_body_output, sharded_targets, data_parallelism) + sharded_logits = m.top_sharded(sharded_body_output, sharded_targets, + data_parallelism) + train_loss = m.loss_sharded(sharded_logits, sharded_targets, + data_parallelism) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res1, res2 = session.run((logits, train_loss)) diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 72169be1f..5c596e10f 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -31,23 +31,26 @@ class Modality(object): """Abstract Modality class for data transformations. 
An abstract class representing modalities for transforming data to a space - interpretable by sequence models. It has 3 functions: - * bottom: called on inputs entering the model. + interpretable by T2T models. It has 4 functions: + * bottom: called on inputs entering the model. * targets_bottom: called on targets entering the model (e.g., the decoder). - * top: called on targets to generate predictions. - - For example, think about a modality for images. The inputs_bottom function - represents the part of the model applied to an incoming image, e.g., an entry - flow of a convolutional network. The targets_top function represents the top - part of a model that is generating images, e.g., a PixelCNN network. The final - function targets_bottom represents the auto-regressive part of the network. - It is applied to the already-generated part of an image, which is given to - the decoder to generate the next part. In some cases, e.g., for text, it is - the same as the inputs_bottom function, and that is the default we use. But, - e.g., for images, a different function might be needed to regress properly. - - All 3 functions have simple and sharded versions. A sub-class only needs - to implement the simple version, the default sharding will be used then. + * top: called on model outputs to generate predictions (e.g., logits). + * loss: called on predictions (outputs of top) and targets. + + For example, think about a modality for images: + * `bottom` represents the part of the model applied to an incoming image, + e.g., an entry flow of a convolutional network. + * `top` represents the top part of a model that is generating images, e.g., a + PixelCNN network. + * `targets_bottom` represents the auto-regressive part of the network. It is + applied to the already-generated part of an image, which is given to the + decoder to generate the next part. In some cases, e.g., for text, it is the + same as the `bottom` function, and that is the default we use. But, e.g., + for images, a different function might be needed to regress properly. + * `loss` would compare the generated image to the target image and score it. + + All the functions have simple and sharded versions. A sub-class only needs to + implement the simple version, the default sharding will be used then. """ def __init__(self, model_hparams, vocab_size=None): @@ -116,7 +119,7 @@ def targets_bottom_sharded(self, xs, data_parallelism): return data_parallelism(self.targets_bottom, xs) def top(self, body_output, targets): - """Transform one shard of output. + """Generate predictions/logits for one shard of output. Most classes will override this function. @@ -129,12 +132,8 @@ def top(self, body_output, targets): """ raise NotImplementedError("Abstract Method") - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_nonzero): - """Transform all shards of targets. + def top_sharded(self, sharded_body_output, sharded_targets, data_parallelism): + """Generate predictions/logits for all shards. Classes with cross-shard interaction will override this function. @@ -142,18 +141,24 @@ def top_sharded(self, sharded_body_output: A list of Tensors. sharded_targets: A list of Tensors. data_parallelism: a expert_utils.Parallelism object. - weights_fn: function from targets to target weights. Returns: - shaded_logits: A list of Tensors. - training_loss: a Scalar. + sharded_logits: A list of Tensors. 
""" - sharded_logits = data_parallelism(self.top, sharded_body_output, - sharded_targets) - loss_num, loss_den = data_parallelism( - common_layers.padded_cross_entropy, - sharded_logits, - sharded_targets, + return data_parallelism(self.top, sharded_body_output, sharded_targets) + + def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + """Compute loss numerator and denominator for one shard of output.""" + logits = top_out + return common_layers.padded_cross_entropy( + logits, + targets, self._model_hparams.label_smoothing, weights_fn=weights_fn) - loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) - return sharded_logits, loss + + def loss_sharded(self, sharded_top_out, sharded_targets, data_parallelism): + """Compute loss for all shards.""" + sharded_loss_num, sharded_loss_den = data_parallelism( + self.loss, sharded_top_out, sharded_targets) + loss = tf.add_n(sharded_loss_num) / tf.maximum(1.0, + tf.add_n(sharded_loss_den)) + return loss diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 66e40d495..f67cc9540 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -424,8 +424,10 @@ def model_fn(self, features, skip=False, last_position_only=False): with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: - sharded_logits, training_loss = (target_modality.top_sharded( - body_outputs, sharded_features["targets"], self._data_parallelism)) + sharded_logits = target_modality.top_sharded( + body_outputs, sharded_features["targets"], self._data_parallelism) + training_loss = target_modality.loss_sharded( + sharded_logits, sharded_features["targets"], self._data_parallelism) training_loss *= self._problem_hparams.loss_multiplier else: @@ -439,10 +441,9 @@ def model_fn(self, features, skip=False, last_position_only=False): tf.expand_dims(target_shard[:, -1:, :, :], axis=[1]) for target_shard in sharded_features["targets"] ] - sharded_logits, training_loss = (target_modality.top_sharded( - last_position_body_outputs, last_position_targets, - self._data_parallelism)) - + sharded_logits = target_modality.top_sharded(last_position_body_outputs, + last_position_targets, + self._data_parallelism) training_loss = None tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) From b82bdfd13a94a05a82dc6755126c1ad1bcc6c02c Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 27 Jul 2017 16:05:10 -0700 Subject: [PATCH 2/6] correct metrics and some generator and python3 corrections. 
PiperOrigin-RevId: 163402917 --- tensor2tensor/data_generators/gene_expression.py | 12 ++++++------ tensor2tensor/data_generators/generator_utils.py | 2 +- tensor2tensor/data_generators/text_encoder.py | 4 ++-- tensor2tensor/data_generators/wmt.py | 7 ++++--- tensor2tensor/utils/metrics.py | 3 +++ 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 31d1cd150..1bb9d4ab3 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -110,10 +110,10 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [(self.training_filepaths, self.num_shards, "train", - num_train_examples), (self.dev_filepaths, 1, "valid", - num_dev_examples), - (self.test_filepaths, 1, "test", num_test_examples)] + datasets = [ + (self.training_filepaths, self.num_shards, "train", num_train_examples), + (self.dev_filepaths, 10, "valid", num_dev_examples), + (self.test_filepaths, 10, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) @@ -125,8 +125,8 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): start_idx, end_idx)) processes.append(p) - # 1 per training shard + dev + test - assert len(processes) == self.num_shards + 2 + # 1 per training shard + 10 for dev + 10 for test + assert len(processes) == self.num_shards + 20 # Start and wait for processes in batches num_batches = int( diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 866a0f3e7..5c7f9f2a1 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -305,7 +305,7 @@ def generate(): # Use Tokenizer to count the word occurrences. with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5 for line in source_file: if file_byte_budget <= 0: break diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 4bb1c875d..ff284bcc6 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -53,7 +53,7 @@ # '\u' is converted to '_' # '\\' is converted to '\' # '\213;' is converted to unichr(213) -_UNESCAPE_REGEX = re.compile(ur"\\u|\\\\|\\([0-9]+);") +_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") _ESCAPE_CHARS = set(u"\\_;0123456789") @@ -219,7 +219,7 @@ def _escape_token(token, alphabet): token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") ret = [ - c if c in alphabet and c != u"\n" else ur"\%d;" % ord(c) + c if c in alphabet and c != u"\n" else r"\%d;" % ord(c) for c in token] return u"".join(ret) + "_" diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 97b191096..7fde9b3b4 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -404,14 +404,15 @@ def _compile_data(tmp_dir, datasets, filename): generator_utils.maybe_download(tmp_dir, compressed_filename, url) if not (os.path.exists(lang1_filepath) and os.path.exists(lang2_filepath)): - mode = "r:gz" if "gz" in compressed_filepath else "r" + # For .tar.gz and .tgz files, we read compressed. 
+ mode = "r:gz" if compressed_filepath.endswith("gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: corpus_tar.extractall(tmp_dir) - if ".gz" in lang1_filepath: + if lang1_filepath.endswith(".gz"): new_filepath = lang1_filepath.strip(".gz") generator_utils.gunzip_file(lang1_filepath, new_filepath) lang1_filepath = new_filepath - if ".gz" in lang2_filepath: + if lang2_filepath.endswith(".gz"): new_filepath = lang2_filepath.strip(".gz") generator_utils.gunzip_file(lang2_filepath, new_filepath) lang2_filepath = new_filepath diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index ae9ce3882..4435707cd 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -59,6 +59,7 @@ def padded_accuracy_topk(predictions, effective_k = tf.minimum(k, tf.shape(padded_predictions)[-1]) _, outputs = tf.nn.top_k(padded_predictions, k=effective_k) outputs = tf.to_int32(outputs) + padded_labels = tf.to_int32(padded_labels) padded_labels = tf.expand_dims(padded_labels, axis=-1) padded_labels += tf.zeros_like(outputs) # Pad to same shape. same = tf.to_float(tf.equal(outputs, padded_labels)) @@ -82,6 +83,7 @@ def padded_sequence_accuracy(predictions, predictions, labels) weights = weights_fn(padded_labels) outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1)) + padded_labels = tf.to_int32(padded_labels) not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights axis = list(range(1, len(outputs.get_shape()))) correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) @@ -106,6 +108,7 @@ def padded_accuracy(predictions, predictions, labels) weights = weights_fn(padded_labels) outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1)) + padded_labels = tf.to_int32(padded_labels) return tf.to_float(tf.equal(outputs, padded_labels)), weights From 8ad79b60d29bef80c7724f5d5d5dfa0ff2ff8cab Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 16:39:45 -0700 Subject: [PATCH 3/6] Move examples reading to new Datasets API PiperOrigin-RevId: 163407588 --- .../data_generators/gene_expression.py | 4 +- tensor2tensor/utils/data_reader.py | 90 +++++++++---------- tensor2tensor/utils/data_reader_test.py | 8 +- tensor2tensor/utils/trainer_utils.py | 35 ++++---- 4 files changed, 64 insertions(+), 73 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 1bb9d4ab3..60e38a90f 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -168,8 +168,8 @@ def preprocess_examples(self, examples, mode): # Reshape targets examples["targets"] = tf.reshape(examples["targets"], - [-1, 1, self.num_output_predictions]) - examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1]) + [-1, self.num_output_predictions]) + examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1]) # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. # Add epsilon because some unmasked labels are actually 0. 
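
An aside on the tf.to_int32 casts added to the metrics in PATCH 2/6: tf.argmax returns int64 indices, which the metrics immediately cast to int32, while labels frequently arrive as int64 (the Example features are declared int64), so tf.equal can fail with a dtype mismatch unless the labels get the same cast. The snippet below is a minimal standalone sketch of that pattern, not code from the repository; the tensor shapes and values are made up for illustration (TF 1.x API).

    import tensorflow as tf

    # Hypothetical shapes: [batch, length, vocab] predictions, [batch, length] labels.
    predictions = tf.random_uniform([2, 5, 7])
    labels = tf.to_int64(tf.zeros([2, 5]))  # labels commonly arrive as int64

    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))  # argmax yields int64
    labels = tf.to_int32(labels)  # the matching cast added in PATCH 2/6
    accuracy = tf.reduce_mean(tf.to_float(tf.equal(outputs, labels)))

    with tf.Session() as sess:
      print(sess.run(accuracy))
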
diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 24dd31485..e78e22344 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -20,6 +20,7 @@ import math import os +import random # Dependency imports @@ -33,19 +34,15 @@ import tensorflow as tf -def examples_queue(data_sources, - data_fields_to_features, - training, - capacity=32, - data_items_to_decoders=None, - data_items_to_decode=None): - """Contruct a queue of training or evaluation examples. +def examples_reader(data_sources, + data_fields_to_features, + training, + capacity=32, + data_items_to_decoders=None, + data_items_to_decode=None): + """Reads Examples from data_sources and decodes to Tensors. - This function will create a reader from files given by data_sources, - then enqueue the tf.Examples from these files, shuffling if training - is true, and finally parse these tf.Examples to tensors. - - The dictionary data_fields_to_features for an image dataset can be this: + The dictionary data_fields_to_features for an image dataset can be: data_fields_to_features = { 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), @@ -54,7 +51,7 @@ def examples_queue(data_sources, [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), } - and for a simple algorithmic dataset with variable-length data it is this: + and for a simple algorithmic dataset with variable-length data it is: data_fields_to_features = { 'inputs': tf.VarLenFeature(tf.int64), @@ -63,7 +60,7 @@ def examples_queue(data_sources, The data_items_to_decoders dictionary argument can be left as None if there is no decoding to be performed. But, e.g. for images, it should be set so that - the images are decoded from the features, e.g., like this for MNIST: + the images are decoded from the features, e.g., for MNIST: data_items_to_decoders = { 'image': tfexample_decoder.Image( @@ -83,7 +80,7 @@ def examples_queue(data_sources, data_fields_to_features: a dictionary from data fields in the data sources to features, such as tf.VarLenFeature(tf.int64), see above for examples. training: a Boolean, whether to read for training or evaluation. - capacity: integer, queue capacity; set to 2 * max_batch_size or more. + capacity: integer, buffer capacity; set to 2 * max_batch_size or more. data_items_to_decoders: a dictionary mapping data items (that will be in the returned result) to decoders that will decode them using features defined in data_fields_to_features; see above for examples. By default @@ -93,43 +90,40 @@ def examples_queue(data_sources, Returns: A dictionary mapping each data_field to a corresponding 1D int64 tensor - read from the created queue. - - Raises: - ValueError: if no files are found with the provided data_prefix or no data - fields were provided. + read from the created Dataset. """ - with tf.name_scope("examples_queue"): - # Read serialized examples using slim parallel_reader. 
- num_epochs = None if training else 1 - data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - num_readers = min(4 if training else 1, len(data_files)) - _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( - data_sources, - tf.TFRecordReader, - num_epochs=num_epochs, - shuffle=training, - capacity=2 * capacity, - min_after_dequeue=capacity, - num_readers=num_readers) - - if data_items_to_decoders is None: - data_items_to_decoders = { + + def decode_record(record): + """Serialized Example to dict of .""" + example_serialized = record + item_decoders = data_items_to_decoders + if item_decoders is None: + item_decoders = { field: tf.contrib.slim.tfexample_decoder.Tensor(field) for field in data_fields_to_features } decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( - data_fields_to_features, data_items_to_decoders) + data_fields_to_features, item_decoders) - if data_items_to_decode is None: - data_items_to_decode = list(data_items_to_decoders) + decode_items = data_items_to_decode + if decode_items is None: + decode_items = list(item_decoders) - decoded = decoder.decode(example_serialized, items=data_items_to_decode) - return { - field: tensor - for (field, tensor) in zip(data_items_to_decode, decoded) - } + decoded = decoder.decode(example_serialized, items=decode_items) + return dict(zip(decode_items, decoded)) + + with tf.name_scope("examples_in"): + data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) + random.shuffle(data_files) + dataset = tf.contrib.data.TFRecordDataset(data_files) + num_readers = min(4 if training else 1, len(data_files)) + dataset = dataset.map(decode_record, num_threads=num_readers) + if training: + dataset = dataset.shuffle(capacity) + dataset = dataset.repeat(None if training else 1) + it = dataset.make_one_shot_iterator() + return it.get_next() def preprocessing(examples, data_file_pattern, mode): @@ -193,7 +187,7 @@ def problem_input_pipeline(problem, data_file_pattern, capacity, mode): return feature_placeholders(data_fields) # Now the non-trivial case construction. - examples = examples_queue( + examples = examples_reader( [data_file_pattern], data_fields, training=(mode == tf.contrib.learn.ModeKeys.TRAIN), @@ -278,7 +272,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): return feature_placeholders(data_fields) # Now the non-trivial case construction. - examples = examples_queue( + examples = examples_reader( [data_file_pattern], data_fields, training=(mode == tf.contrib.learn.ModeKeys.TRAIN), @@ -296,7 +290,7 @@ def batch_examples(examples, batching_scheme): """Given a queue of examples, create batches of examples with similar lengths. We assume that examples is a dictionary with string keys and tensor values, - possibly coming from a queue, e.g., constructed by examples_queue above. + possibly coming from a queue, e.g., constructed by examples_reader above. Each tensor in examples is assumed to be 1D. We will put tensors of similar length into batches togeter. We return a dictionary with the same keys as examples, and with values being batches of size batch_size. 
If elements have @@ -407,7 +401,7 @@ def constant_batching_scheme(constant_batch_size_in_sequences): } -def get_datasets(problems, data_dir, mode): +def get_data_filepatterns(problems, data_dir, mode): """Return the location of a dataset for a given mode.""" datasets = [] for problem in problems.split("-"): diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index f0c318e7b..ea98da06d 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -50,13 +50,13 @@ def test_generator(): generator_utils.generate_files(test_generator(), filenames) self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001")) - examples_train = data_reader.examples_queue( + examples_train = data_reader.examples_reader( [tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, training=True) - examples_eval = data_reader.examples_queue( + examples_eval = data_reader.examples_reader( [tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64), @@ -103,12 +103,12 @@ def test_generator(): generator_utils.generate_files(test_generator(), filenames) self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001")) - examples_train = data_reader.examples_queue([tmp_file_path + "*"], { + examples_train = data_reader.examples_reader([tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, True) batch_train = data_reader.batch_examples(examples_train, 4) - examples_eval = data_reader.examples_queue([tmp_file_path + "*"], { + examples_eval = data_reader.examples_reader([tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, False) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index bf105c5ae..260ec6a00 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -197,14 +197,14 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): train_input_fn = get_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, - data_file_patterns=get_datasets_for_mode(data_dir, + data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), num_datashards=num_datashards) eval_input_fn = get_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, - data_file_patterns=get_datasets_for_mode(data_dir, + data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), num_datashards=num_datashards) estimator = tf.contrib.learn.Estimator( @@ -626,7 +626,7 @@ def decode_from_dataset(estimator): inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) targets_vocab = hparams.problems[i].vocabulary["targets"] tf.logging.info("Performing local inference.") - infer_problems_data = get_datasets_for_mode(hparams.data_dir, + infer_problems_data = get_data_filepatterns(hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) infer_input_fn = get_input_fn( @@ -801,8 +801,8 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, } -def get_datasets_for_mode(data_dir, mode): - return data_reader.get_datasets(FLAGS.problems, data_dir, mode) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def _cond_on_index(fn, index_tensor, cur_idx, max_idx): @@ -1075,42 +1075,39 @@ def input_fn(): ValueError: if one of the parameters has an 
unsupported value. """ problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_queues"): + with tf.name_scope("input_reader"): for n in xrange(problem_count): if fixed_problem is not None and n != fixed_problem: continue problem_instance = hparams.problem_instances[n] + p_hparams = hparams.problems[n] with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input queues are on CPU. - capacity = hparams.problems[n].max_expected_batch_size_per_shard + with tf.device("/cpu:0"): # Input reading on CPU + capacity = p_hparams.max_expected_batch_size_per_shard capacity *= num_datashards examples = data_reader.input_pipeline( problem_instance, data_file_patterns[n], capacity, mode) - if mode == tf.contrib.learn.ModeKeys.TRAIN: - drop_long_sequences = True - else: - drop_long_sequences = hparams.eval_drop_long_sequences - batch_size_multiplier = hparams.problems[n].batch_size_multiplier feature_map = data_reader.batch_examples( examples, data_reader.hparams_to_batching_scheme( hparams, shard_multiplier=num_datashards, - drop_long_sequences=drop_long_sequences, - length_multiplier=batch_size_multiplier)) + drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + or hparams.eval_drop_long_sequences), + length_multiplier=(p_hparams.batch_size_multiplier))) # Reverse inputs and targets features if the problem was reversed. if problem_instance is not None: problem_instance.maybe_reverse_features(feature_map) problem_instance.maybe_copy_features(feature_map) else: - if hparams.problems[n].was_reversed: + if p_hparams.was_reversed: inputs = feature_map["inputs"] targets = feature_map["targets"] feature_map["inputs"] = targets feature_map["targets"] = inputs # Use the inputs as the targets if the problem is a copy problem. - if hparams.problems[n].was_copy: + if p_hparams.was_copy: feature_map["targets"] = feature_map["inputs"] # Ensure inputs and targets are proper rank. @@ -1122,8 +1119,8 @@ def input_fn(): batches.append( (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(hparams.problems[n].input_space_id), - tf.constant(hparams.problems[n].target_space_id))) + tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id))) # We choose which problem to process. loss_moving_avgs = [] # Need loss moving averages for that. From daea72a7d81b8d1559bfdc5202a7cb237ac17d0e Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 27 Jul 2017 18:18:05 -0700 Subject: [PATCH 4/6] Make a generic Text2TextProblem class, use in WMT, move PTB. 
PiperOrigin-RevId: 163417898 --- tensor2tensor/bin/t2t-datagen | 7 -- tensor2tensor/data_generators/problem.py | 114 ++++++++++++++++-- .../data_generators/problem_hparams.py | 16 --- tensor2tensor/data_generators/ptb.py | 103 ++++++++++------ tensor2tensor/data_generators/wmt.py | 79 +----------- tensor2tensor/utils/data_reader.py | 3 +- 6 files changed, 177 insertions(+), 145 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index e4acb6731..1f876c981 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -43,7 +43,6 @@ from tensor2tensor.data_generators import audio from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import image from tensor2tensor.data_generators import lm1b -from tensor2tensor.data_generators import ptb from tensor2tensor.data_generators import snli from tensor2tensor.data_generators import wiki from tensor2tensor.data_generators import wmt @@ -176,12 +175,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: audio.timit_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 626, vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)), - "lmptb_10k": ( - lambda: ptb.train_generator( - FLAGS.tmp_dir, - FLAGS.data_dir, - False), - ptb.valid_generator), } # pylint: enable=g-long-lambda diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 67e3c6f90..9623791f5 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -18,11 +18,14 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports -from tensor2tensor.data_generators import generator_utils as utils +from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry import tensorflow as tf @@ -176,20 +179,23 @@ def eval_metrics(self): def training_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.train_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.train_data_filenames( + file_basename, data_dir, num_shards) def dev_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.dev_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.dev_data_filenames( + file_basename, data_dir, num_shards) def test_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.test_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.test_data_filenames( + file_basename, data_dir, num_shards) def __init__(self, was_reversed=False, was_copy=False): """Create a Problem. @@ -323,3 +329,97 @@ def _default_hparams(): # class. 
input_space_id=SpaceID.GENERIC, target_space_id=SpaceID.GENERIC) + + +class Text2TextProblem(Problem): + """Base class for text-to-text problems.""" + + @property + def is_character_level(self): + raise NotImplementedError() + + @property + def targeted_vocab_size(self): + raise NotImplementedError() # Not needed if self.is_character_level. + + def train_generator(self, data_dir, tmp_dir, is_training): + """Generator of the training data.""" + raise NotImplementedError() + + def dev_generator(self, data_dir, tmp_dir): + """Generator of the development data.""" + return self.train_generator(data_dir, tmp_dir, False) + + @property + def input_space_id(self): + raise NotImplementedError() + + @property + def target_space_id(self): + raise NotImplementedError() + + @property + def num_shards(self): + raise NotImplementedError() + + @property + def vocab_name(self): + raise NotImplementedError() + + @property + def vocab_file(self): + return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) + + @property + def use_subword_tokenizer(self): + raise NotImplementedError() + + @property + def has_inputs(self): + return True # Set to False for language models. + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def feature_encoders(self, data_dir): + vocab_filename = os.path.join(data_dir, self.vocab_file) + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder(), + elif self.use_subword_tokenizer: + encoder = text_encoder.SubwordTextEncoder(vocab_filename) + else: + encoder = text_encoder.TokenTextEncoder(vocab_filename) + if self.has_inputs: + return {"inputs": encoder, "targets": encoder} + return {"targets": encoder} + + def hparams(self, defaults, unused_model_hparams): + p = defaults + if self.is_character_level: + source_vocab_size = 256 + target_vocab_size = 256 + else: + target_vocab_size = self._encoders["targets"].vocab_size + if self.has_inputs: + source_vocab_size = self._encoders["inputs"].vocab_size + + if self.has_inputs: + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, + source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) + if self.has_inputs: + p.input_space_id = self.input_space_id + p.target_space_id = self.target_space_id + if self.is_character_level: + p.loss_multiplier = 2.0 + + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 2792c79e9..3c829eeac 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -368,21 +368,6 @@ def wiki_32k(model_hparams): return p -def lmptb_10k(model_hparams): - """Penn Tree Bank language-modeling benchmark, 10k token vocabulary.""" - p = default_problem_hparams() - p.input_modality = {} - p.target_modality = (registry.Modalities.SYMBOL, 10000) - vocabulary = text_encoder.TokenTextEncoder( - os.path.join(model_hparams.data_dir, "lmptb_10k.vocab")) - p.vocabulary = { - "targets": vocabulary, - } - p.input_space_id = 3 - p.target_space_id = 3 - return p - - def wmt_ende_bpe32k(model_hparams): """English to 
German translation benchmark.""" p = default_problem_hparams() @@ -642,7 +627,6 @@ def image_celeba(unused_model_hparams): "lm1b_characters": lm1b_characters, "lm1b_32k": lm1b_32k, "wiki_32k": wiki_32k, - "lmptb_10k": lmptb_10k, "ice_parsing_characters": wmt_parsing_characters, "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index f71f0d902..18aedd640 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -27,7 +27,9 @@ # Dependency imports from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import registry import tensorflow as tf @@ -48,7 +50,7 @@ def _read_words(filename): def _build_vocab(filename, vocab_path, vocab_size): """Reads a file to build a vocabulary of `vocab_size` most common words. - The vocabulary is sorted by occurence count and has one word per line. + The vocabulary is sorted by occurrence count and has one word per line. Originally from: https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py @@ -66,26 +68,47 @@ def _build_vocab(filename, vocab_path, vocab_size): f.write("\n".join(words)) -def _get_token_encoder(vocab_dir, filename): +def _get_token_encoder(vocab_dir, vocab_name, filename): """Reads from file and returns a `TokenTextEncoder` for the vocabulary.""" - vocab_name = "lmptb_10k.vocab" vocab_path = os.path.join(vocab_dir, vocab_name) - _build_vocab(filename, vocab_path, 10000) + if not tf.gfile.Exists(vocab_path): + _build_vocab(filename, vocab_path, 10000) return text_encoder.TokenTextEncoder(vocab_path) -class PTB(object): +class PTBProblem(problem.Text2TextProblem): """A class for generating PTB data.""" - def __init__(self, tmp_dir, data_dir, char=False): - assert not char, "char mode for PTB is not yet implemented" - self.char = char - self.data_dir = data_dir + @property + def has_inputs(self): + return False - url = PTB_URL - filename = os.path.basename(url) + @property + def target_space_id(self): + if self.is_character_level: + return problem.SpaceID.EN_CHR + return problem.SpaceID.EN_TOK + + @property + def num_shards(self): + return 10 + + @property + def vocab_name(self): + return "vocab.lmptb_10k" + + @property + def use_subword_tokenizer(self): + return False + + @property + def targeted_vocab_size(self): + return 10000 + + def train_generator(self, data_dir, tmp_dir, train): + filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( - tmp_dir, filename, url) + tmp_dir, filename, PTB_URL) ptb_files = [] ptb_char_files = [] with tarfile.open(compressed_filepath, "r:gz") as tgz: @@ -101,50 +124,52 @@ def __init__(self, tmp_dir, data_dir, char=False): tgz.extractall(tmp_dir, members=files) - if self.char: + if self.is_character_level: files = ptb_char_files else: files = ptb_files - files = files + train_file, valid_file = None, None for filename in files: if "train" in filename: - self.train = os.path.join(tmp_dir, filename) + train_file = os.path.join(tmp_dir, filename) elif "valid" in filename: - self.valid = os.path.join(tmp_dir, filename) + valid_file = os.path.join(tmp_dir, filename) - assert hasattr(self, "train"), "Training file not found" - assert hasattr(self, "valid"), "Validation file not found" - self.encoder = _get_token_encoder(data_dir, 
self.train) + assert train_file, "Training file not found" + assert valid_file, "Validation file not found" - def train_generator(self): - return self._generator(self.train) + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder() + else: + encoder = _get_token_encoder(data_dir, self.vocab_file, train_file) - def valid_generator(self): - return self._generator(self.valid) + if train: + return self._generator(train_file, encoder) + return self._generator(valid_file, encoder) - def _generator(self, filename): + def _generator(self, filename, encoder): with tf.gfile.GFile(filename, "r") as f: for line in f: line = " ".join(line.replace("\n", EOS).split()) - tok = self.encoder.encode(line) - yield {"inputs": tok[:-1], "targets": tok[1:]} + tok = encoder.encode(line) + if tok: + yield {"inputs": [0], "targets": tok} -# Using a object "singleton" -# `train_generator` must be called before -# `valid_generator` in order to work -_ptb = {} +@registry.register_problem("lm_ptb_10k") +class LmPtb10k(PTBProblem): + """A class for generating PTB data, 10k vocab.""" + @property + def is_character_level(self): + return False -def train_generator(*args, **kwargs): - """The train data generator to be called.""" - global _ptb - _ptb = PTB(*args, **kwargs) - return _ptb.train_generator() +@registry.register_problem +class LmPtbCharacters(PTBProblem): + """A class for generating PTB data, character-level.""" -def valid_generator(): - """Validation (aka. dev) data generator.""" - global _ptb # pylint:disable=global-variable-not-assigned - return _ptb.valid_generator() + @property + def is_character_level(self): + return True diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 7fde9b3b4..456f36321 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,7 +28,6 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import wsj_parsing -from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -44,33 +43,13 @@ EOS = text_encoder.EOS_ID -class WMTProblem(problem.Problem): +class WMTProblem(problem.Text2TextProblem): """Base class for WMT problems.""" @property def is_character_level(self): return False - @property - def targeted_vocab_size(self): - raise NotImplementedError() # Not needed if self.is_character_level. 
- - def train_generator(self, data_dir, tmp_dir, is_training): - """Generator of the training data.""" - raise NotImplementedError() - - def dev_generator(self, data_dir, tmp_dir): - """Generator of the development data.""" - return self.train_generator(data_dir, tmp_dir, False) - - @property - def input_space_id(self): - raise NotImplementedError() - - @property - def target_space_id(self): - raise NotImplementedError() - @property def num_shards(self): return 100 @@ -80,51 +59,8 @@ def vocab_name(self): return "vocab.endefr" @property - def vocab_file(self): - return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - - def generate_data(self, data_dir, tmp_dir, task_id=-1): - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, self.num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def feature_encoders(self, data_dir): - if self.is_character_level: - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - vocab_filename = os.path.join(data_dir, self.vocab_file) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } - - def hparams(self, defaults, unused_model_hparams): - p = defaults - if self.is_character_level: - source_vocab_size = 256 - target_vocab_size = 256 - else: - source_vocab_size = self._encoders["inputs"].vocab_size - target_vocab_size = self._encoders["targets"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, - source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) - p.input_space_id = self.input_space_id - p.target_space_id = self.target_space_id - if self.is_character_level: - p.loss_multiplier = 2.0 - - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, - metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU - ] + def use_subword_tokenizer(self): + return True # Generic generators used later for multiple problems. 
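
As a usage note for the Text2TextProblem and WMTProblem base classes introduced in this patch, the sketch below shows roughly what a new subclass has to declare. It is illustrative only and not part of the patch: the problem name, vocabulary size, and placeholder generator are hypothetical, and the space IDs simply reuse the English/Czech ones that appear elsewhere in this file.

    from tensor2tensor.data_generators import problem
    from tensor2tensor.data_generators import wmt
    from tensor2tensor.utils import registry


    @registry.register_problem("my_translate_tokens_8k")  # hypothetical name
    class MyTranslateTokens8k(wmt.WMTProblem):
      """Illustrative sketch of a Text2TextProblem subclass (not in the patch)."""

      @property
      def targeted_vocab_size(self):
        return 2**13  # hypothetical 8k subword vocabulary

      @property
      def input_space_id(self):
        return problem.SpaceID.EN_TOK

      @property
      def target_space_id(self):
        return problem.SpaceID.CS_TOK

      def train_generator(self, data_dir, tmp_dir, train):
        # A real problem would build or load a vocabulary here and yield
        # {"inputs": ..., "targets": ...} token-id dicts read from its corpora;
        # this placeholder emits a single fixed example.
        del data_dir, tmp_dir, train
        yield {"inputs": [2, 3, 4, 1], "targets": [5, 6, 7, 1]}

num_shards, vocab_name, is_character_level and use_subword_tokenizer come from WMTProblem, while generate_data, feature_encoders, hparams and eval_metrics come from Text2TextProblem, so a subclass only declares what is specific to its dataset.
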
@@ -634,7 +570,7 @@ def target_space_id(self): @registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(problem.Problem): +class WMTEnCsTokens32k(WMTProblem): """Problem spec for WMT English-Czech translation.""" @property @@ -665,13 +601,6 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.CS_TOK - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, - metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU - ] - @registry.register_problem("wmt_encs_characters") class WMTEnCsCharacters(WMTProblem): diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index e78e22344..454e4f321 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -115,7 +115,8 @@ def decode_record(record): with tf.name_scope("examples_in"): data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - random.shuffle(data_files) + if training: + random.shuffle(data_files) dataset = tf.contrib.data.TFRecordDataset(data_files) num_readers = min(4 if training else 1, len(data_files)) dataset = dataset.map(decode_record, num_threads=num_readers) From 01f245fdecdf9fbdfae8a610cf9246e222c0891a Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 18:21:21 -0700 Subject: [PATCH 5/6] v1.1.3 PiperOrigin-RevId: 163418167 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6be9aba04..ae028d847 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.2', + version='1.1.3', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From 7c072d7b77ada142bd577d01919a9be32900dd0c Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 19:04:00 -0700 Subject: [PATCH 6/6] Revert usage of Datasets API PiperOrigin-RevId: 163421122 --- tensor2tensor/utils/data_reader.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 454e4f321..ba5139433 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -20,7 +20,6 @@ import math import os -import random # Dependency imports @@ -114,17 +113,18 @@ def decode_record(record): return dict(zip(decode_items, decoded)) with tf.name_scope("examples_in"): + # Read serialized examples using slim parallel_reader. data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - if training: - random.shuffle(data_files) - dataset = tf.contrib.data.TFRecordDataset(data_files) num_readers = min(4 if training else 1, len(data_files)) - dataset = dataset.map(decode_record, num_threads=num_readers) - if training: - dataset = dataset.shuffle(capacity) - dataset = dataset.repeat(None if training else 1) - it = dataset.make_one_shot_iterator() - return it.get_next() + _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( + data_sources, + tf.TFRecordReader, + num_epochs=None if training else 1, + shuffle=training, + capacity=2 * capacity, + min_after_dequeue=capacity, + num_readers=num_readers) + return decode_record(example_serialized) def preprocessing(examples, data_file_pattern, mode):
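
PATCH 3/6 and PATCH 6/6 switch the same examples_reader interface between two implementations: a tf.contrib.data (Datasets API) pipeline and the original slim parallel_reader pipeline that this final commit restores. For reference, here is a minimal self-contained sketch in the style of the Datasets-API variant; the file pattern, feature spec, and function name are hypothetical, and the calls follow the TF 1.2-era tf.contrib.data API used in PATCH 3/6.

    import tensorflow as tf


    def tiny_examples_reader(file_pattern, training, capacity=32):
      """Sketch of a Datasets-API reader; the feature names are made up."""
      data_files = tf.gfile.Glob(file_pattern)

      def decode_record(record):
        # Parse a serialized tf.Example into a dict of (sparse) tensors.
        features = {
            "inputs": tf.VarLenFeature(tf.int64),
            "targets": tf.VarLenFeature(tf.int64),
        }
        return tf.parse_single_example(record, features)

      num_readers = min(4 if training else 1, max(1, len(data_files)))
      dataset = tf.contrib.data.TFRecordDataset(data_files)
      dataset = dataset.map(decode_record, num_threads=num_readers)
      if training:
        dataset = dataset.shuffle(capacity)
      dataset = dataset.repeat(None if training else 1)
      return dataset.make_one_shot_iterator().get_next()

The revert keeps the decode_record helper and feeds it the serialized example coming out of parallel_read, so callers of examples_reader receive the same dictionary of decoded tensors with either backend.
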