diff --git a/.gitignore b/.gitignore index fbd98dca5..c9dd3db88 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,6 @@ _pycache__/ # PyPI distribution artifacts. build/ dist/ -data/ # Sublime project files *.sublime-project diff --git a/setup.py b/setup.py index ae028d847..6f509d03e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.3', + version='1.1.4', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 1f876c981..837d6d203 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -118,40 +118,9 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: wiki.generator(FLAGS.tmp_dir, True), 1000 ), - "image_mnist_tune": ( - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 55000), - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 5000, 55000)), - "image_mnist_test": ( - lambda: image.mnist_generator(FLAGS.tmp_dir, True, 60000), - lambda: image.mnist_generator(FLAGS.tmp_dir, False, 10000)), - "image_cifar10_tune": ( - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 48000), - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 2000, 48000)), - "image_cifar10_test": ( - lambda: image.cifar10_generator(FLAGS.tmp_dir, True, 50000), - lambda: image.cifar10_generator(FLAGS.tmp_dir, False, 10000)), - "image_mscoco_characters_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000)), "image_celeba_tune": ( lambda: image.celeba_generator(FLAGS.tmp_dir, 162770), lambda: image.celeba_generator(FLAGS.tmp_dir, 19867, 162770)), - "image_mscoco_tokens_8k_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13)), - "image_mscoco_tokens_32k_test": ( - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 80000, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15), - lambda: image.mscoco_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 40000, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)), "snli_32k": ( lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md index 310bc39df..0e6d64dd2 100644 --- a/tensor2tensor/data_generators/README.md +++ b/tensor2tensor/data_generators/README.md @@ -28,7 +28,7 @@ for an example. `Problem`s support data generation, training, and decoding. -Data generation is handles by `Problem.generate_data` which should produce 2 +Data generation is handled by `Problem.generate_data` which should produce 2 datasets, training and dev, which should be named according to `Problem.training_filepaths` and `Problem.dev_filepaths`. 
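For illustration only (not part of this patch), a minimal sketch of how a `Problem` subclass might wire `generate_data` to those filepath helpers; the problem class, shard counts, and toy generator below are hypothetical:

```python
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.utils import registry


@registry.register_problem
class MyToyProblem(problem.Problem):
  """Hypothetical example problem emitting {"inputs", "targets"} dicts."""

  def _toy_generator(self, how_many):
    # Stand-in for real data reading; yields integer-id feature dicts.
    for i in range(how_many):
      yield {"inputs": [i % 10 + 2], "targets": [(i + 1) % 10 + 2]}

  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    # Write train/dev shards under the names given by training_filepaths
    # and dev_filepaths, then shuffle, as the built-in problems do.
    generator_utils.generate_dataset_and_shuffle(
        self._toy_generator(100000),
        self.training_filepaths(data_dir, 10, shuffled=False),
        self._toy_generator(10000),
        self.dev_filepaths(data_dir, 1, shuffled=False))
```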
`Problem.generate_data` should also produce any other files that may be required diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 60e38a90f..82c15414a 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -163,8 +163,9 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode): + def preprocess_examples(self, examples, mode, hparams): del mode + del hparams # Reshape targets examples["targets"] = tf.reshape(examples["targets"], diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 5c7f9f2a1..b38531c1a 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -28,6 +28,7 @@ # Dependency imports +import requests import six from six.moves import xrange # pylint: disable=redefined-builtin import six.moves.urllib_request as urllib # Imports urllib on Python2, urllib.request on Python3 @@ -196,6 +197,56 @@ def maybe_download(directory, filename, url): return filepath +def maybe_download_from_drive(directory, filename, url): + """Download filename from google drive unless it's already in directory. + + Args: + directory: path to the directory that will be used. + filename: name of the file to download to (do nothing if it already exists). + url: URL to download from. + + Returns: + The path to the downloaded file. + """ + if not tf.gfile.Exists(directory): + tf.logging.info("Creating directory %s" % directory) + os.mkdir(directory) + filepath = os.path.join(directory, filename) + confirm_token = None + if tf.gfile.Exists(filepath): + tf.logging.info("Not downloading, file already found: %s" % filepath) + return filepath + + # Since the file is big, drive will scan it for virus and take it to a + # warning page. We find the confirm token on this page and append it to the + # URL to start the download process. + confirm_token = None + session = requests.Session() + response = session.get(url, stream=True) + for k, v in response.cookies.items(): + if k.startswith("download_warning"): + confirm_token = v + + if confirm_token: + url = url + "&confirm=" + confirm_token + tf.logging.info("Downloading %s to %s" % (url, filepath)) + + response = session.get(url, stream=True) + # Now begin the download. + chunk_size = 16 * 1024 + with open(filepath, "wb") as f: + for chunk in response.iter_content(chunk_size): + if chunk: + f.write(chunk) + + # Print newline to clear the carriage return from the download progress + print() + statinfo = os.stat(filepath) + tf.logging.info("Succesfully downloaded %s, %s bytes." % (filename, + statinfo.st_size)) + return filepath + + def gunzip_file(gz_path, new_path): """Unzips from gz_path into new_path. diff --git a/tensor2tensor/data_generators/generator_utils_test.py b/tensor2tensor/data_generators/generator_utils_test.py index fd6e15ca3..144507e6b 100644 --- a/tensor2tensor/data_generators/generator_utils_test.py +++ b/tensor2tensor/data_generators/generator_utils_test.py @@ -64,6 +64,20 @@ def testMaybeDownload(self): os.remove(tmp_file_path + ".http") os.remove(tmp_file_path) + def testMaybeDownloadFromDrive(self): + tmp_dir = self.get_temp_dir() + (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) + tmp_file_name = os.path.basename(tmp_file_path) + + # Download Google index to the temporary file.http. 
+ res_path = generator_utils.maybe_download_from_drive( + tmp_dir, tmp_file_name + ".http", "http://drive.google.com") + self.assertEqual(res_path, tmp_file_path + ".http") + + # Clean up. + os.remove(tmp_file_path + ".http") + os.remove(tmp_file_path) + def testGunzipFile(self): tmp_dir = self.get_temp_dir() (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir) diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py index fdad8d432..a2e328f00 100644 --- a/tensor2tensor/data_generators/image.py +++ b/tensor2tensor/data_generators/image.py @@ -36,11 +36,189 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.models import common_layers from tensor2tensor.utils import registry import tensorflow as tf +class ImageProblem(problem.Problem): + + def example_reading_spec(self, label_key=None): + if label_key is None: + label_key = "image/class/label" + + data_fields = { + "image/encoded": tf.FixedLenFeature((), tf.string), + "image/format": tf.FixedLenFeature((), tf.string), + label_key: tf.VarLenFeature(tf.int64) + } + data_items_to_decoders = { + "inputs": + tf.contrib.slim.tfexample_decoder.Image( + image_key="image/encoded", + format_key="image/format", + channels=3), + "targets": + tf.contrib.slim.tfexample_decoder.Tensor(label_key), + } + + return data_fields, data_items_to_decoders + + +# French street names dataset. + + +@registry.register_problem +class ImageFSNS(ImageProblem): + """Problem spec for French Street Name recognition.""" + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" + "street/python/fsns_urls.txt") + fsns_urls = generator_utils.maybe_download( + tmp_dir, "fsns_urls.txt", list_url) + fsns_files = [f.strip() for f in open(fsns_urls, "r") + if f.startswith("http://")] + for url in fsns_files: + if "/train/train" in url: + generator_utils.maybe_download( + data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url) + elif "/validation/validation" in url: + generator_utils.maybe_download( + data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) + elif "charset" in url: + generator_utils.maybe_download( + data_dir, "charset_size134.txt", url) + + def feature_encoders(self, data_dir): + # This vocab file must be present within the data directory. 
+ vocab_filename = os.path.join(data_dir, "charset_size134.txt") + return { + "inputs": text_encoder.TextEncoder(), + "targets": text_encoder.SubwordTextEncoder(vocab_filename) + } + + def hparams(self, defaults, model_hparams): + p = defaults + p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} + vocab_size = self._encoders["targets"].vocab_size + p.target_modality = (registry.Modalities.SYMBOL, vocab_size) + p.batch_size_multiplier = 256 + p.max_expected_batch_size_per_shard = 2 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = problem.SpaceID.EN_TOK + + def example_reading_spec(self): + label_key = "image/unpadded_label" + return super(ImageFSNS, self).example_reading_spec(self, + label_key=label_key) + + +class Image2ClassProblem(ImageProblem): + """Base class for image classification problems.""" + + @property + def is_small(self): + raise NotImplementedError() + + @property + def num_classes(self): + raise NotImplementedError() + + @property + def train_shards(self): + raise NotImplementedError() + + @property + def dev_shards(self): + return 1 + + def generator(self, data_dir, tmp_dir, is_training): + raise NotImplementedError() + + def hparams(self, defaults, model_hparams): + p = defaults + small_modality = "%s:small_image_modality" % registry.Modalities.IMAGE + modality = small_modality if self.is_small else registry.Modalities.IMAGE + p.input_modality = {"inputs": (modality, None)} + p.target_modality = (registry.Modalities.CLASS_LABEL, self.num_classes) + p.batch_size_multiplier = 4 if self.is_small else 256 + p.max_expected_batch_size_per_shard = 8 if self.is_small else 2 + p.loss_multiplier = 3.0 if self.is_small else 1.0 + if self._was_reversed: + p.loss_multiplier = 1.0 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = problem.SpaceID.IMAGE_LABEL + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.train_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)) + + +def imagenet_preprocess_examples(examples, mode): + """Preprocessing used for Imagenet and similar problems.""" + def preprocess(img): + img = tf.image.resize_images(img, [360, 360]) + img = common_layers.image_augmentation(tf.to_float(img) / 255.) + return tf.to_int64(img * 255.) + + def resize(img): + return tf.to_int64(tf.image.resize_images(img, [299, 299])) + + inputs = tf.cast(examples["inputs"], tf.int64) + if mode == tf.contrib.learn.ModeKeys.TRAIN: + examples["inputs"] = tf.cond( # Preprocess 90% of the time. + tf.less(tf.random_uniform([]), 0.9), + lambda img=inputs: preprocess(img), + lambda img=inputs: resize(img)) + else: + examples["inputs"] = resize(inputs) + return examples + + +@registry.register_problem +class ImageImagenet(Image2ClassProblem): + """Imagenet.""" + + @property + def is_small(self): + return False + + @property + def num_classes(self): + return 1000 + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + # TODO(lukaszkaiser): find a better way than printing this. 
+ print("To generate the ImageNet dataset in the proper format, follow " + "instructions at https://github.com/tensorflow/models/blob/master" + "/inception/README.md#getting-started") + + def preprocess_examples(self, examples, mode): + return imagenet_preprocess_examples(examples, mode) + + +@registry.register_problem +class ImageImagenet32(Image2ClassProblem): + """Imagenet rescaled to 32x32.""" + + def dataset_filename(self): + return "image_imagenet" # Reuse Imagenet data. + + @property + def is_small(self): + return True # Modalities like for CIFAR. + + def preprocess_examples(self, examples, mode): + examples = imagenet_preprocess_examples(examples, mode) + examples["inputs"] = tf.to_int64(tf.image.resize_images( + examples["inputs"], [32, 32])) + + def image_generator(images, labels): """Generator for images that takes image and labels lists and creates pngs. @@ -158,6 +336,39 @@ def mnist_generator(tmp_dir, training, how_many, start_from=0): labels[start_from:start_from + how_many]) +@registry.register_problem +class ImageMnistTune(Image2ClassProblem): + """MNIST, tuning data.""" + + @property + def is_small(self): + return True + + @property + def num_classes(self): + return 10 + + @property + def train_shards(self): + return 10 + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return mnist_generator(tmp_dir, True, 55000) + else: + return mnist_generator(tmp_dir, True, 5000, 55000) + + +@registry.register_problem +class ImageMnist(ImageMnistTune): + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return mnist_generator(tmp_dir, True, 60000) + else: + return mnist_generator(tmp_dir, False, 10000) + + # URLs and filenames for CIFAR data. _CIFAR10_URL = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" _CIFAR10_PREFIX = "cifar-10-batches-py/" @@ -208,6 +419,39 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): all_labels[start_from:start_from + how_many]) +@registry.register_problem +class ImageCifar10Tune(ImageMnistTune): + + def preprocess_examples(self, examples, mode): + if mode == tf.contrib.learn.ModeKeys.TRAIN: + examples["inputs"] = common_layers.cifar_image_augmentation( + examples["inputs"]) + return examples + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar10_generator(tmp_dir, True, 48000) + else: + return cifar10_generator(tmp_dir, True, 2000, 48000) + + +@registry.register_problem +class ImageCifar10(ImageCifar10Tune): + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar10_generator(tmp_dir, True, 50000) + else: + return cifar10_generator(tmp_dir, False, 10000) + + +@registry.register_problem +class ImageCifar10Plain(ImageCifar10): + + def preprocess_examples(self, examples, mode): + return examples + + # URLs and filenames for MSCOCO data. 
_MSCOCO_ROOT_URL = "http://msvocds.blob.core.windows.net/" _MSCOCO_URLS = [ @@ -308,90 +552,148 @@ def mscoco_generator(data_dir, } -class ImageProblem(problem.Problem): +class Image2TextProblem(ImageProblem): + """Base class for image-to-text problems.""" - def example_reading_spec(self, label_key=None): - if label_key is None: - label_key = "image/class/label" + @property + def is_character_level(self): + raise NotImplementedError() - data_fields = { - "image/encoded": tf.FixedLenFeature((), tf.string), - "image/format": tf.FixedLenFeature((), tf.string), - label_key: tf.VarLenFeature(tf.int64) - } - data_items_to_decoders = { - "inputs": - tf.contrib.slim.tfexample_decoder.Image( - image_key="image/encoded", - format_key="image/format", - channels=3), - "targets": - tf.contrib.slim.tfexample_decoder.Tensor(label_key), - } + @property + def targeted_vocab_size(self): + raise NotImplementedError() # Not needed if self.is_character_level. - return data_fields, data_items_to_decoders + @property + def target_space_id(self): + raise NotImplementedError() -# French street names dataset. + @property + def train_shards(self): + raise NotImplementedError() + @property + def dev_shards(self): + raise NotImplementedError() -@registry.register_problem -class ImageFSNS(ImageProblem): - """Problem spec for French Street Name recognition.""" + def generator(self, data_dir, tmp_dir, is_training): + raise NotImplementedError() - def generate_data(self, data_dir, tmp_dir, task_id=-1): - list_url = ("https://raw.githubusercontent.com/tensorflow/models/master/" - "street/python/fsns_urls.txt") - fsns_urls = generator_utils.maybe_download( - tmp_dir, "fsns_urls.txt", list_url) - fsns_files = [f.strip() for f in open(fsns_urls, "r") - if f.startswith("http://")] - for url in fsns_files: - if "/train/train" in url: - generator_utils.maybe_download( - data_dir, "image_fsns-train" + url[-len("-00100-of-00512"):], url) - elif "/validation/validation" in url: - generator_utils.maybe_download( - data_dir, "image_fsns-dev" + url[-len("-00100-of-00512"):], url) - elif "charset" in url: - generator_utils.maybe_download( - data_dir, "charset_size134.txt", url) + def feature_encoders(self, data_dir): + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder() + else: + vocab_filename = os.path.join( + data_dir, "vocab.endefr.%d" % self.targeted_vocab_size) + encoder = text_encoder.SubwordTextEncoder(vocab_filename) + return {"targets": encoder} def hparams(self, defaults, model_hparams): p = defaults p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - # This vocab file must be present within the data directory. 
- vocab_filename = os.path.join(model_hparams.data_dir, "charset_size134.txt") - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - p.target_modality = (registry.Modalities.SYMBOL, subtokenizer.vocab_size) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": subtokenizer, - } + encoder = self._encoders["targets"] + p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) p.batch_size_multiplier = 256 p.max_expected_batch_size_per_shard = 2 - vocab_size = 144 - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, vocab_size) - p.input_space_id = problem.SpaceID.DIGIT_0 - p.target_space_id = problem.SpaceID.DIGIT_1 + p.loss_multiplier = 1.0 + p.input_space_id = problem.SpaceID.IMAGE + p.target_space_id = self.target_space_id + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.train_shards, shuffled=False), + self.generator(data_dir, tmp_dir, False), + self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)) + + +@registry.register_problem +class ImageMsCocoCharacters(Image2TextProblem): + """MSCOCO, character level.""" + + @property + def is_character_level(self): + return True + + @property + def target_space_id(self): + return problem.SpaceID.EN_CHR + + @property + def train_shards(self): + return 100 + + @property + def dev_shards(self): + return 10 + + def preprocess_examples(self, examples, mode): + return imagenet_preprocess_examples(examples, mode) + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return mscoco_generator(data_dir, tmp_dir, True, 80000) + else: + return mscoco_generator(data_dir, tmp_dir, False, 40000) + raise NotImplementedError() + + +@registry.register_problem +class ImageMsCocoTokens8k(ImageMsCocoCharacters): + """MSCOCO, 8k tokens vocab.""" + + @property + def is_character_level(self): + return False + + @property + def targeted_vocab_size(self): + return 2**13 # 8192 + + @property + def target_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def train_shards(self): + return 100 + + @property + def dev_shards(self): + return 10 + + def generator(self, data_dir, tmp_dir, is_training): + vocab_filename = "vocab.endefr.%d" % self.targeted_vocab_size + if is_training: + return mscoco_generator( + data_dir, tmp_dir, True, 80000, + vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + else: + return mscoco_generator( + data_dir, tmp_dir, False, 40000, + vocab_filename=vocab_filename, vocab_size=self.targeted_vocab_size) + + +@registry.register_problem +class ImageMsCocoTokens32k(ImageMsCocoTokens8k): + """MSCOCO, 32k tokens vocab.""" + + @property + def targeted_vocab_size(self): + return 2**15 # 32768 - def example_reading_spec(self): - label_key = "image/unpadded_label" - return super(ImageFSNS, self).example_reading_spec(self, - label_key=label_key) -# Filename for CELEBA data. +# URL and filename for CELEBA data. 
_CELEBA_NAME = "img_align_celeba" +_CELEBA_URL = "https://drive.google.com/uc?export=download&id=0B7EVK8r0v71pZjFTYXZWM3FlRnM" def _get_celeba(directory): """Download and extract CELEBA to directory unless it is there.""" - path = os.path.join(directory, _CELEBA_NAME) + # path = os.path.join(directory, _CELEBA_NAME) + path = generator_utils.maybe_download_from_drive(directory, + _CELEBA_NAME, _CELEBA_URL) if not tf.gfile.Exists(path): - # We expect that this file has been downloaded from: - # https://drive.google.com/uc?export=download&id=0B7EVK8r0v71pZjFTYXZWM3FlRnM - # and placed in `directory`. - zipfile.ZipFile(path+".zip", "r").extractall(directory) + zipfile.ZipFile(path + ".zip", "r").extractall(directory) def celeba_generator(tmp_dir, how_many, start_from=0): @@ -408,7 +710,7 @@ def celeba_generator(tmp_dir, how_many, start_from=0): * image/format: the string "jpeg" representing image format, """ _get_celeba(tmp_dir) - image_files = tf.gfile.Glob(tmp_dir + "/*.jpg") + image_files = tf.gfile.Glob(os.path.join(tmp_dir, _CELEBA_NAME) + "/*.jpg") for filename in image_files[start_from:start_from+how_many]: with tf.gfile.Open(filename, "r") as f: encoded_image_data = f.read() diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 9623791f5..3d30ec239 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -82,6 +82,8 @@ class SpaceID(object): DNA = 23 # Real numbers REAL = 24 + # Images + IMAGE = 25 class Problem(object): @@ -162,8 +164,12 @@ def example_reading_spec(self): data_items_to_decoders = None return (data_fields, data_items_to_decoders) - def preprocess_examples(self, examples, mode): + def preprocess_examples(self, examples, mode, hparams): del mode + if hparams.max_input_seq_length > 0: + examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] + if hparams.max_target_seq_length > 0: + examples["targets"] = examples["targets"][:hparams.max_target_seq_length] return examples def eval_metrics(self): @@ -230,9 +236,6 @@ def internal_hparams(self, model_hparams): if self._was_reversed: _reverse_problem_hparams(hp) - # TODO(rsepassi): Move this into the cifar10 Problem - if "image_cifar10" in self.name: - hp.loss_multiplier = 1. 
if self._was_copy: _copy_problem_hparams(hp) return hp @@ -386,12 +389,13 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): self.dev_filepaths(data_dir, 1, shuffled=False)) def feature_encoders(self, data_dir): - vocab_filename = os.path.join(data_dir, self.vocab_file) if self.is_character_level: - encoder = text_encoder.ByteTextEncoder(), + encoder = text_encoder.ByteTextEncoder() elif self.use_subword_tokenizer: + vocab_filename = os.path.join(data_dir, self.vocab_file) encoder = text_encoder.SubwordTextEncoder(vocab_filename) else: + vocab_filename = os.path.join(data_dir, self.vocab_file) encoder = text_encoder.TokenTextEncoder(vocab_filename) if self.has_inputs: return {"inputs": encoder, "targets": encoder} @@ -399,17 +403,12 @@ def feature_encoders(self, data_dir): def hparams(self, defaults, unused_model_hparams): p = defaults - if self.is_character_level: - source_vocab_size = 256 - target_vocab_size = 256 - else: - target_vocab_size = self._encoders["targets"].vocab_size - if self.has_inputs: - source_vocab_size = self._encoders["inputs"].vocab_size if self.has_inputs: + source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + target_vocab_size = self._encoders["targets"].vocab_size p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) if self.has_inputs: p.input_space_id = self.input_space_id diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 3c829eeac..607078d2f 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -48,8 +48,6 @@ def problem_hparams(problem_name, model_hparams): p = _lookup_problem_hparams_fn(base_name)(model_hparams) if was_reversed: _reverse_problem_hparams(p) - if "image_cifar10" in base_name: - p.loss_multiplier = 1. 
if was_copy: _copy_problem_hparams(p) return p @@ -509,86 +507,6 @@ def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): return p -def image_cifar10(unused_model_hparams): - """CIFAR-10.""" - p = default_problem_hparams() - p.input_modality = { - "inputs": ("%s:small_image_modality" % registry.Modalities.IMAGE, None) - } - p.target_modality = (registry.Modalities.CLASS_LABEL, 10) - p.batch_size_multiplier = 4 - p.max_expected_batch_size_per_shard = 8 - p.loss_multiplier = 3.0 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_mnist(unused_model_hparams): - """MNIST.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, 256)} - p.target_modality = (registry.Modalities.CLASS_LABEL, 10) - p.batch_size_multiplier = 4 - p.max_expected_batch_size_per_shard = 8 - p.loss_multiplier = 3.0 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_imagenet(model_hparams): - """ImageNet.""" - p = default_problem_hparams() - p.input_modality = { - "inputs": (registry.Modalities.IMAGE, None), - } - target_modality = ("%s:class_label_2d" % registry.Modalities.CLASS_LABEL - if model_hparams.imagenet_use_2d else - registry.Modalities.CLASS_LABEL) - p.target_modality = (target_modality, 1000) - p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 2 - p.loss_multiplier = 0.7 - p.input_space_id = 1 - p.target_space_id = 1 - return p - - -def image_mscoco_characters(unused_model_hparams): - """COCO image captioning with captions as characters.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - p.target_modality = (registry.Modalities.SYMBOL, 256) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - p.batch_size_multiplier = 128 - p.max_expected_batch_size_per_shard = 2 - p.loss_multiplier = 2.0 - p.input_space_id = 1 - p.target_space_id = 2 - return p - - -def image_mscoco_tokens(model_hparams, vocab_count): - """COCO image captioning with captions as tokens.""" - p = default_problem_hparams() - p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)} - # This vocab file must be present within the data directory. 
- vocab_filename = os.path.join(model_hparams.data_dir, - "vocab.endefr.%d" % vocab_count) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - p.target_modality = (registry.Modalities.SYMBOL, subtokenizer.vocab_size) - p.vocabulary = { - "inputs": text_encoder.TextEncoder(), - "targets": subtokenizer, - } - p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 2 - - def img2img_imagenet(unused_model_hparams): """Image 2 Image for imagenet dataset.""" p = default_problem_hparams() @@ -633,15 +551,6 @@ def image_celeba(unused_model_hparams): "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), "wmt_ende_bpe32k": wmt_ende_bpe32k, - "image_cifar10_tune": image_cifar10, - "image_cifar10_test": image_cifar10, - "image_mnist_tune": image_mnist, - "image_mnist_test": image_mnist, "image_celeba_tune": image_celeba, - "image_mscoco_characters_tune": image_mscoco_characters, - "image_mscoco_characters_test": image_mscoco_characters, - "image_mscoco_tokens_8k_test": lambda p: image_mscoco_tokens(p, 2**13), - "image_mscoco_tokens_32k_test": lambda p: image_mscoco_tokens(p, 2**15), - "image_imagenet": image_imagenet, "img2img_imagenet": img2img_imagenet, } diff --git a/tensor2tensor/data_generators/test_data/vocab-2.txt b/tensor2tensor/data_generators/test_data/vocab-2.txt index 7793af4f6..1ad6d20b9 100644 --- a/tensor2tensor/data_generators/test_data/vocab-2.txt +++ b/tensor2tensor/data_generators/test_data/vocab-2.txt @@ -1,3 +1,4 @@ kattywampus,11 +kaput balderdash,10 jiggery-pokery,14 diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index ff284bcc6..cd6ca0eea 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -54,7 +54,7 @@ # '\\' is converted to '\' # '\213;' is converted to unichr(213) _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") -_ESCAPE_CHARS = set(u"\\_;0123456789") +_ESCAPE_CHARS = set(u"\\_u;0123456789") def native_to_unicode_py2(s): @@ -427,7 +427,7 @@ def bisect(min_val, max_val): token_counts, present_count, num_iterations) # If min_val == max_val, we can't do any better than this. 
- if subtokenizer.vocab_size == target_size or min_val == max_val: + if subtokenizer.vocab_size == target_size or min_val >= max_val: return subtokenizer if subtokenizer.vocab_size > target_size: diff --git a/tensor2tensor/data_generators/text_encoder_build_subword.py b/tensor2tensor/data_generators/text_encoder_build_subword.py index 47e82a176..0c366c896 100644 --- a/tensor2tensor/data_generators/text_encoder_build_subword.py +++ b/tensor2tensor/data_generators/text_encoder_build_subword.py @@ -24,7 +24,7 @@ python data_generators/text_encoder_build_subword.py \ --corpus_filepattern=$DATA_DIR/my_problem-train-* \ --corpus_max_lines=12345 \ - --output_fn=$DATA_DIR/my_problem.subword_text_encoder \ + --output_filename=$DATA_DIR/my_problem.subword_text_encoder \ --logtostderr """ @@ -75,7 +75,7 @@ def main(unused_argv): encoder = text_encoder.SubwordTextEncoder() encoder.build_from_token_counts(token_counts, FLAGS.min_count, FLAGS.num_iterations) - encoder.store_to_file(FLAGS.output_fn) + encoder.store_to_file(FLAGS.output_filename) if __name__ == '__main__': diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py index 0e8daa75f..5cfd7c42e 100644 --- a/tensor2tensor/data_generators/tokenizer.py +++ b/tensor2tensor/data_generators/tokenizer.py @@ -185,7 +185,12 @@ def vocab_token_counts(text_filepattern, max_lines): a dictionary mapping token to count. """ ret = {} - for line in _read_filepattern(text_filepattern, max_lines=max_lines): + for i, line in enumerate( + _read_filepattern(text_filepattern, max_lines=max_lines)): + if "," not in line: + tf.logging.warning("Malformed vocab line #%d '%s'", i, line) + continue + token, count = line.rsplit(",", 1) ret[_native_to_unicode(token)] = int(count) diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py index 0c299bd0b..065a32e91 100644 --- a/tensor2tensor/data_generators/tokenizer_test.py +++ b/tensor2tensor/data_generators/tokenizer_test.py @@ -130,7 +130,7 @@ def test_vocab_token_counts(self): def test_vocab_token_counts_with_max_lines(self): # vocab-1 has 2 lines, vocab-2 has 3 - token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4) + token_counts = tokenizer.vocab_token_counts(self.vocab_path, 5) expected = { u"lollipop": 8, diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index bcd29e1d4..0a47e9989 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -193,9 +193,9 @@ def bi_vocabs_token_generator(source_path, _ENDE_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long - ("training-parallel-nc-v11/news-commentary-v11.de-en.en", - "training-parallel-nc-v11/news-commentary-v11.de-en.de") + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz", # pylint: disable=line-too-long + ("training/news-commentary-v12.de-en.en", + "training/news-commentary-v12.de-en.de") ], [ "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", @@ -250,7 +250,7 @@ def bi_vocabs_token_generator(source_path, _ZHEN_TEST_DATASETS = [[ "http://data.statmt.org/wmt17/translation-task/dev.tgz", - ("dev/newsdev2017-zhen-src.zh", "dev/newsdev2017-zhen-ref.en") + ("dev/newsdev2017-zhen-src.zh.sgm", "dev/newsdev2017-zhen-ref.en.sgm") ]] # For Macedonian-English the SETimes corpus @@ -271,9 +271,9 @@ def bi_vocabs_token_generator(source_path, # English-Czech datasets 
_ENCS_TRAIN_DATASETS = [ [ - "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v11.tgz", # pylint: disable=line-too-long - ("training-parallel-nc-v11/news-commentary-v11.cs-en.en", - "training-parallel-nc-v11/news-commentary-v11.cs-en.cs") + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz", # pylint: disable=line-too-long + ("training/news-commentary-v12.cs-en.en", + "training/news-commentary-v12.cs-en.cs") ], [ "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", @@ -322,6 +322,23 @@ def ende_bpe_token_generator(data_dir, tmp_dir, train): EOS) +def _preprocess_sgm(line, is_sgm): + """Preprocessing to strip tags in SGM files.""" + if not is_sgm: + return line + # In SGM files, remove ,

<srcset ...>, <p>, <doc ...> lines. + if line.startswith("<srcset") or line.startswith("<doc
"): + return "" + # Strip tags. + if line.startswith(""): + i = line.index(">") + return line[i+1:-6] # Strip first and last . + + def _compile_data(tmp_dir, datasets, filename): """Concatenate all `datasets` and save to `filename`.""" filename = os.path.join(tmp_dir, filename) @@ -335,9 +352,10 @@ def _compile_data(tmp_dir, datasets, filename): lang1_filename, lang2_filename = dataset[1] lang1_filepath = os.path.join(tmp_dir, lang1_filename) lang2_filepath = os.path.join(tmp_dir, lang2_filename) + is_sgm = (lang1_filename.endswith("sgm") and + lang2_filename.endswith("sgm")) - if not os.path.exists(compressed_filepath): - generator_utils.maybe_download(tmp_dir, compressed_filename, url) + generator_utils.maybe_download(tmp_dir, compressed_filename, url) if not (os.path.exists(lang1_filepath) and os.path.exists(lang2_filepath)): # For .tar.gz and .tgz files, we read compressed. @@ -356,8 +374,11 @@ def _compile_data(tmp_dir, datasets, filename): with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file: line1, line2 = lang1_file.readline(), lang2_file.readline() while line1 or line2: - lang1_resfile.write(line1.strip() + "\n") - lang2_resfile.write(line2.strip() + "\n") + line1res = _preprocess_sgm(line1, is_sgm) + line2res = _preprocess_sgm(line2, is_sgm) + if line1res or line2res: + lang1_resfile.write(line1res.strip() + "\n") + lang2_resfile.write(line2res.strip() + "\n") line1, line2 = lang1_file.readline(), lang2_file.readline() return filename @@ -405,7 +426,7 @@ class WMTEnDeCharacters(WMTProblem): def is_character_level(self): return True - def train_generator(self, tmp_dir, train): + def train_generator(self, _, tmp_dir, train): character_vocab = text_encoder.ByteTextEncoder() datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS tag = "train" if train else "dev" @@ -434,8 +455,8 @@ def train_generator(self, data_dir, tmp_dir, train): source_vocab_size = self.targeted_vocab_size target_vocab_size = self.targeted_vocab_size datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in datasets] - target_datasets = [[item[0], [item[1][1]]] for item in datasets] + source_datasets = [[item[0], [item[1][0]]] for item in _ZHEN_TRAIN_DATASETS] + target_datasets = [[item[0], [item[1][1]]] for item in _ZHEN_TRAIN_DATASETS] source_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, "vocab.zh.%d" % source_vocab_size, source_vocab_size, source_datasets) diff --git a/tensor2tensor/models/bluenet.py b/tensor2tensor/models/bluenet.py index 3ac477e4b..87ad70e41 100644 --- a/tensor2tensor/models/bluenet.py +++ b/tensor2tensor/models/bluenet.py @@ -546,7 +546,6 @@ def bluenet_base(): hparams.optimizer_adam_epsilon = 1e-6 hparams.optimizer_adam_beta1 = 0.85 hparams.optimizer_adam_beta2 = 0.997 - hparams.add_hparam("imagenet_use_2d", True) hparams.add_hparam("anneal_until", 40000) hparams.add_hparam("batch_deviation_loss_factor", 5.0) return hparams diff --git a/tensor2tensor/models/bluenet_test.py b/tensor2tensor/models/bluenet_test.py index d4ce85b1a..70b8defe9 100644 --- a/tensor2tensor/models/bluenet_test.py +++ b/tensor2tensor/models/bluenet_test.py @@ -46,7 +46,7 @@ def testBlueNet(self): } model = bluenet.BlueNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git 
a/tensor2tensor/models/bytenet_test.py b/tensor2tensor/models/bytenet_test.py index 738b84251..536d348e7 100644 --- a/tensor2tensor/models/bytenet_test.py +++ b/tensor2tensor/models/bytenet_test.py @@ -45,7 +45,7 @@ def testByteNet(self): } model = bytenet.ByteNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/common_attention.py b/tensor2tensor/models/common_attention.py index 94d75b48d..b52fb8aea 100644 --- a/tensor2tensor/models/common_attention.py +++ b/tensor2tensor/models/common_attention.py @@ -205,6 +205,20 @@ def attention_bias_ignore_padding(memory_padding): return tf.expand_dims(tf.expand_dims(ret, 1), 1) +def attention_bias_proximal(length): + """Bias for self-attention to encourage attention to close positions. + + Args: + length: an integer scalar. + + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = tf.to_float(tf.range(length)) + diff = tf.expand_dims(r, 0) - tf.expand_dims(r, 1) + return tf.expand_dims(tf.expand_dims(-tf.log(1 + tf.abs(diff)), 0), 0) + + def split_last_dimension(x, n): """Reshape x so that the last dimension becomes two dimensions. @@ -344,22 +358,23 @@ def dot_product_attention(q, return tf.matmul(weights, v) -def local_attention_1d(q, k, v, bias=None, - block_length=128, look_right=True, use_whole_block=False, - truncate_bias=True, name=None): +def masked_local_attention_1d(q, k, v, + block_length=128, look_right=True, + use_whole_block=False, name=None): """Attention to the source position and a neigborhood around it. The sequence is divided into blocks of length block_size. Attention for a given query position can only see memory positions within a certain number of positions before and behind it. + If look_right is True then each query will attend to block_length//2 positions either side, otherwise it will attend to block_length previous positions. If use_whole_block is True then no mask will be applied to the local blocks meaning the full blocks are used (if look_right is True then the elements to - the right of the current position are still masked out). This allows use to + the right of the current position are still masked out). This allows to attend to more elements without additional overhead, but means we have inconsistent window positions and sizes. @@ -367,7 +382,6 @@ def local_attention_1d(q, k, v, bias=None, q: a Tensor with shape [batch, heads, length_q, depth_k] k: a Tensor with shape [batch, heads, length_kv, depth_k] v: a Tensor with shape [batch, heads, length_kv, depth_v] - bias: Not currently used [batch, heads, length_q, length_k] block_length: an integer look_right: a bool use_whole_block: a bool @@ -384,41 +398,36 @@ def local_attention_1d(q, k, v, bias=None, length = tf.shape(q)[2] depth_k = tf.shape(q)[3] depth_v = tf.shape(v)[3] - original_length = length - #Pad to desired length - #If (length < block_length), then we use only one block. + # If (length < block_length), then we use only one block. block_length = tf.where(tf.less(length, block_length), length, block_length) + # Pad to desired length. 
padding_size = tf.mod(-length, block_length) length += padding_size num_blocks = tf.div(length, block_length) - padding = [[0, 0], [0, 0], [0, padding_size], [0, 0]] q = tf.pad(q, padding) if not look_right: - #Add extra padding so we son't have to do an initial query + # Add extra padding so we son't have to do an initial query block. extra_padding = [[0, 0], [0, 0], [block_length, padding_size], [0, 0]] - bp = [[0, 0], [0, 0], [0, padding_size], [block_length, padding_size]] else: - #We shift everything over by half a block so query is in centre + # We shift everything over by half a block so query is in center. pad_right = block_length // 2 pad_left = block_length - pad_right - extra_padding = [[0, 0], [0, 0], - [pad_left, padding_size+pad_right], [0, 0]] - bp = [[0, 0], [0, 0], - [0, padding_size], [pad_left, padding_size+pad_right]] + extra_padding = [[0, 0], [0, 0], + [pad_left, padding_size+pad_right], [0, 0]] k = tf.pad(k, extra_padding) v = tf.pad(v, extra_padding) - # Reshape into blocks + # Reshape into blocks. q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k]) k = tf.reshape(k, [batch, heads, num_blocks+1, block_length, depth_k]) v = tf.reshape(v, [batch, heads, num_blocks+1, block_length, depth_v]) - # Get local blocks by slicing + # Get local blocks by slicing. def local(x): """Create a local version of the keys or values.""" prev_block = tf.slice( @@ -432,56 +441,22 @@ def local(x): # [batch, heads, num_blocks, block_length, local_length] attention = tf.matmul(q, local_k, transpose_b=True) - - # Apply bias (N.B: This is not currently working) - if bias is not None: - with tf.name_scope('bias'): - b_batch = tf.shape(bias)[0] - b_heads = tf.shape(bias)[1] - bias_ = bias - #bias = 1.0 + tf.clip_by_value(bias, -1.0, 1.0) - if truncate_bias: - # Use only the query dimension - bias = tf.expand_dims(bias[:,:,:,0], 2) - bias = tf.pad(bias, extra_padding, name='bias_pad_b')# 17, 5, 3 - bias = tf.reshape(bias, - [b_batch, b_heads, 1, num_blocks+1, block_length], - name='divide_blocks') - local_b = tf.reshape(local(bias), - [b_batch, b_heads, num_blocks, 1, -1], name='reshape_local') - else: - bias = tf.pad(bias, bp, name='pad') - bias = tf.reshape(bias, - [b_batch, b_heads, num_blocks, block_length, - num_blocks+1, block_length], name='divide_blocks') - bias = tf.transpose(bias, [4,2,0,1,3,5]) - bias = tf.reshape(bias, - [num_blocks*(num_blocks+1), b_batch, b_heads, - block_length, block_length], name='combine') - indices = (num_blocks+1)*tf.range(num_blocks) - prev_block = tf.gather(bias, indices) - cur_block = tf.gather(bias, indices+num_blocks) - local_b = tf.concat([prev_block, cur_block], 4) - local_b = tf.transpose(local_b, [1,2,0,3,4]) - return l-local_b - attention += local_b - attention = tf.nn.softmax(attention) - + # Get local mask if not use_whole_block: good_part = tf.matrix_band_part( - tf.ones([block_length, local_length]), 0, tf.to_int64(block_length)) + tf.ones([block_length, local_length]), 0, tf.to_int64(block_length)) elif not look_right: good_part = tf.matrix_band_part( - tf.ones([block_length, local_length]), -1, tf.to_int64(block_length)) + tf.ones([block_length, local_length]), -1, tf.to_int64(block_length)) else: good_part = tf.ones([block_length, local_length]) - #good_part = tf.cast(good_part, tf.float64) attention *= tf.reshape(good_part, [1, 1, 1, block_length, local_length]) - + # TODO(noam): figure out how to show a summary for the remaining blocks. + # The naive way currently causes errors due to empty tensors. 
output = tf.matmul(attention, local_v) output = tf.reshape(output, [batch, heads, -1, depth_v]) @@ -491,6 +466,91 @@ def local(x): return output +def unmasked_local_attention_1d(q, k, v, block_length=128, filter_width=100, + name=None): + """strided block local self-attention. + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + filter_width: an integer indicating how much to look left. + name: an optional string + + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope(name, default_name="local_self_attention_1d", + values=[q, k, v]): + v_shape = v.get_shape() + depth_v = tf.shape(v)[3] + batch_size = tf.shape(q)[0] + num_heads = tf.shape(q)[1] + original_length = tf.shape(q)[2] + # making sure q is a multiple of d + def pad_to_multiple(x, pad_length): + x_length = tf.shape(x)[2] + return tf.pad(x, [[0, 0], [0, 0], [0, -x_length % pad_length], [0, 0]]) + def pad_l_and_r(x, pad_length): + return tf.pad(x, [[0, 0], [0, 0], [pad_length, pad_length], [0, 0]]) + q = pad_to_multiple(q, block_length) + k = pad_to_multiple(k, block_length) + v = pad_to_multiple(v, block_length) + + # Setting up q blocks + new_q_shape = tf.shape(q) + # Setting up q blocks + q = tf.reshape(q, [new_q_shape[0], new_q_shape[1], + new_q_shape[2]//block_length, + block_length, new_q_shape[3]]) + + # Setting up k and v values + k = pad_l_and_r(k, filter_width) + v = pad_l_and_r(v, filter_width) + + length = tf.shape(k)[2] + full_filter_width = block_length + 2*filter_width + # getting gather indices + indices = tf.range(0, length, delta=1, name="index_range") + # making indices [1, length, 1] to appy convs + indices = tf.reshape(indices, [1, -1, 1]) + kernel = tf.expand_dims(tf.eye(full_filter_width), axis=1) + gather_indices = tf.nn.conv1d( + tf.cast(indices, tf.float32), + kernel, + block_length, + padding="VALID", + name="gather_conv") + + gather_indices = tf.squeeze(tf.cast(gather_indices, tf.int32), axis=0) + + # [length, batch, heads, dim] + k_t = tf.transpose(k, [2, 0, 1, 3]) + k_new = tf.gather(k_t, gather_indices) + + # [batch, heads, blocks, block_length, dim] + k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) + + attention_bias = tf.expand_dims( + tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + + v_t = tf.transpose(v, [2, 0, 1, 3]) + v_new = tf.gather(v_t, gather_indices) + v_new = tf.transpose(v_new, [2, 3, 0, 1, 4]) + + logits = tf.matmul(q, k_new, transpose_b=True) + + attention = tf.nn.softmax(logits+attention_bias) + output = tf.matmul(attention, v_new) + + output = tf.reshape(output, [batch_size, num_heads, -1, depth_v]) + # Remove the padding if introduced + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + def multihead_attention(query_antecedent, memory_antecedent, bias, @@ -502,6 +562,7 @@ def multihead_attention(query_antecedent, image_shapes=None, attention_type="dot_product", block_length=128, + block_width=128, name=None): """Multihead scaled-dot-product attention with input/output transformations. @@ -516,9 +577,10 @@ def multihead_attention(query_antecedent, dropout_rate: a floating point number image_shapes: optional tuple of integer scalars. 
see comments for attention_image_summary() - attention_type: a string, either "dot_product" or "local" or - "local_mask_right" + attention_type: a string, either "dot_product" or "local_mask_right" or + "local_unmasked" block_length: an integer - relevant for "local_mask_right" + block_width: an integer - relevant for "local_unmasked" name: an optional string Returns: @@ -566,12 +628,12 @@ def multihead_attention(query_antecedent, if attention_type == "dot_product": x = dot_product_attention( q, k, v, bias, dropout_rate, image_shapes) - elif attention_type == "local": - x = local_attention_1d(q, k, v, block_length=block_length) + elif attention_type == "local_mask_right": + x = masked_local_attention_1d(q, k, v, block_length=block_length) else: - assert attention_type == "local_mask_right" - x = local_attention_1d( - q, k, v, block_length=block_length, look_right=False) + assert attention_type == "local_unmasked" + x = unmasked_local_attention_1d(q, k, v, block_length=block_length, + filter_width=block_width) x = combine_heads(x) x = common_layers.conv1d(x, output_depth, 1, name="output_transform") return x diff --git a/tensor2tensor/models/common_attention_test.py b/tensor2tensor/models/common_attention_test.py index 2e534ba1a..a09da74e1 100644 --- a/tensor2tensor/models/common_attention_test.py +++ b/tensor2tensor/models/common_attention_test.py @@ -1,4 +1,5 @@ -# Copyright 2017 Google Inc. +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for common layers.""" +"""Tests for common attention.""" from __future__ import absolute_import from __future__ import division @@ -28,54 +29,71 @@ class CommonAttentionTest(tf.test.TestCase): - def testLocalAttention(self): - q = np.array([[[ [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0] ]]]) - - k = np.array([[[ [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0] ]]]) + def testDotProductAttention(self): + x = np.random.rand(5, 7, 12, 32) + y = np.random.rand(5, 7, 12, 32) + with self.test_session() as session: + a = common_attention.dot_product_attention( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), None) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 7, 12, 32)) - b = np.array([[[ [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] ]]]) - - #b = np.ones((1,1,8,8)) - #b = (1-b) * (-1e9) + def testMaskedLocalAttention(self): + q = np.array([[[[1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0]]]]) + k = np.array([[[[1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 
0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0]]]]) v = np.ones((1, 1, 8, 1)) + with self.test_session() as session: + q_ = tf.constant(q, dtype=tf.float32) + k_ = tf.constant(k, dtype=tf.float32) + v_ = tf.constant(v, dtype=tf.float32) + y = common_attention.masked_local_attention_1d( + q_, k_, v_, block_length=tf.constant(2)) + res = session.run(y) - #q = np.random.rand(5, 7, 13, 3) - #k = np.random.rand(5, 7, 13, 3) - #v = np.random.rand(5, 7, 13, 11) - #b = np.random.rand(5, 1, 13, 1) + self.assertEqual(res.shape, (1, 1, 8, 1)) + def testLocalUnmaskedAttention(self): + x = np.random.rand(5, 4, 25, 16) + y = np.random.rand(5, 4, 25, 16) with self.test_session() as session: - q_ = tf.constant(q) - k_ = tf.constant(k) - v_ = tf.constant(v) - b_ = tf.constant(b) - y = common_attention.local_attention_1d(q_, k_, v_, b_, block_length=tf.constant(2)) - res = session.run(y) - #print(q) - #rint(k) - print(res) - #self.assertEqual(res.shape, (5, 7, 13, 11)) + a = common_attention.unmasked_local_attention_1d( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + block_length=4, filter_width=3) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 4, 25, 16)) + + def testLocalUnmaskedAttentionMatchingBlockLength(self): + x = np.random.rand(5, 4, 25, 16) + y = np.random.rand(5, 4, 25, 16) + with self.test_session() as session: + a = common_attention.unmasked_local_attention_1d( + tf.constant(x, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + tf.constant(y, dtype=tf.float32), + block_length=5, filter_width=3) + session.run(tf.global_variables_initializer()) + res = session.run(a) + self.assertEqual(res.shape, (5, 4, 25, 16)) if __name__ == "__main__": diff --git a/tensor2tensor/models/common_hparams.py b/tensor2tensor/models/common_hparams.py index e36b2e4e1..353586393 100644 --- a/tensor2tensor/models/common_hparams.py +++ b/tensor2tensor/models/common_hparams.py @@ -88,10 +88,22 @@ def basic_params1(): # modality, add an entry to this semicolon-separated string. Entries are # formatted "feature_name:modality_type:modality_name", e.g. # "inputs:image:small_image_modality;other_inputs:audio:identity". - input_modalities="", + input_modalities="default", # We don't use empty string in params. # To override the default target modality, specify # "modality_type:modality_name", e.g. "image:small_image_modality". - target_modality="") + target_modality="default", + # The maximum length of "input" sequence. + # Sequences longer than this value will be truncated. 0 or negative values + # mean there is no maximum or truncation. + # You can change this behavior by overridding preprocess_examples() method + # in your problem class. + max_input_seq_length=0, + # The maximum length of "target" sequence. + # Sequences longer than this value will be truncated. 0 or negative values + # mean there is no maximum or truncation. + # You can change this behavior by overridding preprocess_examples() method + # in your problem class. 
+ max_target_seq_length=0) class RangedHParams(object): diff --git a/tensor2tensor/models/common_layers.py b/tensor2tensor/models/common_layers.py index ae6d0cede..5449a8bef 100644 --- a/tensor2tensor/models/common_layers.py +++ b/tensor2tensor/models/common_layers.py @@ -475,7 +475,7 @@ def residual_fn(x, residual_dropout, filters=None, epsilon=1e-16, - name="residual"): + name=None, reuse=None): """Returns a function for combining layer input and layer output. The returned function on x (layer input) and y (layer output) computes: @@ -489,16 +489,19 @@ def residual_fn(x, filters: integer, dimension for layer norm, optional epsilon: integer, value of layer norm epsilon name: string, name + reuse: bool, whether to reuse Returns: residual layer output with applied norm_fn. """ - norm_fn = get_norm(norm_type) - res = x + tf.nn.dropout(y, 1.0 - residual_dropout) - if norm_type == "layer": - return norm_fn(res, name=name, filters=filters, epsilon=epsilon) - else: - return norm_fn(res, name=name) + with tf.variable_scope(name, default_name="residual", + values=[x, y], reuse=reuse): + norm_fn = get_norm(norm_type) + res = x + tf.nn.dropout(y, 1.0 - residual_dropout) + if norm_type == "layer": + return norm_fn(res, filters=filters, epsilon=epsilon, name=norm_type) + else: + return norm_fn(res, name=norm_type) def conv_block_internal(conv_fn, @@ -1420,22 +1423,22 @@ def smoothing_cross_entropy(logits, labels, vocab_size, confidence): return xentropy - normalizing +def global_pool_1d(inputs, pooling_type="MAX", mask=None): + """Pool elements across the last dimension. -def global_pool_1d(inputs, pooling_type='MAX', mask=None): - """ - Pools elements across the last dimension. Useful to a list of vectors into a - single vector to get a representation of a set. - Concatenating - - Args - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - pooling_type: the pooling type to use, MAX or AVR - mask: A tensor of dimensions batch_size x sequence_length containing a - mask for the inputs with 1's for existing elements, and 0's elsewhere. - Returns - output: A tensor of dimensions batch_size x input_dims - dimension containing the sequences of transformed vectors. + Useful to convert a list of vectors into a single vector so as + to get a representation of a set. + + Args: + inputs: A tensor of dimensions batch_size x sequence_length x input_dims + containing the sequences of input vectors. + pooling_type: the pooling type to use, MAX or AVR + mask: A tensor of dimensions batch_size x sequence_length containing a + mask for the inputs with 1's for existing elements, and 0's elsewhere. + + Returns: + output: A tensor of dimensions batch_size x input_dims + dimension containing the sequences of transformed vectors. """ with tf.name_scope("global_pool", [inputs]): if mask is not None: @@ -1457,37 +1460,33 @@ def global_pool_1d(inputs, pooling_type='MAX', mask=None): return output +def running_global_pool_1d(inputs, pooling_type="MAX"): + """Same global pool, but only for the elements up to the current element. -def running_global_pool_1d(inputs, pooling_type='MAX'): - """ - Same global pool, but only for the elements up to the current element. Useful - for outputs where the state of future elements is not known. + Useful for outputs where the state of future elements is not known. Takes no mask as all elements up to the current element are assumed to exist. Currently only supports maximum. Equivalent to using a lower triangle bias. 
- - Args - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - pooling_type: Pooling type to use. Currently only supports 'MAX'. - Returns - output: A tensor of dimensions batch_size x sequence_length x input_dims - dimension containing the running 'totals'. + + Args: + inputs: A tensor of dimensions batch_size x sequence_length x input_dims + containing the sequences of input vectors. + pooling_type: Pooling type to use. Currently only supports 'MAX'. + + Returns: + output: A tensor of dimensions batch_size x sequence_length x input_dims + dimension containing the running 'totals'. """ - + del pooling_type with tf.name_scope("running_global_pool", [inputs]): scan_fct = tf.maximum - - # Permute inputs so seq_length is first + # Permute inputs so seq_length is first. elems = tf.transpose(inputs, [1, 0, 2]) - - # Perform scan + # Perform scan. cumulatives = tf.scan(scan_fct, elems, swap_memory=True) - - # Permute output to get back to original order - output = tf.transpose(cumulatives, [1, 0, 2]) - + # Permute output to get back to original order. + output = tf.transpose(cumulatives, [1, 0, 2]) return output - + def linear_set_layer(layer_size, inputs, @@ -1502,21 +1501,24 @@ def linear_set_layer(layer_size, e.g. One can use global_pool_1d to get a representation of the set which can then be used as the context for the next layer. - Args - layer_size: Dimension to transform the input vectors to - inputs: A tensor of dimensions batch_size x sequence_length x input_dims - containing the sequences of input vectors. - context: A tensor of dimensions batch_size x context_dims or batch_size x - sequence_length x context_dims containing a global statistic about the - set. - dropout: Dropout probability. - activation_fn: The activation function to use. - Returns - output: A tensor of dimensions batch_size x sequence_length x output_dims - dimension containing the sequences of transformed vectors. + TODO: Add bias add (or control the biases used). + + Args: + layer_size: Dimension to transform the input vectors to. + inputs: A tensor of dimensions batch_size x sequence_length x input_dims + containing the sequences of input vectors. + context: A tensor of dimensions batch_size x context_dims + containing a global statistic about the set. + activation_fn: The activation function to use. + dropout: Dropout probability. + name: name. + Returns: + output: A tensor of dimensions batch_size x sequence_length x output_dims + dimension containing the sequences of transformed vectors. """ - with tf.variable_scope(name, "linear_set_layer", [inputs]): + with tf.variable_scope(name, default_name="linear_set_layer", + values=[inputs]): # Apply 1D convolution to apply linear filter to each element # along the 2nd dimension. outputs = conv1d(inputs, layer_size, 1, activation=None, name="set_conv") @@ -1524,12 +1526,11 @@ def linear_set_layer(layer_size, # Apply the context if it exists. if context is not None: # Unfortunately tf doesn't support broadcasting via concat, but we can - # simply add the transformed context to get the same effect - if len(context.get_shape().as_list())==2: + # simply add the transformed context to get the same effect. 
+ if len(context.get_shape().as_list()) == 2: context = tf.expand_dims(context, axis=1) - #context_size = context.get_shape().as_list()[-1] cont_tfm = conv1d(context, layer_size, 1, - activation=None, name="cont_conv") + activation=None, name="cont_conv") outputs += cont_tfm if activation_fn is not None: @@ -1552,35 +1553,33 @@ def ravanbakhsh_set_layer(layer_size, More parameter-efficient verstion of a linear-set-layer with context. - Args - layer_size: Dimension to transform the input vectors to. - inputs: A tensor of dimensions batch_size x sequence_length x vector - containing the sequences of input vectors. - mask: A tensor of dimensions batch_size x sequence_length containing a - mask for the inputs with 1's for existing elements, and 0's elsewhere. - sequential: If true, will use a running global pool so each element will - only depend on those before it. Set true if this layer is being used in - an ouput sequence. - Returns - output: A tensor of dimensions batch_size x sequence_length x vector - dimension containing the sequences of transformed vectors. + Args: + layer_size: Dimension to transform the input vectors to. + inputs: A tensor of dimensions batch_size x sequence_length x vector + containing the sequences of input vectors. + mask: A tensor of dimensions batch_size x sequence_length containing a + mask for the inputs with 1's for existing elements, and 0's elsewhere. + sequential: If true, will use a running global pool so each element will + only depend on those before it. Set true if this layer is being used in + an output sequence. + activation_fn: The activation function to use. + dropout: dropout. + name: name. + + Returns: + output: A tensor of dimensions batch_size x sequence_length x vector + dimension containing the sequences of transformed vectors. 
""" + del dropout with tf.variable_scope(name, "ravanbakhsh_set_layer", [inputs]): - if sequential: - output = linear_set_layer( + return linear_set_layer( layer_size, inputs - running_global_pool_1d(inputs), activation_fn=activation_fn, name=name) - else: - output = linear_set_layer( - layer_size, - inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1), - activation_fn=activation_fn, - name=name) - - return output - - - return output + return linear_set_layer( + layer_size, + inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1), + activation_fn=activation_fn, + name=name) diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index bec5268fd..a43eda97a 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -55,8 +55,8 @@ def _testModel(self, hparams, model_cls): "targets": tf.constant(targets, dtype=tf.float32), } p_hparams, = hparams.problems - sharded_logits, _, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, - p_hparams).model_fn(features) + sharded_logits, _ = model_cls(hparams, tf.contrib.learn.ModeKeys.TRAIN, + p_hparams).model_fn(features) logits = tf.concat(sharded_logits, 0) with self.test_session() as sess: diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index ae221bdff..195879d78 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -247,8 +247,8 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): return tf.expand_dims(decoder_outputs, axis=2) -@registry.register_model("baseline_lstm_seq2seq") -class LSTMSeq2Seq(t2t_model.T2TModel): +@registry.register_model +class LSTMSeq2seq(t2t_model.T2TModel): def model_fn_body(self, features): train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN @@ -256,8 +256,8 @@ def model_fn_body(self, features): self._hparams, train) -@registry.register_model("baseline_lstm_seq2seq_attention") -class LSTMSeq2SeqAttention(t2t_model.T2TModel): +@registry.register_model +class LSTMSeq2seqAttention(t2t_model.T2TModel): def model_fn_body(self, features): train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 1e542a666..6ac792f48 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -44,9 +44,9 @@ def testLSTMSeq2Seq(self): "inputs": tf.constant(x, dtype=tf.int32), "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2Seq( + model = lstm.LSTMSeq2seq( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) @@ -68,9 +68,9 @@ def testLSTMSeq2SeqAttention(self): "inputs": x, "targets": tf.constant(y, dtype=tf.int32), } - model = lstm.LSTMSeq2SeqAttention( + model = lstm.LSTMSeq2seqAttention( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index c57a97905..912c54f8c 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -359,7 +359,7 @@ def xnet_resblock(x, filters, 
res_relu, name): class ClassLabelModality(modality.Modality): """Used for label data.""" - def __init__(self, model_hparams, vocab_size, is2d=False): + def __init__(self, model_hparams, vocab_size, is2d=True): super(ClassLabelModality, self).__init__(model_hparams, vocab_size) self._is_2d = is2d self._kernel = (3, 3) if is2d else (5, 1) @@ -425,12 +425,12 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): @registry.register_class_label_modality("class_label_2d") -class ClassLabel2DModality(ClassLabelModality): +class ClassLabel1DModality(ClassLabelModality): """Used for label data.""" def __init__(self, model_hparams, vocab_size): - super(ClassLabel2DModality, self).__init__( - model_hparams=model_hparams, vocab_size=vocab_size, is2d=True) + super(ClassLabel1DModality, self).__init__( + model_hparams=model_hparams, vocab_size=vocab_size, is2d=False) @registry.register_generic_modality("default") diff --git a/tensor2tensor/models/multimodel.py b/tensor2tensor/models/multimodel.py index 089889ce6..6f60dbfbf 100644 --- a/tensor2tensor/models/multimodel.py +++ b/tensor2tensor/models/multimodel.py @@ -190,7 +190,6 @@ def multimodel_base(): hparams.add_hparam("moe_n2", 0) hparams.add_hparam("moe_layers", "2") hparams.add_hparam("moe_loss_coef", 1e-2) - hparams.add_hparam("imagenet_use_2d", int(True)) return hparams diff --git a/tensor2tensor/models/multimodel_test.py b/tensor2tensor/models/multimodel_test.py index 03990594b..73a8436cc 100644 --- a/tensor2tensor/models/multimodel_test.py +++ b/tensor2tensor/models/multimodel_test.py @@ -23,8 +23,9 @@ import numpy as np -from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators import image # pylint: disable=unused-import from tensor2tensor.models import multimodel +from tensor2tensor.utils import registry import tensorflow as tf @@ -32,10 +33,12 @@ class MultiModelTest(tf.test.TestCase): def testMultiModel(self): - x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + x = np.random.random_integers(0, high=255, size=(3, 5, 5, 3)) y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) hparams = multimodel.multimodel_tiny() - p_hparams = problem_hparams.image_cifar10(hparams) + hparams.add_hparam("data_dir", "") + problem = registry.problem("image_cifar10") + p_hparams = problem.internal_hparams(hparams) hparams.problems = [p_hparams] with self.test_session() as session: features = { @@ -45,7 +48,7 @@ def testMultiModel(self): } model = multimodel.MultiModel( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/neural_gpu_test.py b/tensor2tensor/models/neural_gpu_test.py index 3d1cc0562..46c01f403 100644 --- a/tensor2tensor/models/neural_gpu_test.py +++ b/tensor2tensor/models/neural_gpu_test.py @@ -52,7 +52,7 @@ def testNeuralGPU(self): } model = neural_gpu.NeuralGPU( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - shadred_logits, _, _ = model.model_fn(features) + shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 69e2338b6..f1534137c 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -316,7 +316,6 @@ def 
slicenet_params1(): hparams.add_hparam("moe_n1", 32) hparams.add_hparam("moe_n2", 0) hparams.add_hparam("moe_loss_coef", 1e-2) - hparams.add_hparam("imagenet_use_2d", int(True)) # attention-related flags hparams.add_hparam("attention_type", "simple") hparams.add_hparam("num_heads", 8) diff --git a/tensor2tensor/models/slicenet_test.py b/tensor2tensor/models/slicenet_test.py index 692799571..c357448e4 100644 --- a/tensor2tensor/models/slicenet_test.py +++ b/tensor2tensor/models/slicenet_test.py @@ -23,8 +23,10 @@ import numpy as np -from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators import image # pylint: disable=unused-import +from tensor2tensor.models import modalities # pylint: disable=unused-import from tensor2tensor.models import slicenet +from tensor2tensor.utils import registry import tensorflow as tf @@ -32,10 +34,12 @@ class SliceNetTest(tf.test.TestCase): def testSliceNet(self): - x = np.random.random_integers(0, high=255, size=(3, 5, 4, 3)) + x = np.random.random_integers(0, high=255, size=(3, 5, 5, 3)) y = np.random.random_integers(0, high=9, size=(3, 5, 1, 1)) hparams = slicenet.slicenet_params1_tiny() - p_hparams = problem_hparams.image_cifar10(hparams) + hparams.add_hparam("data_dir", "") + problem = registry.problem("image_cifar10") + p_hparams = problem.internal_hparams(hparams) hparams.problems = [p_hparams] with self.test_session() as session: features = { @@ -45,7 +49,7 @@ def testSliceNet(self): } model = slicenet.SliceNet( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c45e88577..2320a57f1 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -23,8 +23,6 @@ from __future__ import division from __future__ import print_function -import copy - # Dependency imports from six.moves import xrange # pylint: disable=redefined-builtin @@ -43,8 +41,7 @@ class Transformer(t2t_model.T2TModel): """Attention net. 
See file docstring.""" def model_fn_body(self, features): - # Remove dropout if not training - hparams = copy.copy(self._hparams) + hparams = self._hparams targets = features["targets"] inputs = features["inputs"] target_space = features["target_space_id"] @@ -52,23 +49,28 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - (encoder_input, encoder_attention_bias, _) = (transformer_prepare_encoder( - inputs, target_space, hparams)) + (encoder_input, + encoder_self_attention_bias, + encoder_decoder_attention_bias) = ( + transformer_prepare_encoder(inputs, target_space, hparams)) (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) + return common_layers.residual_fn(x, y, + hparams.norm_type, + hparams.residual_dropout, + hparams.hidden_size, + epsilon=hparams.layer_norm_epsilon) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) encoder_output = transformer_encoder(encoder_input, residual_fn, - encoder_attention_bias, hparams) + encoder_self_attention_bias, hparams) decoder_output = transformer_decoder( decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, - encoder_attention_bias, hparams) + encoder_decoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output @@ -84,17 +86,20 @@ def transformer_prepare_encoder(inputs, target_space, hparams): Returns: encoder_input: a Tensor, bottom of encoder stack - encoder_self_attention_bias: a Tensor, containing large negative values - to implement masked attention and possibly baises for diagonal - alignments - encoder_padding: a Tensor + encoder_self_attention_bias: a bias tensor for use in encoder self-attention + encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder + attention """ - # Flatten inputs. ishape_static = inputs.shape.as_list() encoder_input = inputs encoder_padding = common_attention.embedding_to_padding(encoder_input) - encoder_self_attention_bias = common_attention.attention_bias_ignore_padding( + ignore_padding = common_attention.attention_bias_ignore_padding( encoder_padding) + encoder_self_attention_bias = ignore_padding + encoder_decoder_attention_bias = ignore_padding + if hparams.proximity_bias: + encoder_self_attention_bias += common_attention.attention_bias_proximal( + tf.shape(inputs)[1]) # Append target_space_id embedding to inputs. 
emb_target_space = common_layers.embedding( target_space, 32, ishape_static[-1], name="target_space_embedding") @@ -102,7 +107,9 @@ def transformer_prepare_encoder(inputs, target_space, hparams): encoder_input += emb_target_space if hparams.pos == "timing": encoder_input = common_attention.add_timing_signal_1d(encoder_input) - return (encoder_input, encoder_self_attention_bias, encoder_padding) + return (encoder_input, + encoder_self_attention_bias, + encoder_decoder_attention_bias) def transformer_prepare_decoder(targets, hparams): @@ -114,11 +121,13 @@ def transformer_prepare_decoder(targets, hparams): Returns: decoder_input: a Tensor, bottom of decoder stack - decoder_self_attention_bias: a Tensor, containing large negative values - to implement masked attention and possibly baises for diagonal alignments + decoder_self_attention_bias: a bias tensor for use in decoder self-attention """ decoder_self_attention_bias = ( common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) + if hparams.proximity_bias: + decoder_self_attention_bias += common_attention.attention_bias_proximal( + tf.shape(targets)[1]) decoder_input = common_layers.shift_left_3d(targets) if hparams.pos == "timing": decoder_input = common_attention.add_timing_signal_1d(decoder_input) @@ -261,6 +270,7 @@ def transformer_ffn_layer(x, hparams): def transformer_base(): """Set of hyperparameters.""" hparams = common_hparams.basic_params1() + hparams.norm_type = "layer" hparams.hidden_size = 512 hparams.batch_size = 4096 hparams.max_length = 256 @@ -295,6 +305,7 @@ def transformer_base(): hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) + hparams.add_hparam("proximity_bias", int(False)) return hparams @@ -541,13 +552,16 @@ def transformer_parameter_attention_b(): return hparams -@registry.register_ranged_hparams("transformer_big_single_gpu") -def transformer_range1(rhp): +@registry.register_ranged_hparams("transformer_base") +def transformer_base_range(rhp): """Small range of hyperparameters.""" - hparams = transformer_big_single_gpu() + hparams = transformer_base() common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) - + # After starting from base, set intervals for some parameters.
rhp.set_float("learning_rate", 0.3, 3.0, scale=rhp.LOG_SCALE) + rhp.set_discrete("learning_rate_warmup_steps", + [1000, 2000, 4000, 8000, 16000]) rhp.set_float("initializer_gain", 0.5, 2.0) + rhp.set_float("optimizer_adam_beta2", 0.85, 0.95) rhp.set_float("optimizer_adam_beta2", 0.97, 0.99) rhp.set_float("weight_decay", 0.0, 2.0) diff --git a/tensor2tensor/models/transformer_alternative.py b/tensor2tensor/models/transformer_alternative.py index 78398471a..1f20bfb51 100644 --- a/tensor2tensor/models/transformer_alternative.py +++ b/tensor2tensor/models/transformer_alternative.py @@ -50,11 +50,10 @@ def model_fn_body(self, features): inputs = common_layers.flatten4d3d(inputs) targets = common_layers.flatten4d3d(targets) - - (encoder_input, encoder_attention_bias, _) = (transformer.\ - transformer_prepare_encoder(inputs, target_space, hparams) ) - (decoder_input, decoder_self_attention_bias) = transformer.\ - transformer_prepare_decoder(targets, hparams) + (encoder_input, encoder_attention_bias, _) = ( + transformer.transformer_prepare_encoder(inputs, target_space, hparams)) + (decoder_input, _) = ( + transformer.transformer_prepare_decoder(targets, hparams)) encoder_mask = bias_to_mask(encoder_attention_bias) @@ -76,8 +75,9 @@ def residual_fn(x, y): return decoder_output - + def composite_layer(inputs, mask, hparams, for_output=False): + """Composite layer.""" x = inputs # Applies ravanbakhsh on top of each other. @@ -85,33 +85,31 @@ def composite_layer(inputs, mask, hparams, for_output=False): for layer in xrange(hparams.layers_per_layer): with tf.variable_scope(".%d" % layer): x = common_layers.ravanbakhsh_set_layer( - hparams.hidden_size, - x, - mask=mask, - sequential=for_output, - dropout=hparams.relu_dropout) - - # Transforms elements to get a context, and then uses this in a final layer + hparams.hidden_size, + x, + mask=mask, + sequential=for_output, + dropout=hparams.relu_dropout) + + # Transforms elements to get a context, and then uses this in a final layer. elif hparams.composite_layer_type == "reembedding": # Transform elements n times and then pool. for layer in xrange(hparams.layers_per_layer): with tf.variable_scope("sub_layer_%d" % layer): x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - dropout=hparams.relu_dropout) + hparams.hidden_size, + x, + dropout=hparams.relu_dropout) if for_output: context = common_layers.running_global_pool_1d(x) else: context = common_layers.global_pool_1d(x, mask=mask) - - #Final layer + # Final layer. 
x = common_layers.linear_set_layer( - hparams.hidden_size, - x, - context=context, - dropout=hparams.relu_dropout) - + hparams.hidden_size, + x, + context=context, + dropout=hparams.relu_dropout) return x @@ -120,16 +118,13 @@ def alt_transformer_encoder(encoder_input, mask, hparams, name="encoder"): - """Alternative encoder.""" x = encoder_input - with tf.variable_scope(name): x = encoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = residual_fn(x, composite_layer(x, mask, hparams)) - return x @@ -139,12 +134,11 @@ def alt_transformer_decoder(decoder_input, encoder_decoder_attention_bias, hparams, name="decoder"): - + """Alternative decoder.""" with tf.variable_scope(name): x = decoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x_ = common_attention.multihead_attention( x, encoder_output, @@ -156,23 +150,21 @@ def alt_transformer_decoder(decoder_input, hparams.attention_dropout, name="encdec_attention") - x_ = residual_fn(x_, composite_layer(x_, None, hparams, for_output=True)) + x_ = residual_fn(x_, composite_layer(x_, None, hparams, + for_output=True)) x = residual_fn(x, x_) - return x def bias_to_mask(bias): - # We need masks of the form batch size x input sequences - # Biases are of the form batch_size x num_heads x input sequences x - # output sequences. Squeeze out dim one, and get the first element of - # each vector. - - bias = tf.squeeze(bias, [1])[:,:,0] - bias = - tf.clip_by_value(bias, -1.0, 1.0) - mask = 1 - bias - return mask - + # We need masks of the form batch size x input sequences + # Biases are of the form batch_size x num_heads x input sequences x + # output sequences. Squeeze out dim one, and get the first element of + # each vector. + bias = tf.squeeze(bias, [1])[:, :, 0] + bias = - tf.clip_by_value(bias, -1.0, 1.0) + mask = 1 - bias + return mask @registry.register_hparams @@ -182,7 +174,6 @@ def transformer_alt(): hparams.batch_size = 2048 hparams.num_hidden_layers = 10 hparams.add_hparam("layers_per_layer", 4) - hparams.add_hparam("composite_layer_type", "ravanbakhsh") #ravanbakhsh or reembedding - #hparams.add_hparam("composite_layer_type", "reembedding") - + # Composite layer: ravanbakhsh or reembedding. 
+ hparams.add_hparam("composite_layer_type", "ravanbakhsh") return hparams diff --git a/tensor2tensor/models/transformer_test.py b/tensor2tensor/models/transformer_test.py index a7f1fc9ae..8f4d26339 100644 --- a/tensor2tensor/models/transformer_test.py +++ b/tensor2tensor/models/transformer_test.py @@ -51,7 +51,7 @@ def _testTransformer(self, net): "target_space_id": tf.constant(1, dtype=tf.int32), } model = net(hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - shadred_logits, _, _ = model.model_fn(features) + shadred_logits, _ = model.model_fn(features) logits = tf.concat(shadred_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/models/xception.py b/tensor2tensor/models/xception.py index 61fa61235..f2e69da21 100644 --- a/tensor2tensor/models/xception.py +++ b/tensor2tensor/models/xception.py @@ -86,7 +86,6 @@ def xception_base(): hparams.optimizer_adam_epsilon = 1e-6 hparams.optimizer_adam_beta1 = 0.85 hparams.optimizer_adam_beta2 = 0.997 - hparams.add_hparam("imagenet_use_2d", True) return hparams diff --git a/tensor2tensor/models/xception_test.py b/tensor2tensor/models/xception_test.py index bf434aeac..776d1306a 100644 --- a/tensor2tensor/models/xception_test.py +++ b/tensor2tensor/models/xception_test.py @@ -45,7 +45,7 @@ def testXception(self): } model = xception.Xception( hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams) - sharded_logits, _, _ = model.model_fn(features) + sharded_logits, _ = model.model_fn(features) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res = session.run(logits) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index ba5139433..05aa9bf26 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -27,7 +27,6 @@ from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.models import common_layers from tensor2tensor.utils import registry import tensorflow as tf @@ -127,35 +126,15 @@ def decode_record(record): return decode_record(example_serialized) -def preprocessing(examples, data_file_pattern, mode): +def preprocessing(examples, data_file_pattern): """Preprocessing of examples.""" + # This function is for obsolete problems only, as we're porting them + # all to the Problem class and its preprocess_examples method. Don't add. if "image" in data_file_pattern: - # Small single-example pre-processing for images. def resize(img, size): return tf.to_int64(tf.image.resize_images(img, [size, size])) - def preprocess(img): - img = tf.image.resize_images(img, [360, 360]) - img = common_layers.image_augmentation(tf.to_float(img) / 255.) - return tf.to_int64(img * 255.) - - if ("image_imagenet" in data_file_pattern or - "image_mscoco" in data_file_pattern): - examples["inputs"] = tf.cast(examples["inputs"], tf.int64) - # For imagnet/coco, resize images to 299x299 as is standard. - inputs = examples["inputs"] - if mode == tf.contrib.learn.ModeKeys.TRAIN: - examples["inputs"] = tf.cond( # Preprocess 80% of the time. 
- tf.less(tf.random_uniform([]), 0.8), - lambda img=inputs: preprocess(img), - lambda img=inputs: resize(img, 299)) - else: - examples["inputs"] = tf.to_int64(resize(inputs, 299)) - elif ("image_cifar10" in data_file_pattern and - mode == tf.contrib.learn.ModeKeys.TRAIN): - examples["inputs"] = common_layers.cifar_image_augmentation( - examples["inputs"]) - elif "img2img" in data_file_pattern: + if "img2img" in data_file_pattern: inputs = examples["inputs"] examples["inputs"] = resize(inputs, 16) examples["targets"] = resize(inputs, 64) @@ -163,7 +142,6 @@ def preprocess(img): inputs = examples["inputs"] examples["inputs"] = resize(inputs, 8) examples["targets"] = resize(inputs, 32) - elif "audio" in data_file_pattern: # Reshape audio to proper shape sample_count = tf.to_int32(examples.pop("audio/sample_count")) @@ -179,30 +157,6 @@ def preprocess(img): return examples -def problem_input_pipeline(problem, data_file_pattern, capacity, mode): - """Input pipeline for Problems.""" - data_fields, data_items_to_decoders = problem.example_reading_spec() - - # Create placeholders for input, rather than reading data from disk. - if data_file_pattern is None: - return feature_placeholders(data_fields) - - # Now the non-trivial case construction. - examples = examples_reader( - [data_file_pattern], - data_fields, - training=(mode == tf.contrib.learn.ModeKeys.TRAIN), - capacity=capacity, - data_items_to_decoders=data_items_to_decoders) - - examples = problem.preprocess_examples(examples, mode) - - # We do not want int64s as they are not supported on GPUs. - examples = cast_int64_to_int32(examples) - - return examples - - def cast_int64_to_int32(features): f = {} for k, v in six.iteritems(features): @@ -221,25 +175,14 @@ def feature_placeholders(data_fields): return feature_map -def input_pipeline(problem, data_file_pattern, capacity, mode): - """Input pipeline, returns a dictionary of tensors from queues.""" - - if problem is not None: - # problem is not None when the problem is specified with the Problem API, - # which handles Example decoding and preprocessing. - # Otherwise the problem is specified in problem_hparams and is dealt with - # below. - # As problems are ported to the Problem API, the special handling here will - # need to be moved to Problem.example_reading_spec and - # Problem.preprocessing. - return problem_input_pipeline(problem, data_file_pattern, capacity, mode) - +def default_example_reading_spec(data_file_pattern): + """Example reading spec for problem_hparams problems.""" + # This function is for problems that have yet to be ported to the new Problem + # API. Do not add here. data_items_to_decoders = None # Read from image TFRecords if the file has "image" in its name. 
if data_file_pattern and "image" in data_file_pattern: label_key = "image/class/label" - if "fsns" in data_file_pattern: - label_key = "image/unpadded_label" data_fields = { "image/encoded": tf.FixedLenFeature((), tf.string), "image/format": tf.FixedLenFeature((), tf.string), @@ -267,12 +210,21 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) } + return data_fields, data_items_to_decoders + + +def input_pipeline(problem, data_file_pattern, capacity, mode, hparams): + """Input pipeline, returns a dictionary of tensors from queues.""" + if problem is None: + data_fields, data_items_to_decoders = default_example_reading_spec( + data_file_pattern) + else: + data_fields, data_items_to_decoders = problem.example_reading_spec() - # Create placeholders for input, rather than reading data from disk. if data_file_pattern is None: + # Create placeholders for input, rather than reading data from disk. return feature_placeholders(data_fields) - # Now the non-trivial case construction. examples = examples_reader( [data_file_pattern], data_fields, @@ -280,10 +232,14 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): capacity=capacity, data_items_to_decoders=data_items_to_decoders) - examples = preprocessing(examples, data_file_pattern, mode) + if problem is None: + examples = preprocessing(examples, data_file_pattern) + else: + examples = problem.preprocess_examples(examples, mode, hparams) # We do not want int64s as they are not supported on GPUs. examples = cast_int64_to_int32(examples) + return examples diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 0baad2471..9d5e1e0a6 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -76,7 +76,7 @@ class Modalities(object): # Camel case to snake case utils _first_cap_re = re.compile("(.)([A-Z][a-z0-9]+)") -_all_cap_re = re.compile("([a-z])([A-Z])") +_all_cap_re = re.compile("([a-z0-9])([A-Z])") def _convert_camel_to_snake(name): diff --git a/tensor2tensor/utils/registry_test.py b/tensor2tensor/utils/registry_test.py index 3231809ea..62c24b054 100644 --- a/tensor2tensor/utils/registry_test.py +++ b/tensor2tensor/utils/registry_test.py @@ -94,8 +94,9 @@ def testSnakeCase(self): convert = registry._convert_camel_to_snake self.assertEqual("typical_camel_case", convert("TypicalCamelCase")) - self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2Gether")) - self.assertEqual("lstm_seq2seq", convert("LSTMSeq2Seq")) + self.assertEqual("numbers_fuse2gether", convert("NumbersFuse2gether")) + self.assertEqual("numbers_fuse2_gether", convert("NumbersFuse2Gether")) + self.assertEqual("lstm_seq2_seq", convert("LSTMSeq2Seq")) self.assertEqual("starts_lower", convert("startsLower")) self.assertEqual("starts_lower_caps", convert("startsLowerCAPS")) self.assertEqual("caps_fuse_together", convert("CapsFUSETogether")) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index f67cc9540..7cb484bc8 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -28,7 +28,6 @@ from tensor2tensor.utils import beam_search from tensor2tensor.utils import expert_utils as eu -from tensor2tensor.utils import modality from tensor2tensor.utils import registry import tensorflow as tf @@ -104,22 +103,18 @@ def _create_modalities(self, problem_hparams, hparams): input_modality_overrides = {} for override_str in hparams.input_modalities.split(";"): - parts = 
override_str.split(":") - feature_name = parts[0] - modality_name = ":".join(parts[1:]) - input_modality_overrides[feature_name] = modality_name + if override_str != "default": + parts = override_str.split(":") + feature_name = parts[0] + modality_name = ":".join(parts[1:]) + input_modality_overrides[feature_name] = modality_name target_modality_name = None - if hparams.target_modality: + if hparams.target_modality and hparams.target_modality != "default": target_modality_name = hparams.target_modality input_modality = {} for f, modality_spec in six.iteritems(problem_hparams.input_modality): - if isinstance(modality_spec, modality.Modality): - # This function has been previously run (e.g. for training and now is - # being called for eval) and the modalities have already been - # constructed. Return. - return if f in input_modality_overrides: _warn_changed_modality_type(input_modality_overrides[f], modality_spec[0], f) @@ -128,8 +123,6 @@ def _create_modalities(self, problem_hparams, hparams): problem_hparams.input_modality = input_modality target_modality_spec = problem_hparams.target_modality - if isinstance(target_modality_spec, modality.Modality): - return if target_modality_name: _warn_changed_modality_type(target_modality_name, target_modality_spec[0], "target") @@ -206,7 +199,7 @@ def symbols_to_logits_fn(ids): features["targets"] = ids self._coverage = None - sharded_logits, _, _ = self.model_fn( + sharded_logits, _ = self.model_fn( features, False, last_position_only=last_position_only) # now self._coverage is a coverage tensor for the first datashard. # it has shape [batch_size] and contains floats between 0 and @@ -330,7 +323,7 @@ def sample(self, features, last_position_only=False): Returns: samples: an integer `Tensor`. """ - sharded_logits, _, _ = self.model_fn( + sharded_logits, _ = self.model_fn( features, False, last_position_only=last_position_only) if self._hparams.sampling_method == "argmax": sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) @@ -362,7 +355,7 @@ def _shard_features(self, features): # pylint: disable=missing-docstring return sharded_features def model_fn(self, features, skip=False, last_position_only=False): - """Computes the entire model and produces sharded logits and training loss. + """Computes the entire model and produces sharded logits and losses. Args: features: A dictionary of feature name to tensor. @@ -372,7 +365,7 @@ def model_fn(self, features, skip=False, last_position_only=False): Returns: sharded_logits: a list of `Tensor`s, one per datashard. - training_loss: a floating point `Scalar`. + losses: a dictionary: {loss-name (string): floating point `Scalar`}. """ start_time = time.time() dp = self._data_parallelism @@ -417,10 +410,13 @@ def model_fn(self, features, skip=False, last_position_only=False): # Construct the model body. with tf.variable_scope("body", reuse=self._problem_idx > 0): if skip: - body_outputs, extra_loss = transformed_features["targets"], 0.0 + body_outputs = transformed_features["targets"] + losses = {"extra": 0.0} else: - body_outputs, extra_loss = self.model_fn_body_sharded( + body_outputs, losses = self.model_fn_body_sharded( transformed_features) + if isinstance(losses, tf.Tensor): # If it's a single extra loss. + losses = {"extra": losses} with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: @@ -447,7 +443,8 @@ def model_fn(self, features, skip=False, last_position_only=False): training_loss = None tf.logging.info("This model_fn took %.3f sec." 
% (time.time() - start_time)) - return sharded_logits, training_loss, extra_loss + losses["training"] = training_loss + return sharded_logits, losses def model_fn_body_sharded(self, sharded_features): """Mixture-of-experts models will override this function. @@ -472,10 +469,10 @@ def model_fn_body_sharded(self, sharded_features): _with_timing(self.model_fn_body, "model_fn_body"), datashard_to_features) if isinstance(output, tuple): - loss = tf.reduce_mean(output[1]) + loss = {"extra": tf.reduce_mean(output[1])} output = output[0] else: - loss = 0.0 + loss = {"extra": 0.0} return output, loss def model_fn_body(self, features): diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 260ec6a00..33053806d 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function +import copy import math import operator import os @@ -130,6 +131,7 @@ "\t..\t") flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") +flags.DEFINE_bool("identity_output", False, "To print the output as identity") def _save_until_eos(hyp): @@ -227,6 +229,24 @@ def log_registry(): sys.exit(0) +def add_problem_hparams(hparams, problems): + """Add problem hparams for the problems.""" + hparams.problems = [] + hparams.problem_instances = [] + for problem_name in problems.split("-"): + try: + problem = registry.problem(problem_name) + p_hparams = problem.internal_hparams(hparams) + except ValueError: + problem = None + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + + hparams.problem_instances.append(problem) + hparams.problems.append(p_hparams) + + return hparams + + def create_hparams(params_id, data_dir): """Returns hyperparameters, including any flag value overrides. @@ -247,21 +267,7 @@ def create_hparams(params_id, data_dir): if FLAGS.hparams: hparams = hparams.parse(FLAGS.hparams) - # Add hparams for the problems - hparams.problems = [] - hparams.problem_instances = [] - for problem_name in FLAGS.problems.split("-"): - try: - problem = registry.problem(problem_name) - p_hparams = problem.internal_hparams(hparams) - except ValueError: - problem = None - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) - - hparams.problem_instances.append(problem) - hparams.problems.append(p_hparams) - - return hparams + return add_problem_hparams(hparams, FLAGS.problems) def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): @@ -410,11 +416,22 @@ def model_fn(features, targets, mode): Returns: A tuple consisting of the prediction, loss, and train_op. """ + # Deep-copy the model hparams between modes to eliminate + # side-effects caused by abuse of the linked problem_hparams + # objects which are used to share modality objects between + # problems. We do not want to share the modality objects between + # modes, since the modality objects may decide to do something + # mode-specific. A better fix would be to stop abusing the + # hparams in this way and instead use a separate dictionary to + # share the modality objects between problems. This dictionary + # could be created once per mode and passed to the constructor of + # t2t_model. 
+ my_hp = copy.deepcopy(hparams) if mode == tf.contrib.learn.ModeKeys.INFER: if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, hparams) + features = _interactive_input_tensor_to_features_dict(features, my_hp) elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, hparams) + features = _decode_input_tensor_to_features_dict(features, my_hp) # A dictionary containing: # - problem_choice: A Tensor containing an integer indicating which problem # was selected for this run. @@ -446,9 +463,9 @@ def model_fn(features, targets, mode): def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( - hparams, + my_hp, mode, - hparams.problems[n], + my_hp.problems[n], n, dp, _ps_devices(all_workers=True)) @@ -462,33 +479,36 @@ def nth_model(n): alpha=FLAGS.decode_alpha, decode_length=FLAGS.decode_extra_length) # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = hparams.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(hparams.problems) + skipping_is_on = my_hp.problem_choice == "distributed" and train + problem_worker_id = FLAGS.worker_id % len(my_hp.problems) skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id # On worker 0 also build graph for problems <= 1. # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, training_loss, extra_loss = model_class.model_fn( + sharded_logits, losses_dict = model_class.model_fn( features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): - loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) - o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n) - o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1) + total_loss, ops = 0.0, [] + for loss_key, loss_value in losses_dict.iteritems(): + loss_moving_avg = tf.get_variable("problem_%d/%s_loss" + % (n, loss_key)) + ops.append(loss_moving_avg.assign( + loss_moving_avg * 0.9 + loss_value * 0.1)) + total_loss += loss_value loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - total_loss = training_loss + extra_loss - o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1) + ops.append(loss_moving_avg.assign( + loss_moving_avg * 0.9 + total_loss * 0.1)) with tf.variable_scope("train_stats"): # Count steps for this problem. problem_steps = tf.get_variable( "problem_%d_steps" % n, initializer=0, trainable=False) - o4 = problem_steps.assign_add(1) - with tf.control_dependencies([o1, o2, o3, o4]): # Make sure the ops run. + ops.append(problem_steps.assign_add(1)) + with tf.control_dependencies(ops): # Make sure the ops run. # Ensure the loss is a scalar here. total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") return [total_loss] + sharded_logits # Need to flatten for cond later. result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(hparams.problems) - 1) + len(my_hp.problems) - 1) if mode == tf.contrib.learn.ModeKeys.INFER: # Beam search in sequence model returns both decodes withe key "outputs" @@ -524,11 +544,11 @@ def nth_model(n): # Some training statistics. 
with tf.name_scope("training_stats"): - learning_rate = hparams.learning_rate * learning_rate_decay() + learning_rate = my_hp.learning_rate * learning_rate_decay() learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) tf.summary.scalar("learning_rate", learning_rate) global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(hparams.problems)): + for n in xrange(len(my_hp.problems)): with tf.variable_scope("losses_avg", reuse=True): total_loss_var = tf.get_variable("problem_%d/total_loss" % n) training_loss_var = tf.get_variable("problem_%d/training_loss" % n) @@ -550,27 +570,27 @@ def nth_model(n): tf.logging.info("Weight %s\tshape %s\tsize %d", v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) total_size += v_size - if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: + if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1: # Add weight regularization if set and the weight is not a bias (dim>1). with tf.device(v._ref().device): # pylint: disable=protected-access v_loss = tf.nn.l2_loss(v) / v_size weight_decay_loss += v_loss is_body = len(v_name) > 5 and v_name[:5] == "body/" - if hparams.weight_noise > 0.0 and is_body: - # Add weight noise if set in hparams. + if my_hp.weight_noise > 0.0 and is_body: + # Add weight noise if set in my_hp. with tf.device(v._ref().device): # pylint: disable=protected-access scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale + noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale noise_op = v.assign_add(noise) with tf.control_dependencies([noise_op]): total_loss = tf.identity(total_loss) tf.logging.info("Total trainable variables size: %d", total_size) - if hparams.weight_decay > 0.0: - total_loss += weight_decay_loss * hparams.weight_decay + if my_hp.weight_decay > 0.0: + total_loss += weight_decay_loss * my_hp.weight_decay total_loss = tf.identity(total_loss, name="total_loss") # Define the train_op for the TRAIN mode. 
- opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) + opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp) tf.logging.info("Computing gradients for global model_fn.") opt_summaries = ["learning_rate", "loss"] if hparams.summarize_grads: @@ -580,7 +600,7 @@ def nth_model(n): loss=total_loss, global_step=tf.contrib.framework.get_global_step(), learning_rate=learning_rate, - clip_gradients=hparams.clip_grad_norm or None, + clip_gradients=my_hp.clip_grad_norm or None, gradient_noise_scale=hparams.grad_noise_scale or None, optimizer=opt, summaries=opt_summaries, @@ -766,8 +786,11 @@ def decode_interactively(estimator): else: tf.logging.info(beam_string) else: - tf.logging.info( - targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) + if FLAGS.identity_output: + tf.logging.info(" ".join(map(str, result["outputs"].flatten()))) + else: + tf.logging.info(targets_vocab.decode(_save_until_eos( + result["outputs"].flatten()))) def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, @@ -843,7 +866,7 @@ def _interactive_input_fn(hparams): const_array_size = 10000 while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image')\n" + " it= ('text' or 'image' or 'label')\n" " pr= (set the problem number)\n" " in= (set the input problem number)\n" " ou= (set the output problem number)\n" @@ -894,6 +917,13 @@ def _interactive_input_fn(hparams): "inputs": img, "problem_choice": np.array(problem_id) } + elif input_type == "label": + input_ids = [int(input_string)] + x = [num_samples, decode_length, len(input_ids)] + input_ids + yield problem_id, { + "inputs": np.array(x), + "problem_choice": np.array(problem_id) + } else: raise Exception("Unsupported input type.") @@ -1085,8 +1115,9 @@ def input_fn(): with tf.device("/cpu:0"): # Input reading on CPU capacity = p_hparams.max_expected_batch_size_per_shard capacity *= num_datashards - examples = data_reader.input_pipeline( - problem_instance, data_file_patterns[n], capacity, mode) + examples = data_reader.input_pipeline(problem_instance, + data_file_patterns[n], + capacity, mode, hparams) feature_map = data_reader.batch_examples( examples, data_reader.hparams_to_batching_scheme( diff --git a/tensor2tensor/utils/trainer_utils_test.py b/tensor2tensor/utils/trainer_utils_test.py index ea88183c9..562279623 100644 --- a/tensor2tensor/utils/trainer_utils_test.py +++ b/tensor2tensor/utils/trainer_utils_test.py @@ -67,7 +67,7 @@ def setUpClass(cls): def testModelsImported(self): models = registry.list_models() - self.assertTrue("baseline_lstm_seq2seq" in models) + self.assertTrue("lstm_seq2seq" in models) def testHParamsImported(self): hparams = registry.list_hparams()
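Note on the return convention used throughout the test updates above: T2TModel.model_fn now returns (sharded_logits, losses), where losses is a dictionary of named scalar losses (keys such as "extra" and "training"), instead of the old (sharded_logits, training_loss, extra_loss) triple, which is why every call site now unpacks sharded_logits, _ = model.model_fn(features). Below is a minimal sketch of how a caller can fold the dictionary back into a single scalar, mirroring the accumulation nth_model() performs in trainer_utils.py; the helper name and the constant stand-in losses are illustrative only and not part of this change.

import tensorflow as tf

def combine_losses(losses_dict):
  # Sum the named losses (e.g. "training" and "extra") into one scalar,
  # the same accumulation nth_model() does before its control dependencies.
  total_loss = 0.0
  for _, loss_value in sorted(losses_dict.items()):
    total_loss += loss_value
  return total_loss

# Hypothetical stand-ins for the values model_fn would return.
total = combine_losses({"training": tf.constant(1.2), "extra": tf.constant(0.3)})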