From 93177b02cb72e1b0dd585b6f3a40f7382eefd47a Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 20 Jul 2017 12:37:13 +0000 Subject: [PATCH 01/10] Added ice_parsing_tokens to problem registry --- tensor2tensor/bin/t2t-datagen | 5 -- .../data_generators/generator_utils.py | 4 +- .../data_generators/problem_hparams.py | 36 ------------- tensor2tensor/data_generators/wmt.py | 53 +++++++++++++++++-- tensor2tensor/models/transformer.py | 14 +++++ tensor2tensor/utils/registry.py | 6 +-- 6 files changed, 69 insertions(+), 49 deletions(-) mode change 100644 => 100755 tensor2tensor/data_generators/problem_hparams.py mode change 100644 => 100755 tensor2tensor/data_generators/wmt.py mode change 100644 => 100755 tensor2tensor/models/transformer.py mode change 100644 => 100755 tensor2tensor/utils/registry.py diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index cbf0a6164..2f8a418e2 100755 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -109,11 +109,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "ice_parsing_tokens": ( - lambda: wmt.tabbed_parsing_token_generator(FLAGS.tmp_dir, - True, "ice", 2**13, 2**8), - lambda: wmt.tabbed_parsing_token_generator(FLAGS.tmp_dir, - False, "ice", 2**13, 2**8)), "ice_parsing_characters": ( lambda: wmt.tabbed_parsing_character_generator(FLAGS.tmp_dir, True), lambda: wmt.tabbed_parsing_character_generator(FLAGS.tmp_dir, False)), diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 20f3959d8..51c8a5899 100755 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -266,7 +266,7 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None): for source in sources: url = source[0] filename = os.path.basename(url) - read_type = "r:gz" if "tgz" in filename else "r" + read_type = "r:gz" if filename.endswith(".tgz") else "r" compressed_file = maybe_download(tmp_dir, filename, url) @@ -278,7 +278,7 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None): filepath = os.path.join(tmp_dir, lang_file) # For some datasets a second extraction is necessary. - if ".gz" in lang_file: + if lang_file.endswith(".gz"): new_filepath = os.path.join(tmp_dir, lang_file[:-3]) if tf.gfile.Exists(new_filepath): tf.logging.info( diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py old mode 100644 new mode 100755 index 70b9dada8..e071ba60d --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -569,41 +569,6 @@ def wsj_parsing_tokens(model_hparams, return p -def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): - """Icelandic to parse tree translation benchmark. - - Args: - model_hparams: a tf.contrib.training.HParams - wrong_source_vocab_size: a number used in the filename indicating the - approximate vocabulary size. This is not to be confused with the actual - vocabulary size. - - Returns: - A tf.contrib.training.HParams object. - """ - p = default_problem_hparams() - # This vocab file must be present within the data directory. 
- source_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_source.tokens.vocab.%d" % wrong_source_vocab_size) - target_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_target.tokens.vocab.256") - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - p.input_modality = { - "inputs": (registry.Modalities.SYMBOL, source_subtokenizer.vocab_size) - } - p.target_modality = (registry.Modalities.SYMBOL, 256) - p.vocabulary = { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - p.input_space_id = 18 # Icelandic tokens - p.target_space_id = 19 # Icelandic parse tokens - return p - - def image_cifar10(unused_model_hparams): """CIFAR-10.""" p = default_problem_hparams() @@ -720,7 +685,6 @@ def img2img_imagenet(unused_model_hparams): "wiki_32k": wiki_32k, "lmptb_10k": lmptb_10k, "ice_parsing_characters": wmt_parsing_characters, - "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py old mode 100644 new mode 100755 index de5a25e13..f8b20a0e4 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -37,6 +37,9 @@ FLAGS = tf.flags.FLAGS +# End-of-sentence marker. +EOS = text_encoder.EOS_TOKEN + @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(problem.Problem): @@ -81,6 +84,53 @@ def _default_wmt_feature_encoders(data_dir, target_vocab_size): "targets": subtokenizer, } + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word + + 
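For reference, the new problem consumes tab-separated source/target pairs (one sentence and one flattened parse per line, split on the first tab, as described in the tabbed_generator docstring later in this patch). A minimal, self-contained sketch of that layout and of the {"inputs", "targets"} dictionaries the generator yields is given below; the sentence, the parse string, and the toy whitespace encoder are hypothetical stand-ins for the real data and the SubwordTextEncoder vocab files, used only to illustrate the shape of the examples.

    # Toy sketch (assumed format) of the tab-separated pairs read by
    # tabbed_parsing_token_generator. The vocabularies here are hypothetical
    # stand-ins for the ice_source/ice_target SubwordTextEncoder files.
    EOS = 1

    def toy_encode(text, vocab):
        # Stand-in for SubwordTextEncoder.encode: one integer id per whitespace token.
        return [vocab.setdefault(tok, len(vocab) + 2) for tok in text.split()]

    pairs = ["Hundurinn eltir kettina\tP no_et_nf so_3 no_ft_tho"]  # hypothetical pair
    source_vocab, target_vocab = {}, {}
    for line in pairs:
        source, target = line.split("\t", 1)
        print({"inputs": toy_encode(source, source_vocab) + [EOS],
               "targets": toy_encode(target, target_vocab) + [EOS]})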
@registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(problem.Problem): """Problem spec for SETimes Mk-En translation.""" @@ -107,9 +157,6 @@ def hparams(self, defaults, unused_model_hparams): p.input_space_id = problem.SpaceID.MK_TOK p.target_space_id = problem.SpaceID.EN_TOK -# End-of-sentence marker. -EOS = text_encoder.EOS_TOKEN - def character_generator(source_path, target_path, character_vocab, eos=None): """Generator for sequence-to-sequence tasks that just uses characters. diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py old mode 100644 new mode 100755 index b341d6fe0..042ce797e --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -375,6 +375,20 @@ def transformer_parsing_ice(): return hparams +@registry.register_hparams +def transformer_parsing_ice_big(): + """Hparams for parsing Icelandic text, bigger model.""" + hparams = transformer_parsing_ice() + hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU + hparams.attention_dropout = 0.2 + hparams.residual_dropout = 0.2 + hparams.max_length = 512 + hparams.learning_rate_warmup_steps = 16000 + hparams.hidden_size = 1024 + hparams.learning_rate = 0.05 + return hparams + + @registry.register_hparams def transformer_tiny(): hparams = transformer_base() diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py old mode 100644 new mode 100755 index 5a8823510..c9adfb692 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -222,10 +222,10 @@ def parse_problem_name(problem_name): was_copy: A boolean. """ # Recursively strip tags until we reach a base name. - if len(problem_name) > 4 and problem_name[-4:] == "_rev": + if problem_name.endswith("_rev"): base, _, was_copy = parse_problem_name(problem_name[:-4]) return base, True, was_copy - elif len(problem_name) > 5 and problem_name[-5:] == "_copy": + elif problem_name.endswith("_copy"): base, was_reversed, _ = parse_problem_name(problem_name[:-5]) return base, was_reversed, True else: @@ -338,7 +338,7 @@ def list_modalities(): def parse_modality_name(name): - name_parts = name.split(":") + name_parts = name.split(":", maxsplit=1) if len(name_parts) < 2: name_parts.append("default") modality_type, modality_name = name_parts From fa92cbe2f293e1189d7481fdab656f724f4bc977 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 20 Jul 2017 19:28:00 +0000 Subject: [PATCH 02/10] Adaptation to upstream changes --- tensor2tensor/bin/t2t-trainer | 0 tensor2tensor/data_generators/wmt.py | 98 +++++++++++++--------------- 2 files changed, 47 insertions(+), 51 deletions(-) mode change 100644 => 100755 tensor2tensor/bin/t2t-trainer diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer old mode 100644 new mode 100755 diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 2d0902dac..d0f7abaec 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -75,57 +75,6 @@ def train_generator(self): """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" raise NotImplementedError() - -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): - """Problem spec for parsing tokenized Icelandic text to - constituency trees, also tokenized but to a smaller vocabulary.""" - - @property - def source_vocab_size(self): - return 2**13 # 8192 - - @property - def target_vocab_size(self): - return 2**8 # 256 - - def 
feature_encoders(self, data_dir): - source_vocab_filename = os.path.join( - data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) - target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - return { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - - def generate_data(self, data_dir, tmp_dir, num_shards=100): - generator_utils.generate_dataset_and_shuffle( - tabbed_parsing_token_generator(tmp_dir, True, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - tabbed_parsing_token_generator(tmp_dir, False, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def hparams(self, defaults, unused_model_hparams): - p = defaults - source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK - p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - - -@registry.register_problem("setimes_mken_tokens_32k") -class SETimesMkEnTokens32k(problem.Problem): - """Problem spec for SETimes Mk-En translation.""" - @property def dev_generator(self): return self.train_generator @@ -734,3 +683,50 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): tree_filepath = os.path.join(tmp_dir, filename) return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab, symbolizer_vocab, EOS) + + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough 
estimate of avg number of tokens per word + From d69ad4d3e7619dd898e648743125517f26a43d44 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 28 Jul 2017 16:21:15 +0000 Subject: [PATCH 03/10] Moved Icelandic parsing to separate module --- tensor2tensor/data_generators/wmt.py | 127 ++++++----------------- tensor2tensor/ice_parsing/__init__.py | 2 + tensor2tensor/ice_parsing/ice_parsing.py | 127 +++++++++++++++++++++++ tensor2tensor/models/transformer.py | 23 ---- 4 files changed, 159 insertions(+), 120 deletions(-) create mode 100644 tensor2tensor/ice_parsing/__init__.py create mode 100755 tensor2tensor/ice_parsing/ice_parsing.py diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index d0f7abaec..3d01ab46e 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -162,36 +162,6 @@ def character_generator(source_path, target_path, character_vocab, eos=None): source, target = source_file.readline(), target_file.readline() -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - def token_generator(source_path, target_path, token_vocab, eos=None): """Generator for sequence-to-sequence tasks that uses tokens. @@ -255,6 +225,36 @@ def bi_vocabs_token_generator(source_path, source, target = source_file.readline(), target_file.readline() +def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): + r"""Generator for sequence-to-sequence tasks using tabbed files. + + Tokens are derived from text files where each line contains both + a source and a target string. The two strings are separated by a tab + character ('\t'). It yields dictionaries of "inputs" and "targets" where + inputs are characters from the source lines converted to integers, and + targets are characters from the target lines, also converted to integers. + + Args: + source_path: path to the file with source and target sentences. + source_vocab: a SunwordTextEncoder to encode the source string. + target_vocab: a SunwordTextEncoder to encode the target string. + eos: integer to append at the end of each sequence (default: None). 
+ + Yields: + A dictionary {"inputs": source-line, "targets": target-line} where + the lines are integer lists converted from characters in the file lines. + """ + eos_list = [] if eos is None else [eos] + with tf.gfile.GFile(source_path, mode="r") as source_file: + for line in source_file: + if line and "\t" in line: + parts = line.split("\t", maxsplit=1) + source, target = parts[0].strip(), parts[1].strip() + source_ints = source_vocab.encode(source) + eos_list + target_ints = target_vocab.encode(target) + eos_list + yield {"inputs": source_ints, "targets": target_ints} + + # Data-set URLs. @@ -654,28 +654,6 @@ def parsing_character_generator(tmp_dir, train): return character_generator(text_filepath, tags_filepath, character_vocab, EOS) -def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, - source_vocab_size, target_vocab_size): - """Generate source and target data from a single file.""" - source_vocab = generator_utils.get_or_generate_tabbed_vocab( - data_dir, tmp_dir, "parsing_train.pairs", 0, - prefix + "_source.vocab.%d" % source_vocab_size, source_vocab_size) - target_vocab = generator_utils.get_or_generate_tabbed_vocab( - data_dir, tmp_dir, "parsing_train.pairs", 1, - prefix + "_target.vocab.%d" % target_vocab_size, target_vocab_size) - filename = "parsing_%s" % ("train" if train else "dev") - pair_filepath = os.path.join(tmp_dir, filename + ".pairs") - return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS) - - -def tabbed_parsing_character_generator(tmp_dir, train): - """Generate source and target data from a single file.""" - character_vocab = text_encoder.ByteTextEncoder() - filename = "parsing_%s" % ("train" if train else "dev") - pair_filepath = os.path.join(tmp_dir, filename + ".pairs") - return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) - - def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): symbolizer_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) @@ -685,48 +663,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): symbolizer_vocab, EOS) -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): - """Problem spec for parsing tokenized Icelandic text to - constituency trees, also tokenized but to a smaller vocabulary.""" - - @property - def source_vocab_size(self): - return 2**13 # 8192 - - @property - def target_vocab_size(self): - return 2**8 # 256 - - def feature_encoders(self, data_dir): - source_vocab_filename = os.path.join( - data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) - target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - return { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - - def generate_data(self, data_dir, tmp_dir, num_shards=100): - generator_utils.generate_dataset_and_shuffle( - tabbed_parsing_token_generator(tmp_dir, True, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - tabbed_parsing_token_generator(tmp_dir, False, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def hparams(self, defaults, unused_model_hparams): - p = defaults - source_vocab_size = 
self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK - p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - diff --git a/tensor2tensor/ice_parsing/__init__.py b/tensor2tensor/ice_parsing/__init__.py new file mode 100644 index 000000000..36f468dcb --- /dev/null +++ b/tensor2tensor/ice_parsing/__init__.py @@ -0,0 +1,2 @@ + +from .ice_parsing import IceParsingTokens, transformer_parsing_ice, transformer_parsing_ice_big diff --git a/tensor2tensor/ice_parsing/ice_parsing.py b/tensor2tensor/ice_parsing/ice_parsing.py new file mode 100755 index 000000000..d8dd41cf7 --- /dev/null +++ b/tensor2tensor/ice_parsing/ice_parsing.py @@ -0,0 +1,127 @@ +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This module implements the ice_parsing_* problems, which +# parse plain text into flattened parse trees and POS tags. +# The training data is stored in files named `parsing_train.pairs` +# and `parsing_dev.pairs`. These files are UTF-8 text files where +# each line contains an input sentence and a target parse tree, +# separated by a tab character. + +import os + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators.wmt import tabbed_generator +from tensor2tensor.utils import registry +from tensor2tensor.models import transformer + +import tensorflow as tf + + +# End-of-sentence marker. 
+EOS = text_encoder.EOS_ID + + +def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, + source_vocab_size, target_vocab_size): + """Generate source and target data from a single file.""" + filename = "parsing_{0}.pairs".format("train" if train else "dev") + source_vocab = generator_utils.get_or_generate_tabbed_vocab( + data_dir, tmp_dir, filename, 0, + prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size) + target_vocab = generator_utils.get_or_generate_tabbed_vocab( + data_dir, tmp_dir, filename, 1, + prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size) + pair_filepath = os.path.join(tmp_dir, filename) + return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS) + + +def tabbed_parsing_character_generator(tmp_dir, train): + """Generate source and target data from a single file.""" + character_vocab = text_encoder.ByteTextEncoder() + filename = "parsing_{0}.pairs".format("train" if train else "dev") + pair_filepath = os.path.join(tmp_dir, filename) + return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) + + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word + + +@registry.register_hparams +def transformer_parsing_ice(): + """Hparams for parsing Icelandic text.""" + hparams = transformer.transformer_base_single_gpu() + hparams.batch_size = 4096 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + +@registry.register_hparams +def transformer_parsing_ice_big(): + """Hparams for parsing Icelandic text, bigger model.""" + hparams = transformer_parsing_ice() + hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU + hparams.attention_dropout = 0.05 + hparams.residual_dropout = 0.05 + hparams.max_length = 512 + hparams.hidden_size = 1024 + return hparams + diff --git 
a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index f03f173e2..0489567a0 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -357,29 +357,6 @@ def transformer_parsing_big(): return hparams -@registry.register_hparams -def transformer_parsing_ice(): - """Hparams for parsing Icelandic text.""" - hparams = transformer_base_single_gpu() - hparams.batch_size = 4096 - hparams.shared_embedding_and_softmax_weights = int(False) - return hparams - - -@registry.register_hparams -def transformer_parsing_ice_big(): - """Hparams for parsing Icelandic text, bigger model.""" - hparams = transformer_parsing_ice() - hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU - hparams.attention_dropout = 0.2 - hparams.residual_dropout = 0.2 - hparams.max_length = 512 - hparams.learning_rate_warmup_steps = 16000 - hparams.hidden_size = 1024 - hparams.learning_rate = 0.05 - return hparams - - @registry.register_hparams def transformer_tiny(): hparams = transformer_base() From d2af7cfe0b74a0e5ec3e0621ff7ad0e0776281df Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 11:18:31 +0000 Subject: [PATCH 04/10] Added readline import in trainer_utils.py --- tensor2tensor/utils/trainer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 tensor2tensor/utils/trainer_utils.py diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py old mode 100644 new mode 100755 index 260ec6a00..dc8238c51 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -841,6 +841,11 @@ def _interactive_input_fn(hparams): vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. const_array_size = 10000 + # For ease of input, activate the readline module if available. 
+ try: + import readline + except ImportError: + pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image')\n" @@ -848,7 +853,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" From f2714e93b05753e364e34c1fe9fd69d6e405f5c1 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 11:53:15 +0000 Subject: [PATCH 05/10] Sync with upstream --- tensor2tensor/utils/trainer_utils.py | 1113 ++------------------------ 1 file changed, 50 insertions(+), 1063 deletions(-) mode change 100755 => 100644 tensor2tensor/utils/trainer_utils.py diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py old mode 100755 new mode 100644 index dc8238c51..9e869c15c --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,37 +19,24 @@ from __future__ import division from __future__ import print_function -import math -import operator -import os import sys # Dependency imports -import numpy as np -import six -# pylint: disable=redefined-builtin -from six.moves import input -from six.moves import xrange -# pylint: enable=redefined-builtin - from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.data_generators import text_encoder from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import data_reader -from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import decoding +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder from tensor2tensor.utils import metrics +from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry -from tensor2tensor.utils import yellowfin import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner from tensorflow.python import debug -from tensorflow.python.ops import init_ops - -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 flags = tf.flags FLAGS = flags.FLAGS @@ -130,16 +117,7 @@ "\t..\t") flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") - - -def _save_until_eos(hyp): - """Strips everything after the first token, which is normally 1.""" - try: - index = list(hyp).index(text_encoder.EOS_ID) - return hyp[0:index] - except ValueError: - # No EOS_ID: return the array as-is. 
- return hyp +flags.DEFINE_bool("identity_output", False, "To print the output as identity") def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): @@ -179,8 +157,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns["train"], - eval_input_fn=input_fns["eval"], + train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, @@ -193,22 +171,26 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): """Constructs and returns Estimator and train/eval input functions.""" tf.logging.info("Creating experiment, storing model files in %s", output_dir) - num_datashards = data_parallelism().n - train_input_fn = get_input_fn( + num_datashards = devices.data_parallelism().n + train_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), - num_datashards=num_datashards) + num_datashards=num_datashards, + worker_replicas=FLAGS.worker_replicas, + worker_id=FLAGS.worker_id) - eval_input_fn = get_input_fn( + eval_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), - num_datashards=num_datashards) + num_datashards=num_datashards, + worker_replicas=FLAGS.worker_replicas, + worker_id=FLAGS.worker_id) estimator = tf.contrib.learn.Estimator( - model_fn=model_builder(model_name, hparams=hparams), + model_fn=model_builder.build_model_fn(model_name, hparams=hparams), model_dir=output_dir, config=tf.contrib.learn.RunConfig( master=FLAGS.master, @@ -218,7 +200,10 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): keep_checkpoint_max=FLAGS.keep_checkpoint_max)) # Store the hparams in the estimator as well estimator.hparams = hparams - return estimator, {"train": train_input_fn, "eval": eval_input_fn} + return estimator, { + tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + } def log_registry(): @@ -227,6 +212,24 @@ def log_registry(): sys.exit(0) +def add_problem_hparams(hparams, problems): + """Add problem hparams for the problems.""" + hparams.problems = [] + hparams.problem_instances = [] + for problem_name in problems.split("-"): + try: + problem = registry.problem(problem_name) + p_hparams = problem.internal_hparams(hparams) + except ValueError: + problem = None + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + + hparams.problem_instances.append(problem) + hparams.problems.append(p_hparams) + + return hparams + + def create_hparams(params_id, data_dir): """Returns hyperparameters, including any flag value overrides. 
@@ -247,21 +250,7 @@ def create_hparams(params_id, data_dir): if FLAGS.hparams: hparams = hparams.parse(FLAGS.hparams) - # Add hparams for the problems - hparams.problems = [] - hparams.problem_instances = [] - for problem_name in FLAGS.problems.split("-"): - try: - problem = registry.problem(problem_name) - p_hparams = problem.internal_hparams(hparams) - except ValueError: - problem = None - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) - - hparams.problem_instances.append(problem) - hparams.problems.append(p_hparams) - - return hparams + return add_problem_hparams(hparams, FLAGS.problems) def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): @@ -289,7 +278,11 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if schedule == "local_run": # Run the local demo. - run_locally(exp_fn(output_dir)) + exp = exp_fn(output_dir) + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") + exp.train_and_evaluate() + decode(exp.estimator) else: # Perform distributed training/evaluation. learn_runner.run( @@ -334,1020 +327,14 @@ def session_config(): return config -def model_builder(model, hparams): - """Returns a function to build the model. - - Args: - model: The name of the model to use. - hparams: The hyperparameters. - - Returns: - A function to build the model's graph. This function is called by - the Estimator object to construct the graph. - """ - - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif 
hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): - """Creates the prediction, loss, and train ops. - - Args: - features: A dictionary of tensors keyed by the feature name. - targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - - Returns: - A tuple consisting of the prediction, loss, and train_op. - """ - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, hparams) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, hparams) - # A dictionary containing: - # - problem_choice: A Tensor containing an integer indicating which problem - # was selected for this run. - # - predictions: A Tensor containing the model's output predictions. - run_info = dict() - run_info["problem_choice"] = features["problem_choice"] - - if targets is not None: - features["targets"] = targets - - dp = data_parallelism() - - # Add input statistics for incoming features. - with tf.name_scope("input_stats"): - for (k, v) in six.iteritems(features): - if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: - tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) - tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) - nonpadding = tf.to_float(tf.not_equal(v, 0)) - tf.summary.scalar("%s_nonpadding_tokens" % k, - tf.reduce_sum(nonpadding)) - tf.summary.scalar("%s_nonpadding_fraction" % k, - tf.reduce_mean(nonpadding)) - - tf.get_variable_scope().set_initializer(initializer()) - train = mode == tf.contrib.learn.ModeKeys.TRAIN - - # Get multi-problem logits and loss based on features["problem_choice"]. - def nth_model(n): - """Build the model for the n-th problem, plus some added variables.""" - model_class = registry.model(model)( - hparams, - mode, - hparams.problems[n], - n, - dp, - _ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: - return model_class.infer( - features, - beam_size=FLAGS.decode_beam_size, - top_beams=(FLAGS.decode_beam_size - if FLAGS.decode_return_beams else 1), - last_position_only=FLAGS.decode_use_last_position_only, - alpha=FLAGS.decode_alpha, - decode_length=FLAGS.decode_extra_length) - # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = hparams.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(hparams.problems) - skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id - # On worker 0 also build graph for problems <= 1. - # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. 
- skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, training_loss, extra_loss = model_class.model_fn( - features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): - loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) - o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n) - o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - total_loss = training_loss + extra_loss - o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1) - with tf.variable_scope("train_stats"): # Count steps for this problem. - problem_steps = tf.get_variable( - "problem_%d_steps" % n, initializer=0, trainable=False) - o4 = problem_steps.assign_add(1) - with tf.control_dependencies([o1, o2, o3, o4]): # Make sure the ops run. - # Ensure the loss is a scalar here. - total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") - return [total_loss] + sharded_logits # Need to flatten for cond later. - - result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(hparams.problems) - 1) - - if mode == tf.contrib.learn.ModeKeys.INFER: - # Beam search in sequence model returns both decodes withe key "outputs" - # and scores with they key "scores". If return list is a dict, we expect - # that it will have keys "outputs", a tensor of int32 and scores, a - # tensor of floats. This is useful if we want to return scores from - # estimator.predict - if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None - else: - ret = { - "outputs": result_list["outputs"], - "scores": result_list["scores"] - }, None, None - if "inputs" in features: - ret[0]["inputs"] = features["inputs"] - if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret - - sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: - logits = tf.concat(sharded_logits, 0) - if FLAGS.eval_print: - logits = tf.Print( - logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) - # For evaluation, return the logits layer as our predictions. - run_info["predictions"] = logits - train_op = None - return run_info, total_loss, None - - assert mode == tf.contrib.learn.ModeKeys.TRAIN - - # Some training statistics. - with tf.name_scope("training_stats"): - learning_rate = hparams.learning_rate * learning_rate_decay() - learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) - tf.summary.scalar("learning_rate", learning_rate) - global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(hparams.problems)): - with tf.variable_scope("losses_avg", reuse=True): - total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) - with tf.variable_scope("train_stats", reuse=True): - nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) - tf.summary.scalar("problem_%d_frequency" % n, - tf.to_float(nth_steps) / (global_step + 1.0)) - - # Log trainable weights and add decay. 
- total_size, weight_decay_loss = 0, 0.0 - all_weights = {v.name: v for v in tf.trainable_variables()} - for v_name in sorted(list(all_weights)): - v = all_weights[v_name] - v_size = int(np.prod(np.array(v.shape.as_list()))) - tf.logging.info("Weight %s\tshape %s\tsize %d", - v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) - total_size += v_size - if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: - # Add weight regularization if set and the weight is not a bias (dim>1). - with tf.device(v._ref().device): # pylint: disable=protected-access - v_loss = tf.nn.l2_loss(v) / v_size - weight_decay_loss += v_loss - is_body = len(v_name) > 5 and v_name[:5] == "body/" - if hparams.weight_noise > 0.0 and is_body: - # Add weight noise if set in hparams. - with tf.device(v._ref().device): # pylint: disable=protected-access - scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale - noise_op = v.assign_add(noise) - with tf.control_dependencies([noise_op]): - total_loss = tf.identity(total_loss) - tf.logging.info("Total trainable variables size: %d", total_size) - if hparams.weight_decay > 0.0: - total_loss += weight_decay_loss * hparams.weight_decay - total_loss = tf.identity(total_loss, name="total_loss") - - # Define the train_op for the TRAIN mode. - opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) - tf.logging.info("Computing gradients for global model_fn.") - opt_summaries = ["learning_rate", "loss"] - if hparams.summarize_grads: - opt_summaries.extend(["gradients", "gradient_norm"]) - train_op = tf.contrib.layers.optimize_loss( - name="training", - loss=total_loss, - global_step=tf.contrib.framework.get_global_step(), - learning_rate=learning_rate, - clip_gradients=hparams.clip_grad_norm or None, - gradient_noise_scale=hparams.grad_noise_scale or None, - optimizer=opt, - summaries=opt_summaries, - colocate_gradients_with_ops=True) - - # Remove summaries that will fail to run because they are in conditionals. - # TODO(cwhipkey): Test with this code removed, later in 2017. - summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries) - 1, -1, -1): - if summaries[i].name.startswith("cond_"): - del summaries[i] - - tf.logging.info("Global model_fn finished.") - return run_info, total_loss, train_op - - return model_fn - - -def run_locally(exp): - """Runs an Experiment locally - trains, evaluates, and decodes. - - Args: - exp: Experiment. 
- """ - if exp.train_steps > 0 or exp.eval_steps > 0: - tf.logging.info("Performing local training and evaluation.") - exp.train_and_evaluate() - decode(exp.estimator) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def decode(estimator): if FLAGS.decode_interactive: - decode_interactively(estimator) + decoding.decode_interactively(estimator) elif FLAGS.decode_from_file is not None: - decode_from_file(estimator, FLAGS.decode_from_file) + decoding.decode_from_file(estimator, FLAGS.decode_from_file) elif FLAGS.decode_from_dataset: - decode_from_dataset(estimator) - - -def decode_from_dataset(estimator): - hparams = estimator.hparams - for i, problem in enumerate(FLAGS.problems.split("-")): - inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) - targets_vocab = hparams.problems[i].vocabulary["targets"] - tf.logging.info("Performing local inference.") - infer_problems_data = get_data_filepatterns(hparams.data_dir, - tf.contrib.learn.ModeKeys.INFER) - - infer_input_fn = get_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, - hparams=hparams, - data_file_patterns=infer_problems_data, - num_datashards=data_parallelism().n, - fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) - - def log_fn(inputs, - targets, - outputs, - problem, - j, - inputs_vocab=inputs_vocab, - targets_vocab=targets_vocab): - """Log inference results.""" - if "image" in problem and FLAGS.decode_save_images: - save_path = os.path.join(estimator.model_dir, - "%s_prediction_%d.jpg" % (problem, j)) - show_and_save_image(inputs / 255., save_path) - elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) - - if FLAGS.decode_to_file: - output_filepath = FLAGS.decode_to_file + ".outputs." + problem - output_file = tf.gfile.Open(output_filepath, "a") - output_file.write(decoded_outputs + "\n") - target_filepath = FLAGS.decode_to_file + ".targets." + problem - target_file = tf.gfile.Open(target_filepath, "a") - target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. 
- inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result - if FLAGS.decode_return_beams: - output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) - else: - log_fn(inputs, targets, outputs, problem, j) - - -def decode_from_file(estimator, filename): - """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams - problem_id = FLAGS.decode_problem_id - inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - tf.logging.info("Performing decoding from a file.") - sorted_inputs, sorted_keys = _get_sorted_inputs(filename) - num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - _save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) - - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) - - # Reversing the decoded inputs and outputs because they were reversed in - # _decode_batch_input_fn - sorted_inputs.reverse() - decodes.reverse() - # Dumping inputs and outputs to file filename.decodes in - # format result\tinput in the same order as original inputs - if FLAGS.decode_to_file: - output_filename = FLAGS.decode_to_file - else: - output_filename = filename - if FLAGS.decode_shards > 1: - base_filename = output_filename + ("%.2d" % FLAGS.worker_id) - else: - base_filename = output_filename - decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set - + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + - str(FLAGS.decode_alpha) + ".decodes") - tf.logging.info("Writing decodes into %s" % decode_filename) - outfile = tf.gfile.Open(decode_filename, "w") - for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) - - -def decode_interactively(estimator): - hparams = estimator.hparams - - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - tf.logging.info(beam_string) - else: - tf.logging.info( - targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) - - -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, - vocabulary): - tf.logging.info(" batch %d" % num_decode_batches) - # First reverse all the input sentences so that if you're going to get OOMs, - # you'll see it in the first batch - sorted_inputs.reverse() - for b in range(num_decode_batches): - tf.logging.info("Decoding batch %d" % b) - batch_length = 0 - batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( - b + 1) * FLAGS.decode_batch_size]: - input_ids = vocabulary.encode(inputs) - if FLAGS.decode_max_input_size > 0: - # Subtract 1 for the EOS_ID. - input_ids = input_ids[:FLAGS.decode_max_input_size - 1] - input_ids.append(text_encoder.EOS_ID) - batch_inputs.append(input_ids) - if len(input_ids) > batch_length: - batch_length = len(input_ids) - final_batch_inputs = [] - for input_ids in batch_inputs: - assert len(input_ids) <= batch_length - x = input_ids + [0] * (batch_length - len(input_ids)) - final_batch_inputs.append(x) - yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) - } - - -def get_data_filepatterns(data_dir, mode): - return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) - - -def _cond_on_index(fn, index_tensor, cur_idx, max_idx): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - return tf.cond( - tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), - lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) - - -def _interactive_input_fn(hparams): - """Generator that reads from the terminal and yields "interactive inputs". - - Due to temporary limitations in tf.learn, if we don't want to reload the - whole graph, then we are stuck encoding all of the input as one fixed-size - numpy array. - - We yield int64 arrays with shape [const_array_size]. The format is: - [num_samples, decode_length, len(input ids), , ] - - Args: - hparams: model hparams - Yields: - numpy arrays - - Raises: - Exception: when `input_type` is invalid. 
- """ - num_samples = 3 - decode_length = 100 - input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - # This should be longer than the longest input. - const_array_size = 10000 - # For ease of input, activate the readline module if available. - try: - import readline - except ImportError: - pass - while True: - prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image')\n" - " pr= (set the problem number)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode length)\n" - " <%s> (decode)\n" - " q (quit)\n" - ">" % (num_samples, decode_length, "source_string" - if has_input else "target_prefix")) - input_string = input(prompt) - if input_string == "q": - return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id - elif input_string[:3] == "ns=": - num_samples = int(input_string[3:]) - elif input_string[:3] == "dl=": - decode_length = int(input_string[3:]) - elif input_string[:3] == "it=": - input_type = input_string[3:] - else: - if input_type == "text": - input_ids = vocabulary.encode(input_string) - if has_input: - input_ids.append(text_encoder.EOS_ID) - x = [num_samples, decode_length, len(input_ids)] + input_ids - assert len(x) < const_array_size - x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - elif input_type == "image": - input_path = input_string - img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) - } - else: - raise Exception("Unsupported input type.") - - -def read_image(path): - try: - import matplotlib.image as im # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning( - "Reading an image requires matplotlib to be installed: %s", e) - raise NotImplementedError("Image reading not implemented.") - return im.imread(path) - - -def show_and_save_image(img, save_path): - try: - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning("Showing and saving an image requires matplotlib to be " - "installed: %s", e) - raise NotImplementedError("Image display and save not implemented.") - plt.imshow(img) - plt.savefig(save_path) - - -def _get_sorted_inputs(filename): - """Returning inputs sorted according to length. - - Args: - filename: path to file with inputs, 1 per line. - - Returns: - a sorted list of inputs - - """ - tf.logging.info("Getting sorted inputs") - # read file and sort inputs according them according to input length. 
- if FLAGS.decode_shards > 1: - decode_filename = filename + ("%.2d" % FLAGS.worker_id) - else: - decode_filename = filename - inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] - input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] - sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) - # We'll need the keys to rearrange the inputs back into their original order - sorted_keys = {} - sorted_inputs = [] - for i, (index, _) in enumerate(sorted_input_lens): - sorted_inputs.append(inputs[index]) - sorted_keys[index] = i - return sorted_inputs, sorted_keys - - -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features - - -def get_input_fn(mode, - hparams, - data_file_patterns=None, - num_datashards=None, - fixed_problem=None): - """Provides input to the graph, either from disk or via a placeholder. 
- - This function produces an input function that will feed data into - the network. There are two modes of operation: - - 1. If data_file_pattern and all subsequent arguments are None, then - it creates a placeholder for a serialized tf.Example proto. - 2. If data_file_pattern is defined, it will read the data from the - files at the given location. Use this mode for training, - evaluation, and testing prediction. - - Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - hparams: HParams object. - data_file_patterns: The list of file patterns to use to read in data. Set to - `None` if you want to create a placeholder for the input data. The - `problems` flag is a list of problem names joined by the `-` character. - The flag's string is then split along the `-` and each problem gets its - own example queue. - num_datashards: An integer. - fixed_problem: An integer indicating the problem to fetch data for, or None - if the input is to be randomly selected. - - Returns: - A function that returns a dictionary of features and the target labels. - """ - - def input_fn(): - """Supplies input to our model. - - This function supplies input to our model, where this input is a - function of the mode. For example, we supply different data if - we're performing training versus evaluation. - - Returns: - A tuple consisting of 1) a dictionary of tensors whose keys are - the feature names, and 2) a tensor of target labels if the mode - is not INFER (and None, otherwise). - - Raises: - ValueError: if one of the parameters has an unsupported value. - """ - problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_reader"): - for n in xrange(problem_count): - if fixed_problem is not None and n != fixed_problem: - continue - problem_instance = hparams.problem_instances[n] - p_hparams = hparams.problems[n] - with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input reading on CPU - capacity = p_hparams.max_expected_batch_size_per_shard - capacity *= num_datashards - examples = data_reader.input_pipeline( - problem_instance, data_file_patterns[n], capacity, mode) - feature_map = data_reader.batch_examples( - examples, - data_reader.hparams_to_batching_scheme( - hparams, - shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN - or hparams.eval_drop_long_sequences), - length_multiplier=(p_hparams.batch_size_multiplier))) - - # Reverse inputs and targets features if the problem was reversed. - if problem_instance is not None: - problem_instance.maybe_reverse_features(feature_map) - problem_instance.maybe_copy_features(feature_map) - else: - if p_hparams.was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - # Use the inputs as the targets if the problem is a copy problem. - if p_hparams.was_copy: - feature_map["targets"] = feature_map["inputs"] - - # Ensure inputs and targets are proper rank. - while len(feature_map["inputs"].get_shape()) != 4: - feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) - while len(feature_map["targets"].get_shape()) != 4: - feature_map["targets"] = tf.expand_dims( - feature_map["targets"], axis=-1) - - batches.append( - (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id))) - - # We choose which problem to process. - loss_moving_avgs = [] # Need loss moving averages for that. 
- for n in xrange(problem_count): - with tf.variable_scope("losses_avg"): - loss_moving_avgs.append( - tf.get_variable( - "problem_%d/total_loss" % n, initializer=100.0, - trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) - if fixed_problem is None: - if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): - problem_choice = tf.random_uniform( - [], maxval=problem_count, dtype=tf.int32) - elif hparams.problem_choice == "adaptive": - loss_moving_avgs = tf.stack(loss_moving_avgs) - problem_choice = tf.multinomial( - tf.reshape(loss_moving_avgs, [1, -1]), 1) - problem_choice = tf.to_int32(tf.squeeze(problem_choice)) - elif hparams.problem_choice == "distributed": - assert FLAGS.worker_replicas >= problem_count - assert FLAGS.worker_replicas % problem_count == 0 - problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) - else: - raise ValueError( - "Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % hparams.problem_choice) - - # Inputs and targets conditional on problem_choice. - rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( - lambda n: batches[n], problem_choice, 0, problem_count - 1) - else: - problem_choice = tf.constant(fixed_problem) - # Take the only constructed batch, which is the fixed_problem. - rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] - - # Set shapes so the ranks are clear. - rand_inputs.set_shape([None, None, None, None]) - rand_target.set_shape([None, None, None, None]) - choice.set_shape([]) - inp_id.set_shape([]) - tgt_id.set_shape([]) - # Forced shape obfuscation is necessary for inference. - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - - # Final feature map. - rand_feature_map = { - "inputs": rand_inputs, - "problem_choice": choice, - "input_space_id": inp_id, - "target_space_id": tgt_id - } - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_feature_map["infer_targets"] = rand_target - rand_target = None - return rand_feature_map, rand_target - - return input_fn - - -class _ConditionalOptimizer(tf.train.Optimizer): - """Conditional optimizer.""" - - def __init__(self, optimizer_name, lr, hparams): - if optimizer_name == "Adam": - # We change the default epsilon for Adam and re-scale lr. - # Using LazyAdam as it's much faster for large vocabulary embeddings. 
- self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, - beta1=hparams.optimizer_adam_beta1, - beta2=hparams.optimizer_adam_beta2, - epsilon=hparams.optimizer_adam_epsilon) - elif optimizer_name == "Momentum": - self._opt = tf.train.MomentumOptimizer( - lr, momentum=hparams.optimizer_momentum_momentum) - elif optimizer_name == "YellowFin": - tf.logging.info("Init YellowFin Optimizer.") - self._opt = yellowfin.YellowFinOptimizer( - learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) - else: - self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) - - def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): - return self._opt.compute_gradients( - loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) - - def apply_gradients(self, gradients, global_step=None, name=None): - return self._opt.apply_gradients( - gradients, global_step=global_step, name=name) - - -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def _ps_replicas(all_workers=False): - if all_workers: - return list(range(FLAGS.ps_replicas)) - # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. - num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas - return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] - - -def _gpu_order(num_gpus): - if FLAGS.gpu_order: - ret = [int(s) for s in FLAGS.gpu_order.split(" ")] - if len(ret) == num_gpus: - return ret - return list(range(num_gpus)) - - -def _ps_gpus(all_workers=False): - ps_gpus = [] - for d in _ps_replicas(all_workers=all_workers): - ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) - return ps_gpus - - -def _ps_devices(all_workers=False): - """List of ps devices (where to put the experts). - - Args: - all_workers: whether the list is for all async workers or just this one. - - Returns: - a list of device names - """ - if FLAGS.ps_replicas > 0: - if FLAGS.ps_gpu > 0: - return [ - FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) - for (d, gpu) in _ps_gpus(all_workers=all_workers) - ] - else: - return [ - FLAGS.ps_job + "/task:%d" % d - for d in _ps_replicas(all_workers=all_workers) - ] - else: - if FLAGS.worker_gpu > 0: - return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - else: - return [""] - - -def data_parallelism(all_workers=False): - """Over which devices do we split each training batch. - - In old-fashioned async mode, we split the batch over all GPUs on the - current worker. - - In sync mode, we split the batch over all the parameter server GPUs. - - This function returns an expert_utils.Parallelism object, which can be used - to build the model. It is configured in a way that any variables created - by `tf.get_variable` will be assigned to the parameter servers and shared - between datashards. - - Args: - all_workers: whether the devices are all async workers or just this one. - - Returns: - a expert_utils.Parallelism. 
- """ - - def _replica_device_setter(worker_device): - if FLAGS.ps_replicas == 0: - return worker_device - return tf.train.replica_device_setter( - worker_device=worker_device, - ps_tasks=FLAGS.ps_replicas, - ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) - - if FLAGS.schedule == "local_run": - assert not FLAGS.sync - datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: - datashard_devices += ["cpu:0"] - caching_devices = None - elif FLAGS.sync: - assert FLAGS.ps_replicas > 0 - datashard_devices = [ - _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) - ] - if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: - caching_devices = [ - FLAGS.ps_job + "/task:%d/cpu:0" % d - for (d, _) in _ps_gpus(all_workers=all_workers) - ] - else: - caching_devices = None - else: - # old fashioned async - compute on worker - if FLAGS.worker_gpu > 1: - datashard_devices = [ - _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) - for d in _gpu_order(FLAGS.worker_gpu) - ] - caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu - else: - datashard_devices = [_replica_device_setter(FLAGS.worker_job)] - caching_devices = None - tf.logging.info("datashard_devices: %s", datashard_devices) - tf.logging.info("caching_devices: %s", caching_devices) - return eu.Parallelism( - datashard_devices, - reuse=True, - caching_devices=caching_devices, - daisy_chain_variables=FLAGS.daisy_chain_variables) + decoding.decode_from_dataset(estimator) From accf019e8676afbecc14048fe2151ae85a645be1 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 12:01:20 +0000 Subject: [PATCH 06/10] Import readline in decoding.py --- tensor2tensor/utils/decoding.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 tensor2tensor/utils/decoding.py diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py old mode 100644 new mode 100755 index 12057d8e6..cf981a1e3 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -248,6 +248,11 @@ def _interactive_input_fn(hparams): vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. 
const_array_size = 10000 + # Import readline if available for command line editing and recall + try: + import readline + except ImportError: + pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image' or 'label')\n" @@ -255,7 +260,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" From b3de49a72743d212fef786e80ee01044eb89be98 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 10 Aug 2017 17:59:50 +0000 Subject: [PATCH 07/10] Larger source vocab; adapt to new upstream version --- tensor2tensor/ice_parsing/ice_parsing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/ice_parsing/ice_parsing.py b/tensor2tensor/ice_parsing/ice_parsing.py index d8dd41cf7..df9748589 100755 --- a/tensor2tensor/ice_parsing/ice_parsing.py +++ b/tensor2tensor/ice_parsing/ice_parsing.py @@ -66,7 +66,7 @@ class IceParsingTokens(problem.Problem): @property def source_vocab_size(self): - return 2**13 # 8192 + return 2**14 # 16384 @property def target_vocab_size(self): @@ -84,18 +84,18 @@ def feature_encoders(self, data_dir): "targets": target_subtokenizer, } - def generate_data(self, data_dir, tmp_dir, num_shards=100): + def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), + self.training_filepaths(data_dir, 1, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, self.target_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) - def hparams(self, defaults, unused_model_hparams): + def hparams(self, defaults, model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} From b4de995cec2430acf61c367fbb61a00f61fc5097 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 00:17:55 +0000 Subject: [PATCH 08/10] Moved ice_parsing to data_generators; updated to 1.1.7 --- tensor2tensor/data_generators/all_problems.py | 1 + .../ice_parsing.py | 44 +++++++------------ tensor2tensor/data_generators/wmt.py | 30 ------------- tensor2tensor/ice_parsing/__init__.py | 2 - tensor2tensor/models/transformer.py | 9 ++++ 5 files changed, 27 insertions(+), 59 deletions(-) mode change 100644 => 100755 tensor2tensor/data_generators/all_problems.py rename tensor2tensor/{ice_parsing => data_generators}/ice_parsing.py (82%) delete mode 100644 tensor2tensor/ice_parsing/__init__.py diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py old mode 100644 new mode 100755 index ca6dccfda..10a4764f5 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -31,6 +31,7 @@ from tensor2tensor.data_generators import wiki from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import wsj_parsing +from tensor2tensor.data_generators import ice_parsing # Problem modules that require optional dependencies diff --git a/tensor2tensor/ice_parsing/ice_parsing.py 
b/tensor2tensor/data_generators/ice_parsing.py similarity index 82% rename from tensor2tensor/ice_parsing/ice_parsing.py rename to tensor2tensor/data_generators/ice_parsing.py index df9748589..f6e6bdca4 100755 --- a/tensor2tensor/ice_parsing/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -28,7 +28,6 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators.wmt import tabbed_generator from tensor2tensor.utils import registry -from tensor2tensor.models import transformer import tensorflow as tf @@ -69,9 +68,21 @@ def source_vocab_size(self): return 2**14 # 16384 @property - def target_vocab_size(self): + def targeted_vocab_size(self): return 2**8 # 256 + @property + def input_space_id(self): + return problem.SpaceID.ICE_TOK + + @property + def target_space_id(self): + return problem.SpaceID.ICE_PARSE_TOK + + @property + def num_shards(self): + return 10 + def feature_encoders(self, data_dir): source_vocab_filename = os.path.join( data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) @@ -89,7 +100,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, self.target_vocab_size), - self.training_filepaths(data_dir, 1, shuffled=False), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, self.target_vocab_size), @@ -99,29 +110,8 @@ def hparams(self, defaults, model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.target_modality = (registry.Modalities.SYMBOL, self.targeted_vocab_size) + p.input_space_id = self.input_space_id + p.target_space_id = self.target_space_id p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - -@registry.register_hparams -def transformer_parsing_ice(): - """Hparams for parsing Icelandic text.""" - hparams = transformer.transformer_base_single_gpu() - hparams.batch_size = 4096 - hparams.shared_embedding_and_softmax_weights = int(False) - return hparams - - -@registry.register_hparams -def transformer_parsing_ice_big(): - """Hparams for parsing Icelandic text, bigger model.""" - hparams = transformer_parsing_ice() - hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU - hparams.attention_dropout = 0.05 - hparams.residual_dropout = 0.05 - hparams.max_length = 512 - hparams.hidden_size = 1024 - return hparams - diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index f673dee82..f66e366d1 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -187,36 +187,6 @@ def bi_vocabs_token_generator(source_path, source, target = source_file.readline(), target_file.readline() -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). 
It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - # Data-set URLs. diff --git a/tensor2tensor/ice_parsing/__init__.py b/tensor2tensor/ice_parsing/__init__.py deleted file mode 100644 index 36f468dcb..000000000 --- a/tensor2tensor/ice_parsing/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -from .ice_parsing import IceParsingTokens, transformer_parsing_ice, transformer_parsing_ice_big diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c6fb74958..f1b2d761f 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -391,6 +391,15 @@ def transformer_parsing_big(): return hparams +@registry.register_hparams +def transformer_parsing_ice(): + """Hparams for parsing and tagging Icelandic text.""" + hparams = transformer.transformer_base_single_gpu() + hparams.batch_size = 4096 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + @registry.register_hparams def transformer_tiny(): hparams = transformer_base() From b9e216b4c76ca973773a6bd4a04372a4dc4cffe3 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 00:22:58 +0000 Subject: [PATCH 09/10] Adaptation to 1.1.7 --- tensor2tensor/bin/t2t-datagen | 5 ----- tensor2tensor/data_generators/problem_hparams.py | 2 -- tensor2tensor/data_generators/wmt.py | 3 +-- tensor2tensor/models/transformer.py | 2 +- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 8ea0d9bc6..97bbd1241 100755 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -82,11 +82,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "ice_parsing_characters": ( - lambda: wmt.tabbed_parsing_character_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True), - lambda: wmt.tabbed_parsing_character_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False)), "wmt_parsing_tokens_8k": ( lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13), diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 2f417a992..b0ed44f5b 100755 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -511,8 +511,6 @@ def image_celeba(unused_model_hparams): lm1b_32k, "wiki_32k": wiki_32k, - "ice_parsing_characters": - wmt_parsing_characters, 
"wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), "wsj_parsing_tokens_16k": diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index f66e366d1..35d1b5fca 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -65,6 +65,7 @@ def use_subword_tokenizer(self): # Generic generators used later for multiple problems. + def character_generator(source_path, target_path, character_vocab, eos=None): """Generator for sequence-to-sequence tasks that just uses characters. @@ -654,5 +655,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): tree_filepath = os.path.join(tmp_dir, filename) return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab, symbolizer_vocab, EOS) - - diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index f1b2d761f..fa7ecdf81 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -394,7 +394,7 @@ def transformer_parsing_big(): @registry.register_hparams def transformer_parsing_ice(): """Hparams for parsing and tagging Icelandic text.""" - hparams = transformer.transformer_base_single_gpu() + hparams = transformer_base_single_gpu() hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = int(False) return hparams From ab9b00465add968ee1a09bd749cd35f53a9659cd Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 11:43:59 +0000 Subject: [PATCH 10/10] Bugfix in ice_parsing.py --- tensor2tensor/data_generators/ice_parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index f6e6bdca4..7a90fec45 100755 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -87,7 +87,7 @@ def feature_encoders(self, data_dir): source_vocab_filename = os.path.join( data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + data_dir, "ice_target.tokens.vocab.%d" % self.targeted_vocab_size) source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) return { @@ -99,11 +99,11 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, - self.target_vocab_size), + self.targeted_vocab_size), self.training_filepaths(data_dir, self.num_shards, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, - self.target_vocab_size), + self.targeted_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) def hparams(self, defaults, model_hparams):