From 93177b02cb72e1b0dd585b6f3a40f7382eefd47a Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 20 Jul 2017 12:37:13 +0000 Subject: [PATCH 01/10] Added ice_parsing_tokens to problem registry --- tensor2tensor/bin/t2t-datagen | 5 -- .../data_generators/generator_utils.py | 4 +- .../data_generators/problem_hparams.py | 36 ------------- tensor2tensor/data_generators/wmt.py | 53 +++++++++++++++++-- tensor2tensor/models/transformer.py | 14 +++++ tensor2tensor/utils/registry.py | 6 +-- 6 files changed, 69 insertions(+), 49 deletions(-) mode change 100644 => 100755 tensor2tensor/data_generators/problem_hparams.py mode change 100644 => 100755 tensor2tensor/data_generators/wmt.py mode change 100644 => 100755 tensor2tensor/models/transformer.py mode change 100644 => 100755 tensor2tensor/utils/registry.py diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index cbf0a6164..2f8a418e2 100755 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -109,11 +109,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "ice_parsing_tokens": ( - lambda: wmt.tabbed_parsing_token_generator(FLAGS.tmp_dir, - True, "ice", 2**13, 2**8), - lambda: wmt.tabbed_parsing_token_generator(FLAGS.tmp_dir, - False, "ice", 2**13, 2**8)), "ice_parsing_characters": ( lambda: wmt.tabbed_parsing_character_generator(FLAGS.tmp_dir, True), lambda: wmt.tabbed_parsing_character_generator(FLAGS.tmp_dir, False)), diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 20f3959d8..51c8a5899 100755 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -266,7 +266,7 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None): for source in sources: url = source[0] filename = os.path.basename(url) - read_type = "r:gz" if "tgz" in filename else "r" + read_type = "r:gz" if filename.endswith(".tgz") else "r" compressed_file = maybe_download(tmp_dir, filename, url) @@ -278,7 +278,7 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size, sources=None): filepath = os.path.join(tmp_dir, lang_file) # For some datasets a second extraction is necessary. - if ".gz" in lang_file: + if lang_file.endswith(".gz"): new_filepath = os.path.join(tmp_dir, lang_file[:-3]) if tf.gfile.Exists(new_filepath): tf.logging.info( diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py old mode 100644 new mode 100755 index 70b9dada8..e071ba60d --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -569,41 +569,6 @@ def wsj_parsing_tokens(model_hparams, return p -def ice_parsing_tokens(model_hparams, wrong_source_vocab_size): - """Icelandic to parse tree translation benchmark. - - Args: - model_hparams: a tf.contrib.training.HParams - wrong_source_vocab_size: a number used in the filename indicating the - approximate vocabulary size. This is not to be confused with the actual - vocabulary size. - - Returns: - A tf.contrib.training.HParams object. - """ - p = default_problem_hparams() - # This vocab file must be present within the data directory. 
- source_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_source.tokens.vocab.%d" % wrong_source_vocab_size) - target_vocab_filename = os.path.join( - model_hparams.data_dir, - "ice_target.tokens.vocab.256") - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - p.input_modality = { - "inputs": (registry.Modalities.SYMBOL, source_subtokenizer.vocab_size) - } - p.target_modality = (registry.Modalities.SYMBOL, 256) - p.vocabulary = { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - p.input_space_id = 18 # Icelandic tokens - p.target_space_id = 19 # Icelandic parse tokens - return p - - def image_cifar10(unused_model_hparams): """CIFAR-10.""" p = default_problem_hparams() @@ -720,7 +685,6 @@ def img2img_imagenet(unused_model_hparams): "wiki_32k": wiki_32k, "lmptb_10k": lmptb_10k, "ice_parsing_characters": wmt_parsing_characters, - "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), "wsj_parsing_tokens_16k": lambda p: wsj_parsing_tokens( # pylint: disable=g-long-lambda p, "wsj", 2**14, 2**9), diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py old mode 100644 new mode 100755 index de5a25e13..f8b20a0e4 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -37,6 +37,9 @@ FLAGS = tf.flags.FLAGS +# End-of-sentence marker. +EOS = text_encoder.EOS_TOKEN + @registry.register_problem("wmt_ende_tokens_8k") class WMTEnDeTokens8k(problem.Problem): @@ -81,6 +84,53 @@ def _default_wmt_feature_encoders(data_dir, target_vocab_size): "targets": subtokenizer, } + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word + + 
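For reference, the new problem consumes tab-separated source/target pairs (one sentence and one flattened parse per line, split on the first tab, as described in the tabbed_generator docstring later in this patch). A minimal, self-contained sketch of that layout and of the {"inputs", "targets"} dictionaries the generator yields is given below; the sentence, the parse string, and the toy whitespace encoder are hypothetical stand-ins for the real data and the SubwordTextEncoder vocab files, used only to illustrate the shape of the examples.

    # Toy sketch (assumed format) of the tab-separated pairs read by
    # tabbed_parsing_token_generator. The vocabularies here are hypothetical
    # stand-ins for the ice_source/ice_target SubwordTextEncoder files.
    EOS = 1

    def toy_encode(text, vocab):
        # Stand-in for SubwordTextEncoder.encode: one integer id per whitespace token.
        return [vocab.setdefault(tok, len(vocab) + 2) for tok in text.split()]

    pairs = ["Hundurinn eltir kettina\tP no_et_nf so_3 no_ft_tho"]  # hypothetical pair
    source_vocab, target_vocab = {}, {}
    for line in pairs:
        source, target = line.split("\t", 1)
        print({"inputs": toy_encode(source, source_vocab) + [EOS],
               "targets": toy_encode(target, target_vocab) + [EOS]})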
@registry.register_problem("setimes_mken_tokens_32k") class SETimesMkEnTokens32k(problem.Problem): """Problem spec for SETimes Mk-En translation.""" @@ -107,9 +157,6 @@ def hparams(self, defaults, unused_model_hparams): p.input_space_id = problem.SpaceID.MK_TOK p.target_space_id = problem.SpaceID.EN_TOK -# End-of-sentence marker. -EOS = text_encoder.EOS_TOKEN - def character_generator(source_path, target_path, character_vocab, eos=None): """Generator for sequence-to-sequence tasks that just uses characters. diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py old mode 100644 new mode 100755 index b341d6fe0..042ce797e --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -375,6 +375,20 @@ def transformer_parsing_ice(): return hparams +@registry.register_hparams +def transformer_parsing_ice_big(): + """Hparams for parsing Icelandic text, bigger model.""" + hparams = transformer_parsing_ice() + hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU + hparams.attention_dropout = 0.2 + hparams.residual_dropout = 0.2 + hparams.max_length = 512 + hparams.learning_rate_warmup_steps = 16000 + hparams.hidden_size = 1024 + hparams.learning_rate = 0.05 + return hparams + + @registry.register_hparams def transformer_tiny(): hparams = transformer_base() diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py old mode 100644 new mode 100755 index 5a8823510..c9adfb692 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -222,10 +222,10 @@ def parse_problem_name(problem_name): was_copy: A boolean. """ # Recursively strip tags until we reach a base name. - if len(problem_name) > 4 and problem_name[-4:] == "_rev": + if problem_name.endswith("_rev"): base, _, was_copy = parse_problem_name(problem_name[:-4]) return base, True, was_copy - elif len(problem_name) > 5 and problem_name[-5:] == "_copy": + elif problem_name.endswith("_copy"): base, was_reversed, _ = parse_problem_name(problem_name[:-5]) return base, was_reversed, True else: @@ -338,7 +338,7 @@ def list_modalities(): def parse_modality_name(name): - name_parts = name.split(":") + name_parts = name.split(":", maxsplit=1) if len(name_parts) < 2: name_parts.append("default") modality_type, modality_name = name_parts From fa92cbe2f293e1189d7481fdab656f724f4bc977 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 20 Jul 2017 19:28:00 +0000 Subject: [PATCH 02/10] Adaptation to upstream changes --- tensor2tensor/bin/t2t-trainer | 0 tensor2tensor/data_generators/wmt.py | 98 +++++++++++++--------------- 2 files changed, 47 insertions(+), 51 deletions(-) mode change 100644 => 100755 tensor2tensor/bin/t2t-trainer diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer old mode 100644 new mode 100755 diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 2d0902dac..d0f7abaec 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -75,57 +75,6 @@ def train_generator(self): """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size.""" raise NotImplementedError() - -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): - """Problem spec for parsing tokenized Icelandic text to - constituency trees, also tokenized but to a smaller vocabulary.""" - - @property - def source_vocab_size(self): - return 2**13 # 8192 - - @property - def target_vocab_size(self): - return 2**8 # 256 - - def 
feature_encoders(self, data_dir): - source_vocab_filename = os.path.join( - data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) - target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - return { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - - def generate_data(self, data_dir, tmp_dir, num_shards=100): - generator_utils.generate_dataset_and_shuffle( - tabbed_parsing_token_generator(tmp_dir, True, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - tabbed_parsing_token_generator(tmp_dir, False, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def hparams(self, defaults, unused_model_hparams): - p = defaults - source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK - p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - - -@registry.register_problem("setimes_mken_tokens_32k") -class SETimesMkEnTokens32k(problem.Problem): - """Problem spec for SETimes Mk-En translation.""" - @property def dev_generator(self): return self.train_generator @@ -734,3 +683,50 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): tree_filepath = os.path.join(tmp_dir, filename) return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab, symbolizer_vocab, EOS) + + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough 
estimate of avg number of tokens per word + From d69ad4d3e7619dd898e648743125517f26a43d44 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 28 Jul 2017 16:21:15 +0000 Subject: [PATCH 03/10] Moved Icelandic parsing to separate module --- tensor2tensor/data_generators/wmt.py | 127 ++++++----------------- tensor2tensor/ice_parsing/__init__.py | 2 + tensor2tensor/ice_parsing/ice_parsing.py | 127 +++++++++++++++++++++++ tensor2tensor/models/transformer.py | 23 ---- 4 files changed, 159 insertions(+), 120 deletions(-) create mode 100644 tensor2tensor/ice_parsing/__init__.py create mode 100755 tensor2tensor/ice_parsing/ice_parsing.py diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index d0f7abaec..3d01ab46e 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -162,36 +162,6 @@ def character_generator(source_path, target_path, character_vocab, eos=None): source, target = source_file.readline(), target_file.readline() -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - def token_generator(source_path, target_path, token_vocab, eos=None): """Generator for sequence-to-sequence tasks that uses tokens. @@ -255,6 +225,36 @@ def bi_vocabs_token_generator(source_path, source, target = source_file.readline(), target_file.readline() +def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): + r"""Generator for sequence-to-sequence tasks using tabbed files. + + Tokens are derived from text files where each line contains both + a source and a target string. The two strings are separated by a tab + character ('\t'). It yields dictionaries of "inputs" and "targets" where + inputs are characters from the source lines converted to integers, and + targets are characters from the target lines, also converted to integers. + + Args: + source_path: path to the file with source and target sentences. + source_vocab: a SunwordTextEncoder to encode the source string. + target_vocab: a SunwordTextEncoder to encode the target string. + eos: integer to append at the end of each sequence (default: None). 
+ + Yields: + A dictionary {"inputs": source-line, "targets": target-line} where + the lines are integer lists converted from characters in the file lines. + """ + eos_list = [] if eos is None else [eos] + with tf.gfile.GFile(source_path, mode="r") as source_file: + for line in source_file: + if line and "\t" in line: + parts = line.split("\t", maxsplit=1) + source, target = parts[0].strip(), parts[1].strip() + source_ints = source_vocab.encode(source) + eos_list + target_ints = target_vocab.encode(target) + eos_list + yield {"inputs": source_ints, "targets": target_ints} + + # Data-set URLs. @@ -654,28 +654,6 @@ def parsing_character_generator(tmp_dir, train): return character_generator(text_filepath, tags_filepath, character_vocab, EOS) -def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, - source_vocab_size, target_vocab_size): - """Generate source and target data from a single file.""" - source_vocab = generator_utils.get_or_generate_tabbed_vocab( - data_dir, tmp_dir, "parsing_train.pairs", 0, - prefix + "_source.vocab.%d" % source_vocab_size, source_vocab_size) - target_vocab = generator_utils.get_or_generate_tabbed_vocab( - data_dir, tmp_dir, "parsing_train.pairs", 1, - prefix + "_target.vocab.%d" % target_vocab_size, target_vocab_size) - filename = "parsing_%s" % ("train" if train else "dev") - pair_filepath = os.path.join(tmp_dir, filename + ".pairs") - return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS) - - -def tabbed_parsing_character_generator(tmp_dir, train): - """Generate source and target data from a single file.""" - character_vocab = text_encoder.ByteTextEncoder() - filename = "parsing_%s" % ("train" if train else "dev") - pair_filepath = os.path.join(tmp_dir, filename + ".pairs") - return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) - - def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): symbolizer_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size) @@ -685,48 +663,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): symbolizer_vocab, EOS) -@registry.register_problem("ice_parsing_tokens") -class IceParsingTokens(problem.Problem): - """Problem spec for parsing tokenized Icelandic text to - constituency trees, also tokenized but to a smaller vocabulary.""" - - @property - def source_vocab_size(self): - return 2**13 # 8192 - - @property - def target_vocab_size(self): - return 2**8 # 256 - - def feature_encoders(self, data_dir): - source_vocab_filename = os.path.join( - data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) - target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) - source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) - target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) - return { - "inputs": source_subtokenizer, - "targets": target_subtokenizer, - } - - def generate_data(self, data_dir, tmp_dir, num_shards=100): - generator_utils.generate_dataset_and_shuffle( - tabbed_parsing_token_generator(tmp_dir, True, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), - tabbed_parsing_token_generator(tmp_dir, False, "ice", - self.source_vocab_size, - self.target_vocab_size), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def hparams(self, defaults, unused_model_hparams): - p = defaults - source_vocab_size = 
self._encoders["inputs"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK - p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - diff --git a/tensor2tensor/ice_parsing/__init__.py b/tensor2tensor/ice_parsing/__init__.py new file mode 100644 index 000000000..36f468dcb --- /dev/null +++ b/tensor2tensor/ice_parsing/__init__.py @@ -0,0 +1,2 @@ + +from .ice_parsing import IceParsingTokens, transformer_parsing_ice, transformer_parsing_ice_big diff --git a/tensor2tensor/ice_parsing/ice_parsing.py b/tensor2tensor/ice_parsing/ice_parsing.py new file mode 100755 index 000000000..d8dd41cf7 --- /dev/null +++ b/tensor2tensor/ice_parsing/ice_parsing.py @@ -0,0 +1,127 @@ +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This module implements the ice_parsing_* problems, which +# parse plain text into flattened parse trees and POS tags. +# The training data is stored in files named `parsing_train.pairs` +# and `parsing_dev.pairs`. These files are UTF-8 text files where +# each line contains an input sentence and a target parse tree, +# separated by a tab character. + +import os + +# Dependency imports + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators.wmt import tabbed_generator +from tensor2tensor.utils import registry +from tensor2tensor.models import transformer + +import tensorflow as tf + + +# End-of-sentence marker. 
+EOS = text_encoder.EOS_ID + + +def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, + source_vocab_size, target_vocab_size): + """Generate source and target data from a single file.""" + filename = "parsing_{0}.pairs".format("train" if train else "dev") + source_vocab = generator_utils.get_or_generate_tabbed_vocab( + data_dir, tmp_dir, filename, 0, + prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size) + target_vocab = generator_utils.get_or_generate_tabbed_vocab( + data_dir, tmp_dir, filename, 1, + prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size) + pair_filepath = os.path.join(tmp_dir, filename) + return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS) + + +def tabbed_parsing_character_generator(tmp_dir, train): + """Generate source and target data from a single file.""" + character_vocab = text_encoder.ByteTextEncoder() + filename = "parsing_{0}.pairs".format("train" if train else "dev") + pair_filepath = os.path.join(tmp_dir, filename) + return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS) + + +@registry.register_problem("ice_parsing_tokens") +class IceParsingTokens(problem.Problem): + """Problem spec for parsing tokenized Icelandic text to + constituency trees, also tokenized but to a smaller vocabulary.""" + + @property + def source_vocab_size(self): + return 2**13 # 8192 + + @property + def target_vocab_size(self): + return 2**8 # 256 + + def feature_encoders(self, data_dir): + source_vocab_filename = os.path.join( + data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) + target_vocab_filename = os.path.join( + data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) + target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) + return { + "inputs": source_subtokenizer, + "targets": target_subtokenizer, + } + + def generate_data(self, data_dir, tmp_dir, num_shards=100): + generator_utils.generate_dataset_and_shuffle( + tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.training_filepaths(data_dir, num_shards, shuffled=False), + tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", + self.source_vocab_size, + self.target_vocab_size), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) + p.input_space_id = problem.SpaceID.ICE_TOK + p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word + + +@registry.register_hparams +def transformer_parsing_ice(): + """Hparams for parsing Icelandic text.""" + hparams = transformer.transformer_base_single_gpu() + hparams.batch_size = 4096 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + +@registry.register_hparams +def transformer_parsing_ice_big(): + """Hparams for parsing Icelandic text, bigger model.""" + hparams = transformer_parsing_ice() + hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU + hparams.attention_dropout = 0.05 + hparams.residual_dropout = 0.05 + hparams.max_length = 512 + hparams.hidden_size = 1024 + return hparams + diff --git 
a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index f03f173e2..0489567a0 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -357,29 +357,6 @@ def transformer_parsing_big(): return hparams -@registry.register_hparams -def transformer_parsing_ice(): - """Hparams for parsing Icelandic text.""" - hparams = transformer_base_single_gpu() - hparams.batch_size = 4096 - hparams.shared_embedding_and_softmax_weights = int(False) - return hparams - - -@registry.register_hparams -def transformer_parsing_ice_big(): - """Hparams for parsing Icelandic text, bigger model.""" - hparams = transformer_parsing_ice() - hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU - hparams.attention_dropout = 0.2 - hparams.residual_dropout = 0.2 - hparams.max_length = 512 - hparams.learning_rate_warmup_steps = 16000 - hparams.hidden_size = 1024 - hparams.learning_rate = 0.05 - return hparams - - @registry.register_hparams def transformer_tiny(): hparams = transformer_base() From d2af7cfe0b74a0e5ec3e0621ff7ad0e0776281df Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 11:18:31 +0000 Subject: [PATCH 04/10] Added readline import in trainer_utils.py --- tensor2tensor/utils/trainer_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 tensor2tensor/utils/trainer_utils.py diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py old mode 100644 new mode 100755 index 260ec6a00..dc8238c51 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -841,6 +841,11 @@ def _interactive_input_fn(hparams): vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. const_array_size = 10000 + # For ease of input, activate the readline module if available. 
+ try: + import readline + except ImportError: + pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image')\n" @@ -848,7 +853,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" From f2714e93b05753e364e34c1fe9fd69d6e405f5c1 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 11:53:15 +0000 Subject: [PATCH 05/10] Sync with upstream --- tensor2tensor/utils/trainer_utils.py | 1113 ++------------------------ 1 file changed, 50 insertions(+), 1063 deletions(-) mode change 100755 => 100644 tensor2tensor/utils/trainer_utils.py diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py old mode 100755 new mode 100644 index dc8238c51..9e869c15c --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -19,37 +19,24 @@ from __future__ import division from __future__ import print_function -import math -import operator -import os import sys # Dependency imports -import numpy as np -import six -# pylint: disable=redefined-builtin -from six.moves import input -from six.moves import xrange -# pylint: enable=redefined-builtin - from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import from tensor2tensor.data_generators import problem_hparams -from tensor2tensor.data_generators import text_encoder from tensor2tensor.models import models # pylint: disable=unused-import from tensor2tensor.utils import data_reader -from tensor2tensor.utils import expert_utils as eu +from tensor2tensor.utils import decoding +from tensor2tensor.utils import devices +from tensor2tensor.utils import input_fn_builder from tensor2tensor.utils import metrics +from tensor2tensor.utils import model_builder from tensor2tensor.utils import registry -from tensor2tensor.utils import yellowfin import tensorflow as tf from tensorflow.contrib.learn.python.learn import learn_runner from tensorflow.python import debug -from tensorflow.python.ops import init_ops - -# Number of samples to draw for an image input (in such cases as captioning) -IMAGE_DECODE_LENGTH = 100 flags = tf.flags FLAGS = flags.FLAGS @@ -130,16 +117,7 @@ "\t..\t") flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") - - -def _save_until_eos(hyp): - """Strips everything after the first token, which is normally 1.""" - try: - index = list(hyp).index(text_encoder.EOS_ID) - return hyp[0:index] - except ValueError: - # No EOS_ID: return the array as-is. 
- return hyp +flags.DEFINE_bool("identity_output", False, "To print the output as identity") def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): @@ -179,8 +157,8 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, eval_hooks.append(hook) return tf.contrib.learn.Experiment( estimator=estimator, - train_input_fn=input_fns["train"], - eval_input_fn=input_fns["eval"], + train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN], + eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL], eval_metrics=eval_metrics, train_steps=train_steps, eval_steps=eval_steps, @@ -193,22 +171,26 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): """Constructs and returns Estimator and train/eval input functions.""" tf.logging.info("Creating experiment, storing model files in %s", output_dir) - num_datashards = data_parallelism().n - train_input_fn = get_input_fn( + num_datashards = devices.data_parallelism().n + train_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), - num_datashards=num_datashards) + num_datashards=num_datashards, + worker_replicas=FLAGS.worker_replicas, + worker_id=FLAGS.worker_id) - eval_input_fn = get_input_fn( + eval_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), - num_datashards=num_datashards) + num_datashards=num_datashards, + worker_replicas=FLAGS.worker_replicas, + worker_id=FLAGS.worker_id) estimator = tf.contrib.learn.Estimator( - model_fn=model_builder(model_name, hparams=hparams), + model_fn=model_builder.build_model_fn(model_name, hparams=hparams), model_dir=output_dir, config=tf.contrib.learn.RunConfig( master=FLAGS.master, @@ -218,7 +200,10 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): keep_checkpoint_max=FLAGS.keep_checkpoint_max)) # Store the hparams in the estimator as well estimator.hparams = hparams - return estimator, {"train": train_input_fn, "eval": eval_input_fn} + return estimator, { + tf.contrib.learn.ModeKeys.TRAIN: train_input_fn, + tf.contrib.learn.ModeKeys.EVAL: eval_input_fn + } def log_registry(): @@ -227,6 +212,24 @@ def log_registry(): sys.exit(0) +def add_problem_hparams(hparams, problems): + """Add problem hparams for the problems.""" + hparams.problems = [] + hparams.problem_instances = [] + for problem_name in problems.split("-"): + try: + problem = registry.problem(problem_name) + p_hparams = problem.internal_hparams(hparams) + except ValueError: + problem = None + p_hparams = problem_hparams.problem_hparams(problem_name, hparams) + + hparams.problem_instances.append(problem) + hparams.problems.append(p_hparams) + + return hparams + + def create_hparams(params_id, data_dir): """Returns hyperparameters, including any flag value overrides. 
@@ -247,21 +250,7 @@ def create_hparams(params_id, data_dir): if FLAGS.hparams: hparams = hparams.parse(FLAGS.hparams) - # Add hparams for the problems - hparams.problems = [] - hparams.problem_instances = [] - for problem_name in FLAGS.problems.split("-"): - try: - problem = registry.problem(problem_name) - p_hparams = problem.internal_hparams(hparams) - except ValueError: - problem = None - p_hparams = problem_hparams.problem_hparams(problem_name, hparams) - - hparams.problem_instances.append(problem) - hparams.problems.append(p_hparams) - - return hparams + return add_problem_hparams(hparams, FLAGS.problems) def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): @@ -289,7 +278,11 @@ def run(data_dir, model, output_dir, train_steps, eval_steps, schedule): if schedule == "local_run": # Run the local demo. - run_locally(exp_fn(output_dir)) + exp = exp_fn(output_dir) + if exp.train_steps > 0 or exp.eval_steps > 0: + tf.logging.info("Performing local training and evaluation.") + exp.train_and_evaluate() + decode(exp.estimator) else: # Perform distributed training/evaluation. learn_runner.run( @@ -334,1020 +327,14 @@ def session_config(): return config -def model_builder(model, hparams): - """Returns a function to build the model. - - Args: - model: The name of the model to use. - hparams: The hyperparameters. - - Returns: - A function to build the model's graph. This function is called by - the Estimator object to construct the graph. - """ - - def initializer(): - if hparams.initializer == "orthogonal": - return tf.orthogonal_initializer(gain=hparams.initializer_gain) - elif hparams.initializer == "uniform": - max_val = 0.1 * hparams.initializer_gain - return tf.random_uniform_initializer(-max_val, max_val) - elif hparams.initializer == "normal_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="normal") - elif hparams.initializer == "uniform_unit_scaling": - return init_ops.variance_scaling_initializer( - hparams.initializer_gain, mode="fan_avg", distribution="uniform") - else: - raise ValueError("Unrecognized initializer: %s" % hparams.initializer) - - def learning_rate_decay(): - """Inverse-decay learning rate until warmup_steps, then decay.""" - warmup_steps = tf.to_float( - hparams.learning_rate_warmup_steps * FLAGS.worker_replicas) - step = tf.to_float(tf.contrib.framework.get_global_step()) - if hparams.learning_rate_decay_scheme == "noam": - return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum( - (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5) - elif hparams.learning_rate_decay_scheme == "exp100k": - return 0.94**(step // 100000) - elif hparams.learning_rate_decay_scheme == "cosine": - cycle_steps = hparams.learning_rate_cosine_cycle_steps - return 0.5 * (1 + tf.cos(np.pi * (step % cycle_steps) / cycle_steps)) - - inv_base = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = inv_base**(warmup_steps - step) - if hparams.learning_rate_decay_scheme == "sqrt": - decay = _sqrt_decay(step - warmup_steps) - elif hparams.learning_rate_decay_scheme == "exp10k": - decay = _exp_decay_after(step - warmup_steps, 0.9995, - FLAGS.train_steps - warmup_steps - 10000) - elif hparams.learning_rate_decay_scheme == "exp50k": - decay = _exp_decay_after(step - warmup_steps, 0.99995, - FLAGS.train_steps - warmup_steps - 50000) - elif hparams.learning_rate_decay_scheme == "exp500k": - decay = _exp_decay_after(step - warmup_steps, 0.9999955, - FLAGS.train_steps - warmup_steps - 500000) - elif 
hparams.learning_rate_decay_scheme == "none": - decay = tf.constant(1.0) - else: - raise ValueError("Unrecognized learning rate decay scheme: %s" % - hparams.learning_rate_decay_scheme) - return tf.cond( - step < warmup_steps, - lambda: inv_decay, - lambda: decay, - name="learning_rate_decay_warump_cond") - - def model_fn(features, targets, mode): - """Creates the prediction, loss, and train ops. - - Args: - features: A dictionary of tensors keyed by the feature name. - targets: A tensor representing the labels (targets). - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - - Returns: - A tuple consisting of the prediction, loss, and train_op. - """ - if mode == tf.contrib.learn.ModeKeys.INFER: - if FLAGS.decode_interactive: - features = _interactive_input_tensor_to_features_dict(features, hparams) - elif FLAGS.decode_from_file: - features = _decode_input_tensor_to_features_dict(features, hparams) - # A dictionary containing: - # - problem_choice: A Tensor containing an integer indicating which problem - # was selected for this run. - # - predictions: A Tensor containing the model's output predictions. - run_info = dict() - run_info["problem_choice"] = features["problem_choice"] - - if targets is not None: - features["targets"] = targets - - dp = data_parallelism() - - # Add input statistics for incoming features. - with tf.name_scope("input_stats"): - for (k, v) in six.iteritems(features): - if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1: - tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n) - tf.summary.scalar("%s_length" % k, tf.shape(v)[1]) - nonpadding = tf.to_float(tf.not_equal(v, 0)) - tf.summary.scalar("%s_nonpadding_tokens" % k, - tf.reduce_sum(nonpadding)) - tf.summary.scalar("%s_nonpadding_fraction" % k, - tf.reduce_mean(nonpadding)) - - tf.get_variable_scope().set_initializer(initializer()) - train = mode == tf.contrib.learn.ModeKeys.TRAIN - - # Get multi-problem logits and loss based on features["problem_choice"]. - def nth_model(n): - """Build the model for the n-th problem, plus some added variables.""" - model_class = registry.model(model)( - hparams, - mode, - hparams.problems[n], - n, - dp, - _ps_devices(all_workers=True)) - if mode == tf.contrib.learn.ModeKeys.INFER: - return model_class.infer( - features, - beam_size=FLAGS.decode_beam_size, - top_beams=(FLAGS.decode_beam_size - if FLAGS.decode_return_beams else 1), - last_position_only=FLAGS.decode_use_last_position_only, - alpha=FLAGS.decode_alpha, - decode_length=FLAGS.decode_extra_length) - # In distributed mode, we build graph for problem=0 and problem=worker_id. - skipping_is_on = hparams.problem_choice == "distributed" and train - problem_worker_id = FLAGS.worker_id % len(hparams.problems) - skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id - # On worker 0 also build graph for problems <= 1. - # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. 
- skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) - sharded_logits, training_loss, extra_loss = model_class.model_fn( - features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): - loss_moving_avg = tf.get_variable("problem_%d/training_loss" % n) - o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/extra_loss" % n) - o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1) - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) - total_loss = training_loss + extra_loss - o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1) - with tf.variable_scope("train_stats"): # Count steps for this problem. - problem_steps = tf.get_variable( - "problem_%d_steps" % n, initializer=0, trainable=False) - o4 = problem_steps.assign_add(1) - with tf.control_dependencies([o1, o2, o3, o4]): # Make sure the ops run. - # Ensure the loss is a scalar here. - total_loss = tf.reshape(total_loss, [], name="total_loss_control_id") - return [total_loss] + sharded_logits # Need to flatten for cond later. - - result_list = _cond_on_index(nth_model, features["problem_choice"], 0, - len(hparams.problems) - 1) - - if mode == tf.contrib.learn.ModeKeys.INFER: - # Beam search in sequence model returns both decodes withe key "outputs" - # and scores with they key "scores". If return list is a dict, we expect - # that it will have keys "outputs", a tensor of int32 and scores, a - # tensor of floats. This is useful if we want to return scores from - # estimator.predict - if not isinstance(result_list, dict): - ret = {"outputs": result_list}, None, None - else: - ret = { - "outputs": result_list["outputs"], - "scores": result_list["scores"] - }, None, None - if "inputs" in features: - ret[0]["inputs"] = features["inputs"] - if "infer_targets" in features: - ret[0]["targets"] = features["infer_targets"] - return ret - - sharded_logits, total_loss = result_list[1:], result_list[0] - if mode == tf.contrib.learn.ModeKeys.EVAL: - logits = tf.concat(sharded_logits, 0) - if FLAGS.eval_print: - logits = tf.Print( - logits, [features["inputs"], logits], "EVAL PRINT", summarize=10000) - # For evaluation, return the logits layer as our predictions. - run_info["predictions"] = logits - train_op = None - return run_info, total_loss, None - - assert mode == tf.contrib.learn.ModeKeys.TRAIN - - # Some training statistics. - with tf.name_scope("training_stats"): - learning_rate = hparams.learning_rate * learning_rate_decay() - learning_rate /= math.sqrt(float(FLAGS.worker_replicas)) - tf.summary.scalar("learning_rate", learning_rate) - global_step = tf.to_float(tf.contrib.framework.get_global_step()) - for n in xrange(len(hparams.problems)): - with tf.variable_scope("losses_avg", reuse=True): - total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) - with tf.variable_scope("train_stats", reuse=True): - nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) - tf.summary.scalar("problem_%d_frequency" % n, - tf.to_float(nth_steps) / (global_step + 1.0)) - - # Log trainable weights and add decay. 
- total_size, weight_decay_loss = 0, 0.0 - all_weights = {v.name: v for v in tf.trainable_variables()} - for v_name in sorted(list(all_weights)): - v = all_weights[v_name] - v_size = int(np.prod(np.array(v.shape.as_list()))) - tf.logging.info("Weight %s\tshape %s\tsize %d", - v.name[:-2].ljust(80), str(v.shape).ljust(20), v_size) - total_size += v_size - if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1: - # Add weight regularization if set and the weight is not a bias (dim>1). - with tf.device(v._ref().device): # pylint: disable=protected-access - v_loss = tf.nn.l2_loss(v) / v_size - weight_decay_loss += v_loss - is_body = len(v_name) > 5 and v_name[:5] == "body/" - if hparams.weight_noise > 0.0 and is_body: - # Add weight noise if set in hparams. - with tf.device(v._ref().device): # pylint: disable=protected-access - scale = learning_rate * 0.001 - noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale - noise_op = v.assign_add(noise) - with tf.control_dependencies([noise_op]): - total_loss = tf.identity(total_loss) - tf.logging.info("Total trainable variables size: %d", total_size) - if hparams.weight_decay > 0.0: - total_loss += weight_decay_loss * hparams.weight_decay - total_loss = tf.identity(total_loss, name="total_loss") - - # Define the train_op for the TRAIN mode. - opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams) - tf.logging.info("Computing gradients for global model_fn.") - opt_summaries = ["learning_rate", "loss"] - if hparams.summarize_grads: - opt_summaries.extend(["gradients", "gradient_norm"]) - train_op = tf.contrib.layers.optimize_loss( - name="training", - loss=total_loss, - global_step=tf.contrib.framework.get_global_step(), - learning_rate=learning_rate, - clip_gradients=hparams.clip_grad_norm or None, - gradient_noise_scale=hparams.grad_noise_scale or None, - optimizer=opt, - summaries=opt_summaries, - colocate_gradients_with_ops=True) - - # Remove summaries that will fail to run because they are in conditionals. - # TODO(cwhipkey): Test with this code removed, later in 2017. - summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES) - for i in range(len(summaries) - 1, -1, -1): - if summaries[i].name.startswith("cond_"): - del summaries[i] - - tf.logging.info("Global model_fn finished.") - return run_info, total_loss, train_op - - return model_fn - - -def run_locally(exp): - """Runs an Experiment locally - trains, evaluates, and decodes. - - Args: - exp: Experiment. 
- """ - if exp.train_steps > 0 or exp.eval_steps > 0: - tf.logging.info("Performing local training and evaluation.") - exp.train_and_evaluate() - decode(exp.estimator) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def decode(estimator): if FLAGS.decode_interactive: - decode_interactively(estimator) + decoding.decode_interactively(estimator) elif FLAGS.decode_from_file is not None: - decode_from_file(estimator, FLAGS.decode_from_file) + decoding.decode_from_file(estimator, FLAGS.decode_from_file) elif FLAGS.decode_from_dataset: - decode_from_dataset(estimator) - - -def decode_from_dataset(estimator): - hparams = estimator.hparams - for i, problem in enumerate(FLAGS.problems.split("-")): - inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) - targets_vocab = hparams.problems[i].vocabulary["targets"] - tf.logging.info("Performing local inference.") - infer_problems_data = get_data_filepatterns(hparams.data_dir, - tf.contrib.learn.ModeKeys.INFER) - - infer_input_fn = get_input_fn( - mode=tf.contrib.learn.ModeKeys.INFER, - hparams=hparams, - data_file_patterns=infer_problems_data, - num_datashards=data_parallelism().n, - fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) - - def log_fn(inputs, - targets, - outputs, - problem, - j, - inputs_vocab=inputs_vocab, - targets_vocab=targets_vocab): - """Log inference results.""" - if "image" in problem and FLAGS.decode_save_images: - save_path = os.path.join(estimator.model_dir, - "%s_prediction_%d.jpg" % (problem, j)) - show_and_save_image(inputs / 255., save_path) - elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) - tf.logging.info("Inference results TARGET: %s" % decoded_targets) - - if FLAGS.decode_to_file: - output_filepath = FLAGS.decode_to_file + ".outputs." + problem - output_file = tf.gfile.Open(output_filepath, "a") - output_file.write(decoded_outputs + "\n") - target_filepath = FLAGS.decode_to_file + ".targets." + problem - target_file = tf.gfile.Open(target_filepath, "a") - target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. 
- inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result - if FLAGS.decode_return_beams: - output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) - else: - log_fn(inputs, targets, outputs, problem, j) - - -def decode_from_file(estimator, filename): - """Compute predictions on entries in filename and write them out.""" - hparams = estimator.hparams - problem_id = FLAGS.decode_problem_id - inputs_vocab = hparams.problems[problem_id].vocabulary["inputs"] - targets_vocab = hparams.problems[problem_id].vocabulary["targets"] - tf.logging.info("Performing decoding from a file.") - sorted_inputs, sorted_keys = _get_sorted_inputs(filename) - num_decode_batches = (len(sorted_inputs) - 1) // FLAGS.decode_batch_size + 1 - input_fn = _decode_batch_input_fn(problem_id, num_decode_batches, - sorted_inputs, inputs_vocab) - - decodes = [] - for _ in range(num_decode_batches): - result_iter = estimator.predict( - input_fn=input_fn.next if six.PY2 else input_fn.__next__, - as_iterable=True) - for result in result_iter: - - def log_fn(inputs, outputs): - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) - tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - - decoded_outputs = targets_vocab.decode( - _save_until_eos(outputs.flatten())) - tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - return decoded_outputs - - if FLAGS.decode_return_beams: - beam_decodes = [] - output_beams = np.split( - result["outputs"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(output_beams): - tf.logging.info("BEAM %d:" % k) - beam_decodes.append(log_fn(result["inputs"], beam)) - decodes.append("\t".join(beam_decodes)) - - else: - decodes.append(log_fn(result["inputs"], result["outputs"])) - - # Reversing the decoded inputs and outputs because they were reversed in - # _decode_batch_input_fn - sorted_inputs.reverse() - decodes.reverse() - # Dumping inputs and outputs to file filename.decodes in - # format result\tinput in the same order as original inputs - if FLAGS.decode_to_file: - output_filename = FLAGS.decode_to_file - else: - output_filename = filename - if FLAGS.decode_shards > 1: - base_filename = output_filename + ("%.2d" % FLAGS.worker_id) - else: - base_filename = output_filename - decode_filename = (base_filename + "." + FLAGS.model + "." 
+ FLAGS.hparams_set - + ".beam" + str(FLAGS.decode_beam_size) + ".alpha" + - str(FLAGS.decode_alpha) + ".decodes") - tf.logging.info("Writing decodes into %s" % decode_filename) - outfile = tf.gfile.Open(decode_filename, "w") - for index in range(len(sorted_inputs)): - outfile.write("%s\n" % (decodes[sorted_keys[index]])) - - -def decode_interactively(estimator): - hparams = estimator.hparams - - infer_input_fn = _interactive_input_fn(hparams) - for problem_idx, example in infer_input_fn: - targets_vocab = hparams.problems[problem_idx].vocabulary["targets"] - result_iter = estimator.predict(input_fn=lambda e=example: e) - for result in result_iter: - if FLAGS.decode_return_beams: - beams = np.split(result["outputs"], FLAGS.decode_beam_size, axis=0) - scores = None - if "scores" in result: - scores = np.split(result["scores"], FLAGS.decode_beam_size, axis=0) - for k, beam in enumerate(beams): - tf.logging.info("BEAM %d:" % k) - beam_string = targets_vocab.decode(_save_until_eos(beam.flatten())) - if scores is not None: - tf.logging.info("%s\tScore:%f" % (beam_string, scores[k])) - else: - tf.logging.info(beam_string) - else: - tf.logging.info( - targets_vocab.decode(_save_until_eos(result["outputs"].flatten()))) - - -def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, - vocabulary): - tf.logging.info(" batch %d" % num_decode_batches) - # First reverse all the input sentences so that if you're going to get OOMs, - # you'll see it in the first batch - sorted_inputs.reverse() - for b in range(num_decode_batches): - tf.logging.info("Decoding batch %d" % b) - batch_length = 0 - batch_inputs = [] - for inputs in sorted_inputs[b * FLAGS.decode_batch_size:( - b + 1) * FLAGS.decode_batch_size]: - input_ids = vocabulary.encode(inputs) - if FLAGS.decode_max_input_size > 0: - # Subtract 1 for the EOS_ID. - input_ids = input_ids[:FLAGS.decode_max_input_size - 1] - input_ids.append(text_encoder.EOS_ID) - batch_inputs.append(input_ids) - if len(input_ids) > batch_length: - batch_length = len(input_ids) - final_batch_inputs = [] - for input_ids in batch_inputs: - assert len(input_ids) <= batch_length - x = input_ids + [0] * (batch_length - len(input_ids)) - final_batch_inputs.append(x) - yield { - "inputs": np.array(final_batch_inputs), - "problem_choice": np.array(problem_id) - } - - -def get_data_filepatterns(data_dir, mode): - return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) - - -def _cond_on_index(fn, index_tensor, cur_idx, max_idx): - """Call fn(index_tensor) using tf.cond in [cur_id, max_idx].""" - if cur_idx == max_idx: - return fn(cur_idx) - return tf.cond( - tf.equal(index_tensor, cur_idx), lambda: fn(cur_idx), - lambda: _cond_on_index(fn, index_tensor, cur_idx + 1, max_idx)) - - -def _interactive_input_fn(hparams): - """Generator that reads from the terminal and yields "interactive inputs". - - Due to temporary limitations in tf.learn, if we don't want to reload the - whole graph, then we are stuck encoding all of the input as one fixed-size - numpy array. - - We yield int64 arrays with shape [const_array_size]. The format is: - [num_samples, decode_length, len(input ids), , ] - - Args: - hparams: model hparams - Yields: - numpy arrays - - Raises: - Exception: when `input_type` is invalid. 
- """ - num_samples = 3 - decode_length = 100 - input_type = "text" - problem_id = 0 - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - # This should be longer than the longest input. - const_array_size = 10000 - # For ease of input, activate the readline module if available. - try: - import readline - except ImportError: - pass - while True: - prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" - " it= ('text' or 'image')\n" - " pr= (set the problem number)\n" - " in= (set the input problem number)\n" - " ou= (set the output problem number)\n" - " ns= (changes number of samples)\n" - " dl= (changes decode length)\n" - " <%s> (decode)\n" - " q (quit)\n" - ">" % (num_samples, decode_length, "source_string" - if has_input else "target_prefix")) - input_string = input(prompt) - if input_string == "q": - return - elif input_string[:3] == "pr=": - problem_id = int(input_string[3:]) - p_hparams = hparams.problems[problem_id] - has_input = "inputs" in p_hparams.input_modality - vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] - elif input_string[:3] == "in=": - problem = int(input_string[3:]) - p_hparams.input_modality = hparams.problems[problem].input_modality - p_hparams.input_space_id = hparams.problems[problem].input_space_id - elif input_string[:3] == "ou=": - problem = int(input_string[3:]) - p_hparams.target_modality = hparams.problems[problem].target_modality - p_hparams.target_space_id = hparams.problems[problem].target_space_id - elif input_string[:3] == "ns=": - num_samples = int(input_string[3:]) - elif input_string[:3] == "dl=": - decode_length = int(input_string[3:]) - elif input_string[:3] == "it=": - input_type = input_string[3:] - else: - if input_type == "text": - input_ids = vocabulary.encode(input_string) - if has_input: - input_ids.append(text_encoder.EOS_ID) - x = [num_samples, decode_length, len(input_ids)] + input_ids - assert len(x) < const_array_size - x += [0] * (const_array_size - len(x)) - yield problem_id, { - "inputs": np.array(x), - "problem_choice": np.array(problem_id) - } - elif input_type == "image": - input_path = input_string - img = read_image(input_path) - yield problem_id, { - "inputs": img, - "problem_choice": np.array(problem_id) - } - else: - raise Exception("Unsupported input type.") - - -def read_image(path): - try: - import matplotlib.image as im # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning( - "Reading an image requires matplotlib to be installed: %s", e) - raise NotImplementedError("Image reading not implemented.") - return im.imread(path) - - -def show_and_save_image(img, save_path): - try: - import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top - except ImportError as e: - tf.logging.warning("Showing and saving an image requires matplotlib to be " - "installed: %s", e) - raise NotImplementedError("Image display and save not implemented.") - plt.imshow(img) - plt.savefig(save_path) - - -def _get_sorted_inputs(filename): - """Returning inputs sorted according to length. - - Args: - filename: path to file with inputs, 1 per line. - - Returns: - a sorted list of inputs - - """ - tf.logging.info("Getting sorted inputs") - # read file and sort inputs according them according to input length. 
- if FLAGS.decode_shards > 1: - decode_filename = filename + ("%.2d" % FLAGS.worker_id) - else: - decode_filename = filename - inputs = [line.strip() for line in tf.gfile.Open(decode_filename)] - input_lens = [(i, len(line.strip().split())) for i, line in enumerate(inputs)] - sorted_input_lens = sorted(input_lens, key=operator.itemgetter(1)) - # We'll need the keys to rearrange the inputs back into their original order - sorted_keys = {} - sorted_inputs = [] - for i, (index, _) in enumerate(sorted_input_lens): - sorted_inputs.append(inputs[index]) - sorted_keys[index] = i - return sorted_inputs, sorted_keys - - -def _interactive_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False if len(inputs.shape) < 3 else True - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - if not input_is_image: - # Remove the batch dimension. - num_samples = x[0] - length = x[2] - x = tf.slice(x, [3], tf.to_int32([length])) - x = tf.reshape(x, [1, -1, 1, 1]) - # Transform into a batch of size num_samples to get that many random - # decodes. - x = tf.tile(x, tf.to_int32([num_samples, 1, 1, 1])) - else: - x = tf.image.resize_images(x, [299, 299]) - x = tf.reshape(x, [1, 299, 299, -1]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = tf.constant(feature_map["problem_choice"]) - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else inputs[1]) - features["inputs"] = x - return features - - -def _decode_input_tensor_to_features_dict(feature_map, hparams): - """Convert the interactive input format (see above) to a dictionary. - - Args: - feature_map: a dictionary with keys `problem_choice` and `input` containing - Tensors. - hparams: model hyperparameters - - Returns: - a features dictionary, as expected by the decoder. - """ - inputs = tf.constant(feature_map["inputs"]) - input_is_image = False - - def input_fn(problem_choice, x=inputs): # pylint: disable=missing-docstring - p_hparams = hparams.problems[problem_choice] - # Add a third empty dimension dimension - x = tf.expand_dims(x, axis=[2]) - x = tf.to_int32(x) - return (tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id), x) - - input_space_id, target_space_id, x = _cond_on_index( - input_fn, feature_map["problem_choice"], 0, len(hparams.problems) - 1) - - features = {} - features["problem_choice"] = feature_map["problem_choice"] - features["input_space_id"] = input_space_id - features["target_space_id"] = target_space_id - features["decode_length"] = (IMAGE_DECODE_LENGTH - if input_is_image else tf.shape(x)[1] + 50) - features["inputs"] = x - return features - - -def get_input_fn(mode, - hparams, - data_file_patterns=None, - num_datashards=None, - fixed_problem=None): - """Provides input to the graph, either from disk or via a placeholder. 
- - This function produces an input function that will feed data into - the network. There are two modes of operation: - - 1. If data_file_pattern and all subsequent arguments are None, then - it creates a placeholder for a serialized tf.Example proto. - 2. If data_file_pattern is defined, it will read the data from the - files at the given location. Use this mode for training, - evaluation, and testing prediction. - - Args: - mode: The execution mode, as defined in tf.contrib.learn.ModeKeys. - hparams: HParams object. - data_file_patterns: The list of file patterns to use to read in data. Set to - `None` if you want to create a placeholder for the input data. The - `problems` flag is a list of problem names joined by the `-` character. - The flag's string is then split along the `-` and each problem gets its - own example queue. - num_datashards: An integer. - fixed_problem: An integer indicating the problem to fetch data for, or None - if the input is to be randomly selected. - - Returns: - A function that returns a dictionary of features and the target labels. - """ - - def input_fn(): - """Supplies input to our model. - - This function supplies input to our model, where this input is a - function of the mode. For example, we supply different data if - we're performing training versus evaluation. - - Returns: - A tuple consisting of 1) a dictionary of tensors whose keys are - the feature names, and 2) a tensor of target labels if the mode - is not INFER (and None, otherwise). - - Raises: - ValueError: if one of the parameters has an unsupported value. - """ - problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_reader"): - for n in xrange(problem_count): - if fixed_problem is not None and n != fixed_problem: - continue - problem_instance = hparams.problem_instances[n] - p_hparams = hparams.problems[n] - with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input reading on CPU - capacity = p_hparams.max_expected_batch_size_per_shard - capacity *= num_datashards - examples = data_reader.input_pipeline( - problem_instance, data_file_patterns[n], capacity, mode) - feature_map = data_reader.batch_examples( - examples, - data_reader.hparams_to_batching_scheme( - hparams, - shard_multiplier=num_datashards, - drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN - or hparams.eval_drop_long_sequences), - length_multiplier=(p_hparams.batch_size_multiplier))) - - # Reverse inputs and targets features if the problem was reversed. - if problem_instance is not None: - problem_instance.maybe_reverse_features(feature_map) - problem_instance.maybe_copy_features(feature_map) - else: - if p_hparams.was_reversed: - inputs = feature_map["inputs"] - targets = feature_map["targets"] - feature_map["inputs"] = targets - feature_map["targets"] = inputs - # Use the inputs as the targets if the problem is a copy problem. - if p_hparams.was_copy: - feature_map["targets"] = feature_map["inputs"] - - # Ensure inputs and targets are proper rank. - while len(feature_map["inputs"].get_shape()) != 4: - feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1) - while len(feature_map["targets"].get_shape()) != 4: - feature_map["targets"] = tf.expand_dims( - feature_map["targets"], axis=-1) - - batches.append( - (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(p_hparams.input_space_id), - tf.constant(p_hparams.target_space_id))) - - # We choose which problem to process. - loss_moving_avgs = [] # Need loss moving averages for that. 
- for n in xrange(problem_count): - with tf.variable_scope("losses_avg"): - loss_moving_avgs.append( - tf.get_variable( - "problem_%d/total_loss" % n, initializer=100.0, - trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) - if fixed_problem is None: - if (hparams.problem_choice == "uniform" or - mode != tf.contrib.learn.ModeKeys.TRAIN): - problem_choice = tf.random_uniform( - [], maxval=problem_count, dtype=tf.int32) - elif hparams.problem_choice == "adaptive": - loss_moving_avgs = tf.stack(loss_moving_avgs) - problem_choice = tf.multinomial( - tf.reshape(loss_moving_avgs, [1, -1]), 1) - problem_choice = tf.to_int32(tf.squeeze(problem_choice)) - elif hparams.problem_choice == "distributed": - assert FLAGS.worker_replicas >= problem_count - assert FLAGS.worker_replicas % problem_count == 0 - problem_choice = tf.to_int32(FLAGS.worker_id % problem_count) - else: - raise ValueError( - "Value of hparams.problem_choice is %s and must be " - "one of [uniform, adaptive, distributed]" % hparams.problem_choice) - - # Inputs and targets conditional on problem_choice. - rand_inputs, rand_target, choice, inp_id, tgt_id = _cond_on_index( - lambda n: batches[n], problem_choice, 0, problem_count - 1) - else: - problem_choice = tf.constant(fixed_problem) - # Take the only constructed batch, which is the fixed_problem. - rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0] - - # Set shapes so the ranks are clear. - rand_inputs.set_shape([None, None, None, None]) - rand_target.set_shape([None, None, None, None]) - choice.set_shape([]) - inp_id.set_shape([]) - tgt_id.set_shape([]) - # Forced shape obfuscation is necessary for inference. - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_inputs._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - rand_target._shape = tf.TensorShape([None, None, None, None]) # pylint: disable=protected-access - - # Final feature map. - rand_feature_map = { - "inputs": rand_inputs, - "problem_choice": choice, - "input_space_id": inp_id, - "target_space_id": tgt_id - } - if mode == tf.contrib.learn.ModeKeys.INFER: - rand_feature_map["infer_targets"] = rand_target - rand_target = None - return rand_feature_map, rand_target - - return input_fn - - -class _ConditionalOptimizer(tf.train.Optimizer): - """Conditional optimizer.""" - - def __init__(self, optimizer_name, lr, hparams): - if optimizer_name == "Adam": - # We change the default epsilon for Adam and re-scale lr. - # Using LazyAdam as it's much faster for large vocabulary embeddings. 
- self._opt = tf.contrib.opt.LazyAdamOptimizer( - lr / 500.0, - beta1=hparams.optimizer_adam_beta1, - beta2=hparams.optimizer_adam_beta2, - epsilon=hparams.optimizer_adam_epsilon) - elif optimizer_name == "Momentum": - self._opt = tf.train.MomentumOptimizer( - lr, momentum=hparams.optimizer_momentum_momentum) - elif optimizer_name == "YellowFin": - tf.logging.info("Init YellowFin Optimizer.") - self._opt = yellowfin.YellowFinOptimizer( - learning_rate=lr, momentum=hparams.optimizer_momentum_momentum) - else: - self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr) - - def compute_gradients(self, loss, var_list, colocate_gradients_with_ops): - return self._opt.compute_gradients( - loss, var_list, colocate_gradients_with_ops=colocate_gradients_with_ops) - - def apply_gradients(self, gradients, global_step=None, name=None): - return self._opt.apply_gradients( - gradients, global_step=global_step, name=name) - - -def _sqrt_decay(step): - """Decay like 1 / sqrt(step), multiplied by 500 to normalize.""" - return 500.0 / tf.sqrt(tf.maximum(step, 1.0)) - - -def _exp_decay_after(step, rate, from_which_step): - """Decay exponentially by rate (per step) starting at from_which_step.""" - return tf.cond( - step < from_which_step, - lambda: tf.constant(1.0), - lambda: rate**(step - from_which_step), - name="exponential_decay_step_cond") - - -def _ps_replicas(all_workers=False): - if all_workers: - return list(range(FLAGS.ps_replicas)) - # Worker K will be using replicas {0,...n-1} + K*n if we have n replicas. - num_replicas = FLAGS.ps_replicas // FLAGS.worker_replicas - return [d + FLAGS.worker_id * num_replicas for d in xrange(num_replicas)] - - -def _gpu_order(num_gpus): - if FLAGS.gpu_order: - ret = [int(s) for s in FLAGS.gpu_order.split(" ")] - if len(ret) == num_gpus: - return ret - return list(range(num_gpus)) - - -def _ps_gpus(all_workers=False): - ps_gpus = [] - for d in _ps_replicas(all_workers=all_workers): - ps_gpus.extend([(d, gpu) for gpu in _gpu_order(FLAGS.ps_gpu)]) - return ps_gpus - - -def _ps_devices(all_workers=False): - """List of ps devices (where to put the experts). - - Args: - all_workers: whether the list is for all async workers or just this one. - - Returns: - a list of device names - """ - if FLAGS.ps_replicas > 0: - if FLAGS.ps_gpu > 0: - return [ - FLAGS.ps_job + "/task:%d/GPU:%d" % (d, gpu) - for (d, gpu) in _ps_gpus(all_workers=all_workers) - ] - else: - return [ - FLAGS.ps_job + "/task:%d" % d - for d in _ps_replicas(all_workers=all_workers) - ] - else: - if FLAGS.worker_gpu > 0: - return ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - else: - return [""] - - -def data_parallelism(all_workers=False): - """Over which devices do we split each training batch. - - In old-fashioned async mode, we split the batch over all GPUs on the - current worker. - - In sync mode, we split the batch over all the parameter server GPUs. - - This function returns an expert_utils.Parallelism object, which can be used - to build the model. It is configured in a way that any variables created - by `tf.get_variable` will be assigned to the parameter servers and shared - between datashards. - - Args: - all_workers: whether the devices are all async workers or just this one. - - Returns: - a expert_utils.Parallelism. 
- """ - - def _replica_device_setter(worker_device): - if FLAGS.ps_replicas == 0: - return worker_device - return tf.train.replica_device_setter( - worker_device=worker_device, - ps_tasks=FLAGS.ps_replicas, - ps_device=FLAGS.ps_job + "/GPU:0" if FLAGS.ps_gpu > 0 else FLAGS.ps_job) - - if FLAGS.schedule == "local_run": - assert not FLAGS.sync - datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: - datashard_devices += ["cpu:0"] - caching_devices = None - elif FLAGS.sync: - assert FLAGS.ps_replicas > 0 - datashard_devices = [ - _replica_device_setter(d) for d in _ps_devices(all_workers=all_workers) - ] - if FLAGS.ps_gpu > 0 and FLAGS.ps_replicas > 1: - caching_devices = [ - FLAGS.ps_job + "/task:%d/cpu:0" % d - for (d, _) in _ps_gpus(all_workers=all_workers) - ] - else: - caching_devices = None - else: - # old fashioned async - compute on worker - if FLAGS.worker_gpu > 1: - datashard_devices = [ - _replica_device_setter(FLAGS.worker_job + "/GPU:%d" % d) - for d in _gpu_order(FLAGS.worker_gpu) - ] - caching_devices = [FLAGS.worker_job + "/GPU:0"] * FLAGS.worker_gpu - else: - datashard_devices = [_replica_device_setter(FLAGS.worker_job)] - caching_devices = None - tf.logging.info("datashard_devices: %s", datashard_devices) - tf.logging.info("caching_devices: %s", caching_devices) - return eu.Parallelism( - datashard_devices, - reuse=True, - caching_devices=caching_devices, - daisy_chain_variables=FLAGS.daisy_chain_variables) + decoding.decode_from_dataset(estimator) From accf019e8676afbecc14048fe2151ae85a645be1 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Wed, 9 Aug 2017 12:01:20 +0000 Subject: [PATCH 06/10] Import readline in decoding.py --- tensor2tensor/utils/decoding.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 tensor2tensor/utils/decoding.py diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py old mode 100644 new mode 100755 index 12057d8e6..cf981a1e3 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -248,6 +248,11 @@ def _interactive_input_fn(hparams): vocabulary = p_hparams.vocabulary["inputs" if has_input else "targets"] # This should be longer than the longest input. 
const_array_size = 10000 + # Import readline if available for command line editing and recall + try: + import readline + except ImportError: + pass while True: prompt = ("INTERACTIVE MODE num_samples=%d decode_length=%d \n" " it= ('text' or 'image' or 'label')\n" @@ -255,7 +260,7 @@ def _interactive_input_fn(hparams): " in= (set the input problem number)\n" " ou= (set the output problem number)\n" " ns= (changes number of samples)\n" - " dl= (changes decode legnth)\n" + " dl= (changes decode length)\n" " <%s> (decode)\n" " q (quit)\n" ">" % (num_samples, decode_length, "source_string" From b3de49a72743d212fef786e80ee01044eb89be98 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Thu, 10 Aug 2017 17:59:50 +0000 Subject: [PATCH 07/10] Larger source vocab; adapt to new upstream version --- tensor2tensor/ice_parsing/ice_parsing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/ice_parsing/ice_parsing.py b/tensor2tensor/ice_parsing/ice_parsing.py index d8dd41cf7..df9748589 100755 --- a/tensor2tensor/ice_parsing/ice_parsing.py +++ b/tensor2tensor/ice_parsing/ice_parsing.py @@ -66,7 +66,7 @@ class IceParsingTokens(problem.Problem): @property def source_vocab_size(self): - return 2**13 # 8192 + return 2**14 # 16384 @property def target_vocab_size(self): @@ -84,18 +84,18 @@ def feature_encoders(self, data_dir): "targets": target_subtokenizer, } - def generate_data(self, data_dir, tmp_dir, num_shards=100): + def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, self.target_vocab_size), - self.training_filepaths(data_dir, num_shards, shuffled=False), + self.training_filepaths(data_dir, 1, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, self.target_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) - def hparams(self, defaults, unused_model_hparams): + def hparams(self, defaults, model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} From b4de995cec2430acf61c367fbb61a00f61fc5097 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 00:17:55 +0000 Subject: [PATCH 08/10] Moved ice_parsing to data_generators; updated to 1.1.7 --- tensor2tensor/data_generators/all_problems.py | 1 + .../ice_parsing.py | 44 +++++++------------ tensor2tensor/data_generators/wmt.py | 30 ------------- tensor2tensor/ice_parsing/__init__.py | 2 - tensor2tensor/models/transformer.py | 9 ++++ 5 files changed, 27 insertions(+), 59 deletions(-) mode change 100644 => 100755 tensor2tensor/data_generators/all_problems.py rename tensor2tensor/{ice_parsing => data_generators}/ice_parsing.py (82%) delete mode 100644 tensor2tensor/ice_parsing/__init__.py diff --git a/tensor2tensor/data_generators/all_problems.py b/tensor2tensor/data_generators/all_problems.py old mode 100644 new mode 100755 index ca6dccfda..10a4764f5 --- a/tensor2tensor/data_generators/all_problems.py +++ b/tensor2tensor/data_generators/all_problems.py @@ -31,6 +31,7 @@ from tensor2tensor.data_generators import wiki from tensor2tensor.data_generators import wmt from tensor2tensor.data_generators import wsj_parsing +from tensor2tensor.data_generators import ice_parsing # Problem modules that require optional dependencies diff --git a/tensor2tensor/ice_parsing/ice_parsing.py 
b/tensor2tensor/data_generators/ice_parsing.py similarity index 82% rename from tensor2tensor/ice_parsing/ice_parsing.py rename to tensor2tensor/data_generators/ice_parsing.py index df9748589..f6e6bdca4 100755 --- a/tensor2tensor/ice_parsing/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -28,7 +28,6 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators.wmt import tabbed_generator from tensor2tensor.utils import registry -from tensor2tensor.models import transformer import tensorflow as tf @@ -69,9 +68,21 @@ def source_vocab_size(self): return 2**14 # 16384 @property - def target_vocab_size(self): + def targeted_vocab_size(self): return 2**8 # 256 + @property + def input_space_id(self): + return problem.SpaceID.ICE_TOK + + @property + def target_space_id(self): + return problem.SpaceID.ICE_PARSE_TOK + + @property + def num_shards(self): + return 10 + def feature_encoders(self, data_dir): source_vocab_filename = os.path.join( data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) @@ -89,7 +100,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, self.target_vocab_size), - self.training_filepaths(data_dir, 1, shuffled=False), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, self.target_vocab_size), @@ -99,29 +110,8 @@ def hparams(self, defaults, model_hparams): p = defaults source_vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size) - p.input_space_id = problem.SpaceID.ICE_TOK - p.target_space_id = problem.SpaceID.ICE_PARSE_TOK + p.target_modality = (registry.Modalities.SYMBOL, self.targeted_vocab_size) + p.input_space_id = self.input_space_id + p.target_space_id = self.target_space_id p.loss_multiplier = 2.5 # Rough estimate of avg number of tokens per word - -@registry.register_hparams -def transformer_parsing_ice(): - """Hparams for parsing Icelandic text.""" - hparams = transformer.transformer_base_single_gpu() - hparams.batch_size = 4096 - hparams.shared_embedding_and_softmax_weights = int(False) - return hparams - - -@registry.register_hparams -def transformer_parsing_ice_big(): - """Hparams for parsing Icelandic text, bigger model.""" - hparams = transformer_parsing_ice() - hparams.batch_size = 2048 # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU - hparams.attention_dropout = 0.05 - hparams.residual_dropout = 0.05 - hparams.max_length = 512 - hparams.hidden_size = 1024 - return hparams - diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index f673dee82..f66e366d1 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -187,36 +187,6 @@ def bi_vocabs_token_generator(source_path, source, target = source_file.readline(), target_file.readline() -def tabbed_generator(source_path, source_vocab, target_vocab, eos=None): - r"""Generator for sequence-to-sequence tasks using tabbed files. - - Tokens are derived from text files where each line contains both - a source and a target string. The two strings are separated by a tab - character ('\t'). 
It yields dictionaries of "inputs" and "targets" where - inputs are characters from the source lines converted to integers, and - targets are characters from the target lines, also converted to integers. - - Args: - source_path: path to the file with source and target sentences. - source_vocab: a SunwordTextEncoder to encode the source string. - target_vocab: a SunwordTextEncoder to encode the target string. - eos: integer to append at the end of each sequence (default: None). - - Yields: - A dictionary {"inputs": source-line, "targets": target-line} where - the lines are integer lists converted from characters in the file lines. - """ - eos_list = [] if eos is None else [eos] - with tf.gfile.GFile(source_path, mode="r") as source_file: - for line in source_file: - if line and "\t" in line: - parts = line.split("\t", maxsplit=1) - source, target = parts[0].strip(), parts[1].strip() - source_ints = source_vocab.encode(source) + eos_list - target_ints = target_vocab.encode(target) + eos_list - yield {"inputs": source_ints, "targets": target_ints} - - # Data-set URLs. diff --git a/tensor2tensor/ice_parsing/__init__.py b/tensor2tensor/ice_parsing/__init__.py deleted file mode 100644 index 36f468dcb..000000000 --- a/tensor2tensor/ice_parsing/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -from .ice_parsing import IceParsingTokens, transformer_parsing_ice, transformer_parsing_ice_big diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c6fb74958..f1b2d761f 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -391,6 +391,15 @@ def transformer_parsing_big(): return hparams +@registry.register_hparams +def transformer_parsing_ice(): + """Hparams for parsing and tagging Icelandic text.""" + hparams = transformer.transformer_base_single_gpu() + hparams.batch_size = 4096 + hparams.shared_embedding_and_softmax_weights = int(False) + return hparams + + @registry.register_hparams def transformer_tiny(): hparams = transformer_base() From b9e216b4c76ca973773a6bd4a04372a4dc4cffe3 Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 00:22:58 +0000 Subject: [PATCH 09/10] Adaptation to 1.1.7 --- tensor2tensor/bin/t2t-datagen | 5 ----- tensor2tensor/data_generators/problem_hparams.py | 2 -- tensor2tensor/data_generators/wmt.py | 3 +-- tensor2tensor/models/transformer.py | 2 +- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index 8ea0d9bc6..97bbd1241 100755 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -82,11 +82,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { "algorithmic_algebra_inverse": ( lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "ice_parsing_characters": ( - lambda: wmt.tabbed_parsing_character_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True), - lambda: wmt.tabbed_parsing_character_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False)), "wmt_parsing_tokens_8k": ( lambda: wmt.parsing_token_generator( FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13), diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 2f417a992..b0ed44f5b 100755 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -511,8 +511,6 @@ def image_celeba(unused_model_hparams): lm1b_32k, "wiki_32k": wiki_32k, - "ice_parsing_characters": - wmt_parsing_characters, 
"wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), "wsj_parsing_tokens_16k": diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index f66e366d1..35d1b5fca 100755 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -65,6 +65,7 @@ def use_subword_tokenizer(self): # Generic generators used later for multiple problems. + def character_generator(source_path, target_path, character_vocab, eos=None): """Generator for sequence-to-sequence tasks that just uses characters. @@ -654,5 +655,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size): tree_filepath = os.path.join(tmp_dir, filename) return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab, symbolizer_vocab, EOS) - - diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index f1b2d761f..fa7ecdf81 100755 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -394,7 +394,7 @@ def transformer_parsing_big(): @registry.register_hparams def transformer_parsing_ice(): """Hparams for parsing and tagging Icelandic text.""" - hparams = transformer.transformer_base_single_gpu() + hparams = transformer_base_single_gpu() hparams.batch_size = 4096 hparams.shared_embedding_and_softmax_weights = int(False) return hparams From ab9b00465add968ee1a09bd749cd35f53a9659cd Mon Sep 17 00:00:00 2001 From: vthorsteinsson Date: Fri, 11 Aug 2017 11:43:59 +0000 Subject: [PATCH 10/10] Bugfix in ice_parsing.py --- tensor2tensor/data_generators/ice_parsing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensor2tensor/data_generators/ice_parsing.py b/tensor2tensor/data_generators/ice_parsing.py index f6e6bdca4..7a90fec45 100755 --- a/tensor2tensor/data_generators/ice_parsing.py +++ b/tensor2tensor/data_generators/ice_parsing.py @@ -87,7 +87,7 @@ def feature_encoders(self, data_dir): source_vocab_filename = os.path.join( data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size) target_vocab_filename = os.path.join( - data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size) + data_dir, "ice_target.tokens.vocab.%d" % self.targeted_vocab_size) source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename) target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename) return { @@ -99,11 +99,11 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): generator_utils.generate_dataset_and_shuffle( tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice", self.source_vocab_size, - self.target_vocab_size), + self.targeted_vocab_size), self.training_filepaths(data_dir, self.num_shards, shuffled=False), tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice", self.source_vocab_size, - self.target_vocab_size), + self.targeted_vocab_size), self.dev_filepaths(data_dir, 1, shuffled=False)) def hparams(self, defaults, model_hparams):