From 2e55ec24d728be1323ba3b20b08facb4abf8004e Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 12:30:18 -0700 Subject: [PATCH 1/6] Modality.loss PiperOrigin-RevId: 163376624 --- tensor2tensor/models/modalities.py | 68 ++++++----------------- tensor2tensor/models/modalities_test.py | 10 ++-- tensor2tensor/utils/modality.py | 73 +++++++++++++------------ tensor2tensor/utils/t2t_model.py | 13 +++-- 4 files changed, 69 insertions(+), 95 deletions(-) diff --git a/tensor2tensor/models/modalities.py b/tensor2tensor/models/modalities.py index 20464c0a2..c57a97905 100644 --- a/tensor2tensor/models/modalities.py +++ b/tensor2tensor/models/modalities.py @@ -96,12 +96,11 @@ def targets_bottom(self, x): else: return self.bottom_simple(x, "target_emb", reuse=None) - def top(self, body_output, targets): + def top(self, body_output, _): """Generate logits. Args: body_output: A Tensor with shape [batch, p0, p1, body_input_depth] - targets: A Tensor with shape [batch, p0, p1, 1] Returns: logits: A Tensor with shape [batch, p0, p1, ?, vocab_size]. """ @@ -192,18 +191,11 @@ def top(self, body_output, _): return logits - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. # (Since we're processing images and so have no padding and some pixel 0s.) - return super(SmallImageModality, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + return super(SmallImageModality, self).loss( + top_out, targets, weights_fn=weights_fn) @registry.register_image_modality("default") @@ -425,18 +417,11 @@ def top(self, body_output, _): res = common_layers.conv(x, self._vocab_size, (1, 1)) return tf.expand_dims(res, 3) - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. - # (Since we're processing images and so have no padding and some labels 0.) - return super(ClassLabelModality, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + # (Since we're processing images and so have no padding and some pixel 0s.) 
+ return super(ClassLabelModality, self).loss( + top_out, targets, weights_fn=weights_fn) @registry.register_class_label_modality("class_label_2d") @@ -479,24 +464,12 @@ def top(self, body_output, _): with tf.variable_scope("real"): return tf.layers.dense(body_output, self._vocab_size) - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_nonzero): - sharded_predictions = data_parallelism(self.top, sharded_body_output, - sharded_targets) - - def l2_loss(predictions, targets): - with tf.name_scope("l2"): - weights = weights_fn(targets) - l2 = tf.pow(predictions - targets, 2) - return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) - - loss_num, loss_den = data_parallelism(l2_loss, sharded_predictions, - sharded_targets) - loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) - return sharded_predictions, loss + def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + predictions = top_out + with tf.name_scope("l2"): + weights = weights_fn(targets) + l2 = tf.pow(predictions - targets, 2) + return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) @registry.register_image_modality("identity_no_pad") @@ -513,15 +486,8 @@ def bottom(self, x): def top(self, body_output, _): return body_output - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_all): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): # Call the default implementation, but weight 1.0 on 0s by default. # (Since we're processing images and so have no padding and some pixel 0s.) - return super(IdentityModalityNoPad, self).top_sharded( - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=weights_fn) + return super(IdentityModalityNoPad, self).loss( + top_out, targets, weights_fn=weights_fn) diff --git a/tensor2tensor/models/modalities_test.py b/tensor2tensor/models/modalities_test.py index 4254c6b04..9130613b9 100644 --- a/tensor2tensor/models/modalities_test.py +++ b/tensor2tensor/models/modalities_test.py @@ -41,8 +41,8 @@ def testSymbolModalityInputs(self): hidden_size=hidden_size, multiply_embedding_mode="sqrt_depth", shared_embedding_and_softmax_weights=0) - x = -1 + np.random.random_integers(vocab_size, size=( - batch_size, length, 1, 1)) + x = -1 + np.random.random_integers( + vocab_size, size=(batch_size, length, 1, 1)) m = modalities.SymbolModality(model_hparams, vocab_size) data_parallelism = expert_utils.Parallelism( ["/device:CPU:0"] * num_datashards, reuse=True) @@ -76,8 +76,10 @@ def testSymbolModalityTargets(self): with self.test_session() as session: sharded_body_output = tf.split(tf.to_float(body_output), num_datashards) sharded_targets = tf.split(targets, num_datashards) - sharded_logits, train_loss = m.top_sharded( - sharded_body_output, sharded_targets, data_parallelism) + sharded_logits = m.top_sharded(sharded_body_output, sharded_targets, + data_parallelism) + train_loss = m.loss_sharded(sharded_logits, sharded_targets, + data_parallelism) logits = tf.concat(sharded_logits, 0) session.run(tf.global_variables_initializer()) res1, res2 = session.run((logits, train_loss)) diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py index 72169be1f..5c596e10f 100644 --- a/tensor2tensor/utils/modality.py +++ b/tensor2tensor/utils/modality.py @@ -31,23 +31,26 @@ class Modality(object): """Abstract Modality class for data transformations. 
An abstract class representing modalities for transforming data to a space - interpretable by sequence models. It has 3 functions: - * bottom: called on inputs entering the model. + interpretable by T2T models. It has 4 functions: + * bottom: called on inputs entering the model. * targets_bottom: called on targets entering the model (e.g., the decoder). - * top: called on targets to generate predictions. - - For example, think about a modality for images. The inputs_bottom function - represents the part of the model applied to an incoming image, e.g., an entry - flow of a convolutional network. The targets_top function represents the top - part of a model that is generating images, e.g., a PixelCNN network. The final - function targets_bottom represents the auto-regressive part of the network. - It is applied to the already-generated part of an image, which is given to - the decoder to generate the next part. In some cases, e.g., for text, it is - the same as the inputs_bottom function, and that is the default we use. But, - e.g., for images, a different function might be needed to regress properly. - - All 3 functions have simple and sharded versions. A sub-class only needs - to implement the simple version, the default sharding will be used then. + * top: called on model outputs to generate predictions (e.g., logits). + * loss: called on predictions (outputs of top) and targets. + + For example, think about a modality for images: + * `bottom` represents the part of the model applied to an incoming image, + e.g., an entry flow of a convolutional network. + * `top` represents the top part of a model that is generating images, e.g., a + PixelCNN network. + * `targets_bottom` represents the auto-regressive part of the network. It is + applied to the already-generated part of an image, which is given to the + decoder to generate the next part. In some cases, e.g., for text, it is the + same as the `bottom` function, and that is the default we use. But, e.g., + for images, a different function might be needed to regress properly. + * `loss` would compare the generated image to the target image and score it. + + All the functions have simple and sharded versions. A sub-class only needs to + implement the simple version, the default sharding will be used then. """ def __init__(self, model_hparams, vocab_size=None): @@ -116,7 +119,7 @@ def targets_bottom_sharded(self, xs, data_parallelism): return data_parallelism(self.targets_bottom, xs) def top(self, body_output, targets): - """Transform one shard of output. + """Generate predictions/logits for one shard of output. Most classes will override this function. @@ -129,12 +132,8 @@ def top(self, body_output, targets): """ raise NotImplementedError("Abstract Method") - def top_sharded(self, - sharded_body_output, - sharded_targets, - data_parallelism, - weights_fn=common_layers.weights_nonzero): - """Transform all shards of targets. + def top_sharded(self, sharded_body_output, sharded_targets, data_parallelism): + """Generate predictions/logits for all shards. Classes with cross-shard interaction will override this function. @@ -142,18 +141,24 @@ def top_sharded(self, sharded_body_output: A list of Tensors. sharded_targets: A list of Tensors. data_parallelism: a expert_utils.Parallelism object. - weights_fn: function from targets to target weights. Returns: - shaded_logits: A list of Tensors. - training_loss: a Scalar. + sharded_logits: A list of Tensors. 
""" - sharded_logits = data_parallelism(self.top, sharded_body_output, - sharded_targets) - loss_num, loss_den = data_parallelism( - common_layers.padded_cross_entropy, - sharded_logits, - sharded_targets, + return data_parallelism(self.top, sharded_body_output, sharded_targets) + + def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + """Compute loss numerator and denominator for one shard of output.""" + logits = top_out + return common_layers.padded_cross_entropy( + logits, + targets, self._model_hparams.label_smoothing, weights_fn=weights_fn) - loss = tf.add_n(loss_num) / tf.maximum(1.0, tf.add_n(loss_den)) - return sharded_logits, loss + + def loss_sharded(self, sharded_top_out, sharded_targets, data_parallelism): + """Compute loss for all shards.""" + sharded_loss_num, sharded_loss_den = data_parallelism( + self.loss, sharded_top_out, sharded_targets) + loss = tf.add_n(sharded_loss_num) / tf.maximum(1.0, + tf.add_n(sharded_loss_den)) + return loss diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 66e40d495..f67cc9540 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -424,8 +424,10 @@ def model_fn(self, features, skip=False, last_position_only=False): with tf.variable_scope(target_modality.name, reuse=target_reuse): if not last_position_only: - sharded_logits, training_loss = (target_modality.top_sharded( - body_outputs, sharded_features["targets"], self._data_parallelism)) + sharded_logits = target_modality.top_sharded( + body_outputs, sharded_features["targets"], self._data_parallelism) + training_loss = target_modality.loss_sharded( + sharded_logits, sharded_features["targets"], self._data_parallelism) training_loss *= self._problem_hparams.loss_multiplier else: @@ -439,10 +441,9 @@ def model_fn(self, features, skip=False, last_position_only=False): tf.expand_dims(target_shard[:, -1:, :, :], axis=[1]) for target_shard in sharded_features["targets"] ] - sharded_logits, training_loss = (target_modality.top_sharded( - last_position_body_outputs, last_position_targets, - self._data_parallelism)) - + sharded_logits = target_modality.top_sharded(last_position_body_outputs, + last_position_targets, + self._data_parallelism) training_loss = None tf.logging.info("This model_fn took %.3f sec." % (time.time() - start_time)) From b82bdfd13a94a05a82dc6755126c1ad1bcc6c02c Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 27 Jul 2017 16:05:10 -0700 Subject: [PATCH 2/6] correct metrics and some generator and python3 corrections. 
PiperOrigin-RevId: 163402917 --- tensor2tensor/data_generators/gene_expression.py | 12 ++++++------ tensor2tensor/data_generators/generator_utils.py | 2 +- tensor2tensor/data_generators/text_encoder.py | 4 ++-- tensor2tensor/data_generators/wmt.py | 7 ++++--- tensor2tensor/utils/metrics.py | 3 +++ 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 31d1cd150..1bb9d4ab3 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -110,10 +110,10 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [(self.training_filepaths, self.num_shards, "train", - num_train_examples), (self.dev_filepaths, 1, "valid", - num_dev_examples), - (self.test_filepaths, 1, "test", num_test_examples)] + datasets = [ + (self.training_filepaths, self.num_shards, "train", num_train_examples), + (self.dev_filepaths, 10, "valid", num_dev_examples), + (self.test_filepaths, 10, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) @@ -125,8 +125,8 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): start_idx, end_idx)) processes.append(p) - # 1 per training shard + dev + test - assert len(processes) == self.num_shards + 2 + # 1 per training shard + 10 for dev + 10 for test + assert len(processes) == self.num_shards + 20 # Start and wait for processes in batches num_batches = int( diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index 866a0f3e7..5c7f9f2a1 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -305,7 +305,7 @@ def generate(): # Use Tokenizer to count the word occurrences. with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = 3.5e5 if "en" in filepath else 7e5 + file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5 for line in source_file: if file_byte_budget <= 0: break diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 4bb1c875d..ff284bcc6 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -53,7 +53,7 @@ # '\u' is converted to '_' # '\\' is converted to '\' # '\213;' is converted to unichr(213) -_UNESCAPE_REGEX = re.compile(ur"\\u|\\\\|\\([0-9]+);") +_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") _ESCAPE_CHARS = set(u"\\_;0123456789") @@ -219,7 +219,7 @@ def _escape_token(token, alphabet): token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") ret = [ - c if c in alphabet and c != u"\n" else ur"\%d;" % ord(c) + c if c in alphabet and c != u"\n" else r"\%d;" % ord(c) for c in token] return u"".join(ret) + "_" diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 97b191096..7fde9b3b4 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -404,14 +404,15 @@ def _compile_data(tmp_dir, datasets, filename): generator_utils.maybe_download(tmp_dir, compressed_filename, url) if not (os.path.exists(lang1_filepath) and os.path.exists(lang2_filepath)): - mode = "r:gz" if "gz" in compressed_filepath else "r" + # For .tar.gz and .tgz files, we read compressed. 
+ mode = "r:gz" if compressed_filepath.endswith("gz") else "r" with tarfile.open(compressed_filepath, mode) as corpus_tar: corpus_tar.extractall(tmp_dir) - if ".gz" in lang1_filepath: + if lang1_filepath.endswith(".gz"): new_filepath = lang1_filepath.strip(".gz") generator_utils.gunzip_file(lang1_filepath, new_filepath) lang1_filepath = new_filepath - if ".gz" in lang2_filepath: + if lang2_filepath.endswith(".gz"): new_filepath = lang2_filepath.strip(".gz") generator_utils.gunzip_file(lang2_filepath, new_filepath) lang2_filepath = new_filepath diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index ae9ce3882..4435707cd 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -59,6 +59,7 @@ def padded_accuracy_topk(predictions, effective_k = tf.minimum(k, tf.shape(padded_predictions)[-1]) _, outputs = tf.nn.top_k(padded_predictions, k=effective_k) outputs = tf.to_int32(outputs) + padded_labels = tf.to_int32(padded_labels) padded_labels = tf.expand_dims(padded_labels, axis=-1) padded_labels += tf.zeros_like(outputs) # Pad to same shape. same = tf.to_float(tf.equal(outputs, padded_labels)) @@ -82,6 +83,7 @@ def padded_sequence_accuracy(predictions, predictions, labels) weights = weights_fn(padded_labels) outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1)) + padded_labels = tf.to_int32(padded_labels) not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights axis = list(range(1, len(outputs.get_shape()))) correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis)) @@ -106,6 +108,7 @@ def padded_accuracy(predictions, predictions, labels) weights = weights_fn(padded_labels) outputs = tf.to_int32(tf.argmax(padded_predictions, axis=-1)) + padded_labels = tf.to_int32(padded_labels) return tf.to_float(tf.equal(outputs, padded_labels)), weights From 8ad79b60d29bef80c7724f5d5d5dfa0ff2ff8cab Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 16:39:45 -0700 Subject: [PATCH 3/6] Move examples reading to new Datasets API PiperOrigin-RevId: 163407588 --- .../data_generators/gene_expression.py | 4 +- tensor2tensor/utils/data_reader.py | 90 +++++++++---------- tensor2tensor/utils/data_reader_test.py | 8 +- tensor2tensor/utils/trainer_utils.py | 35 ++++---- 4 files changed, 64 insertions(+), 73 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index 1bb9d4ab3..60e38a90f 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -168,8 +168,8 @@ def preprocess_examples(self, examples, mode): # Reshape targets examples["targets"] = tf.reshape(examples["targets"], - [-1, 1, self.num_output_predictions]) - examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1]) + [-1, self.num_output_predictions]) + examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1]) # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. # Add epsilon because some unmasked labels are actually 0. 
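
An aside on the tf.to_int32 casts added to the metrics in PATCH 2/6: tf.argmax returns int64 indices, which the metrics immediately cast to int32, while labels frequently arrive as int64 (the Example features are declared int64), so tf.equal can fail with a dtype mismatch unless the labels get the same cast. The snippet below is a minimal standalone sketch of that pattern, not code from the repository; the tensor shapes and values are made up for illustration (TF 1.x API).

    import tensorflow as tf

    # Hypothetical shapes: [batch, length, vocab] predictions, [batch, length] labels.
    predictions = tf.random_uniform([2, 5, 7])
    labels = tf.to_int64(tf.zeros([2, 5]))  # labels commonly arrive as int64

    outputs = tf.to_int32(tf.argmax(predictions, axis=-1))  # argmax yields int64
    labels = tf.to_int32(labels)  # the matching cast added in PATCH 2/6
    accuracy = tf.reduce_mean(tf.to_float(tf.equal(outputs, labels)))

    with tf.Session() as sess:
      print(sess.run(accuracy))
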
diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 24dd31485..e78e22344 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -20,6 +20,7 @@ import math import os +import random # Dependency imports @@ -33,19 +34,15 @@ import tensorflow as tf -def examples_queue(data_sources, - data_fields_to_features, - training, - capacity=32, - data_items_to_decoders=None, - data_items_to_decode=None): - """Contruct a queue of training or evaluation examples. +def examples_reader(data_sources, + data_fields_to_features, + training, + capacity=32, + data_items_to_decoders=None, + data_items_to_decode=None): + """Reads Examples from data_sources and decodes to Tensors. - This function will create a reader from files given by data_sources, - then enqueue the tf.Examples from these files, shuffling if training - is true, and finally parse these tf.Examples to tensors. - - The dictionary data_fields_to_features for an image dataset can be this: + The dictionary data_fields_to_features for an image dataset can be: data_fields_to_features = { 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), @@ -54,7 +51,7 @@ def examples_queue(data_sources, [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), } - and for a simple algorithmic dataset with variable-length data it is this: + and for a simple algorithmic dataset with variable-length data it is: data_fields_to_features = { 'inputs': tf.VarLenFeature(tf.int64), @@ -63,7 +60,7 @@ def examples_queue(data_sources, The data_items_to_decoders dictionary argument can be left as None if there is no decoding to be performed. But, e.g. for images, it should be set so that - the images are decoded from the features, e.g., like this for MNIST: + the images are decoded from the features, e.g., for MNIST: data_items_to_decoders = { 'image': tfexample_decoder.Image( @@ -83,7 +80,7 @@ def examples_queue(data_sources, data_fields_to_features: a dictionary from data fields in the data sources to features, such as tf.VarLenFeature(tf.int64), see above for examples. training: a Boolean, whether to read for training or evaluation. - capacity: integer, queue capacity; set to 2 * max_batch_size or more. + capacity: integer, buffer capacity; set to 2 * max_batch_size or more. data_items_to_decoders: a dictionary mapping data items (that will be in the returned result) to decoders that will decode them using features defined in data_fields_to_features; see above for examples. By default @@ -93,43 +90,40 @@ def examples_queue(data_sources, Returns: A dictionary mapping each data_field to a corresponding 1D int64 tensor - read from the created queue. - - Raises: - ValueError: if no files are found with the provided data_prefix or no data - fields were provided. + read from the created Dataset. """ - with tf.name_scope("examples_queue"): - # Read serialized examples using slim parallel_reader. 
- num_epochs = None if training else 1 - data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - num_readers = min(4 if training else 1, len(data_files)) - _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( - data_sources, - tf.TFRecordReader, - num_epochs=num_epochs, - shuffle=training, - capacity=2 * capacity, - min_after_dequeue=capacity, - num_readers=num_readers) - - if data_items_to_decoders is None: - data_items_to_decoders = { + + def decode_record(record): + """Serialized Example to dict of .""" + example_serialized = record + item_decoders = data_items_to_decoders + if item_decoders is None: + item_decoders = { field: tf.contrib.slim.tfexample_decoder.Tensor(field) for field in data_fields_to_features } decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( - data_fields_to_features, data_items_to_decoders) + data_fields_to_features, item_decoders) - if data_items_to_decode is None: - data_items_to_decode = list(data_items_to_decoders) + decode_items = data_items_to_decode + if decode_items is None: + decode_items = list(item_decoders) - decoded = decoder.decode(example_serialized, items=data_items_to_decode) - return { - field: tensor - for (field, tensor) in zip(data_items_to_decode, decoded) - } + decoded = decoder.decode(example_serialized, items=decode_items) + return dict(zip(decode_items, decoded)) + + with tf.name_scope("examples_in"): + data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) + random.shuffle(data_files) + dataset = tf.contrib.data.TFRecordDataset(data_files) + num_readers = min(4 if training else 1, len(data_files)) + dataset = dataset.map(decode_record, num_threads=num_readers) + if training: + dataset = dataset.shuffle(capacity) + dataset = dataset.repeat(None if training else 1) + it = dataset.make_one_shot_iterator() + return it.get_next() def preprocessing(examples, data_file_pattern, mode): @@ -193,7 +187,7 @@ def problem_input_pipeline(problem, data_file_pattern, capacity, mode): return feature_placeholders(data_fields) # Now the non-trivial case construction. - examples = examples_queue( + examples = examples_reader( [data_file_pattern], data_fields, training=(mode == tf.contrib.learn.ModeKeys.TRAIN), @@ -278,7 +272,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode): return feature_placeholders(data_fields) # Now the non-trivial case construction. - examples = examples_queue( + examples = examples_reader( [data_file_pattern], data_fields, training=(mode == tf.contrib.learn.ModeKeys.TRAIN), @@ -296,7 +290,7 @@ def batch_examples(examples, batching_scheme): """Given a queue of examples, create batches of examples with similar lengths. We assume that examples is a dictionary with string keys and tensor values, - possibly coming from a queue, e.g., constructed by examples_queue above. + possibly coming from a queue, e.g., constructed by examples_reader above. Each tensor in examples is assumed to be 1D. We will put tensors of similar length into batches togeter. We return a dictionary with the same keys as examples, and with values being batches of size batch_size. 
If elements have @@ -407,7 +401,7 @@ def constant_batching_scheme(constant_batch_size_in_sequences): } -def get_datasets(problems, data_dir, mode): +def get_data_filepatterns(problems, data_dir, mode): """Return the location of a dataset for a given mode.""" datasets = [] for problem in problems.split("-"): diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index f0c318e7b..ea98da06d 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -50,13 +50,13 @@ def test_generator(): generator_utils.generate_files(test_generator(), filenames) self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001")) - examples_train = data_reader.examples_queue( + examples_train = data_reader.examples_reader( [tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, training=True) - examples_eval = data_reader.examples_queue( + examples_eval = data_reader.examples_reader( [tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64), @@ -103,12 +103,12 @@ def test_generator(): generator_utils.generate_files(test_generator(), filenames) self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001")) - examples_train = data_reader.examples_queue([tmp_file_path + "*"], { + examples_train = data_reader.examples_reader([tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, True) batch_train = data_reader.batch_examples(examples_train, 4) - examples_eval = data_reader.examples_queue([tmp_file_path + "*"], { + examples_eval = data_reader.examples_reader([tmp_file_path + "*"], { "inputs": tf.VarLenFeature(tf.int64), "targets": tf.VarLenFeature(tf.int64) }, False) diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index bf105c5ae..260ec6a00 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -197,14 +197,14 @@ def create_experiment_components(hparams, output_dir, data_dir, model_name): train_input_fn = get_input_fn( mode=tf.contrib.learn.ModeKeys.TRAIN, hparams=hparams, - data_file_patterns=get_datasets_for_mode(data_dir, + data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.TRAIN), num_datashards=num_datashards) eval_input_fn = get_input_fn( mode=tf.contrib.learn.ModeKeys.EVAL, hparams=hparams, - data_file_patterns=get_datasets_for_mode(data_dir, + data_file_patterns=get_data_filepatterns(data_dir, tf.contrib.learn.ModeKeys.EVAL), num_datashards=num_datashards) estimator = tf.contrib.learn.Estimator( @@ -626,7 +626,7 @@ def decode_from_dataset(estimator): inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None) targets_vocab = hparams.problems[i].vocabulary["targets"] tf.logging.info("Performing local inference.") - infer_problems_data = get_datasets_for_mode(hparams.data_dir, + infer_problems_data = get_data_filepatterns(hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) infer_input_fn = get_input_fn( @@ -801,8 +801,8 @@ def _decode_batch_input_fn(problem_id, num_decode_batches, sorted_inputs, } -def get_datasets_for_mode(data_dir, mode): - return data_reader.get_datasets(FLAGS.problems, data_dir, mode) +def get_data_filepatterns(data_dir, mode): + return data_reader.get_data_filepatterns(FLAGS.problems, data_dir, mode) def _cond_on_index(fn, index_tensor, cur_idx, max_idx): @@ -1075,42 +1075,39 @@ def input_fn(): ValueError: if one of the parameters has an 
unsupported value. """ problem_count, batches = len(data_file_patterns), [] - with tf.name_scope("input_queues"): + with tf.name_scope("input_reader"): for n in xrange(problem_count): if fixed_problem is not None and n != fixed_problem: continue problem_instance = hparams.problem_instances[n] + p_hparams = hparams.problems[n] with tf.name_scope("problem_%d" % n): - with tf.device("/cpu:0"): # Input queues are on CPU. - capacity = hparams.problems[n].max_expected_batch_size_per_shard + with tf.device("/cpu:0"): # Input reading on CPU + capacity = p_hparams.max_expected_batch_size_per_shard capacity *= num_datashards examples = data_reader.input_pipeline( problem_instance, data_file_patterns[n], capacity, mode) - if mode == tf.contrib.learn.ModeKeys.TRAIN: - drop_long_sequences = True - else: - drop_long_sequences = hparams.eval_drop_long_sequences - batch_size_multiplier = hparams.problems[n].batch_size_multiplier feature_map = data_reader.batch_examples( examples, data_reader.hparams_to_batching_scheme( hparams, shard_multiplier=num_datashards, - drop_long_sequences=drop_long_sequences, - length_multiplier=batch_size_multiplier)) + drop_long_sequences=(mode == tf.contrib.learn.ModeKeys.TRAIN + or hparams.eval_drop_long_sequences), + length_multiplier=(p_hparams.batch_size_multiplier))) # Reverse inputs and targets features if the problem was reversed. if problem_instance is not None: problem_instance.maybe_reverse_features(feature_map) problem_instance.maybe_copy_features(feature_map) else: - if hparams.problems[n].was_reversed: + if p_hparams.was_reversed: inputs = feature_map["inputs"] targets = feature_map["targets"] feature_map["inputs"] = targets feature_map["targets"] = inputs # Use the inputs as the targets if the problem is a copy problem. - if hparams.problems[n].was_copy: + if p_hparams.was_copy: feature_map["targets"] = feature_map["inputs"] # Ensure inputs and targets are proper rank. @@ -1122,8 +1119,8 @@ def input_fn(): batches.append( (feature_map["inputs"], feature_map["targets"], tf.constant(n), - tf.constant(hparams.problems[n].input_space_id), - tf.constant(hparams.problems[n].target_space_id))) + tf.constant(p_hparams.input_space_id), + tf.constant(p_hparams.target_space_id))) # We choose which problem to process. loss_moving_avgs = [] # Need loss moving averages for that. From daea72a7d81b8d1559bfdc5202a7cb237ac17d0e Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Thu, 27 Jul 2017 18:18:05 -0700 Subject: [PATCH 4/6] Make a generic Text2TextProblem class, use in WMT, move PTB. 
PiperOrigin-RevId: 163417898 --- tensor2tensor/bin/t2t-datagen | 7 -- tensor2tensor/data_generators/problem.py | 114 ++++++++++++++++-- .../data_generators/problem_hparams.py | 16 --- tensor2tensor/data_generators/ptb.py | 103 ++++++++++------ tensor2tensor/data_generators/wmt.py | 79 +----------- tensor2tensor/utils/data_reader.py | 3 +- 6 files changed, 177 insertions(+), 145 deletions(-) diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen index e4acb6731..1f876c981 100644 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -43,7 +43,6 @@ from tensor2tensor.data_generators import audio from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import image from tensor2tensor.data_generators import lm1b -from tensor2tensor.data_generators import ptb from tensor2tensor.data_generators import snli from tensor2tensor.data_generators import wiki from tensor2tensor.data_generators import wmt @@ -176,12 +175,6 @@ _SUPPORTED_PROBLEM_GENERATORS = { lambda: audio.timit_generator( FLAGS.data_dir, FLAGS.tmp_dir, False, 626, vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)), - "lmptb_10k": ( - lambda: ptb.train_generator( - FLAGS.tmp_dir, - FLAGS.data_dir, - False), - ptb.valid_generator), } # pylint: enable=g-long-lambda diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 67e3c6f90..9623791f5 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -18,11 +18,14 @@ from __future__ import division from __future__ import print_function +import os + # Dependency imports -from tensor2tensor.data_generators import generator_utils as utils +from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry import tensorflow as tf @@ -176,20 +179,23 @@ def eval_metrics(self): def training_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.train_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.train_data_filenames( + file_basename, data_dir, num_shards) def dev_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.dev_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.dev_data_filenames( + file_basename, data_dir, num_shards) def test_filepaths(self, data_dir, num_shards, shuffled): file_basename = self.dataset_filename() if not shuffled: - file_basename += utils.UNSHUFFLED_SUFFIX - return utils.test_data_filenames(file_basename, data_dir, num_shards) + file_basename += generator_utils.UNSHUFFLED_SUFFIX + return generator_utils.test_data_filenames( + file_basename, data_dir, num_shards) def __init__(self, was_reversed=False, was_copy=False): """Create a Problem. @@ -323,3 +329,97 @@ def _default_hparams(): # class. 
input_space_id=SpaceID.GENERIC, target_space_id=SpaceID.GENERIC) + + +class Text2TextProblem(Problem): + """Base class for text-to-text problems.""" + + @property + def is_character_level(self): + raise NotImplementedError() + + @property + def targeted_vocab_size(self): + raise NotImplementedError() # Not needed if self.is_character_level. + + def train_generator(self, data_dir, tmp_dir, is_training): + """Generator of the training data.""" + raise NotImplementedError() + + def dev_generator(self, data_dir, tmp_dir): + """Generator of the development data.""" + return self.train_generator(data_dir, tmp_dir, False) + + @property + def input_space_id(self): + raise NotImplementedError() + + @property + def target_space_id(self): + raise NotImplementedError() + + @property + def num_shards(self): + raise NotImplementedError() + + @property + def vocab_name(self): + raise NotImplementedError() + + @property + def vocab_file(self): + return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) + + @property + def use_subword_tokenizer(self): + raise NotImplementedError() + + @property + def has_inputs(self): + return True # Set to False for language models. + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + generator_utils.generate_dataset_and_shuffle( + self.train_generator(data_dir, tmp_dir, True), + self.training_filepaths(data_dir, self.num_shards, shuffled=False), + self.dev_generator(data_dir, tmp_dir), + self.dev_filepaths(data_dir, 1, shuffled=False)) + + def feature_encoders(self, data_dir): + vocab_filename = os.path.join(data_dir, self.vocab_file) + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder(), + elif self.use_subword_tokenizer: + encoder = text_encoder.SubwordTextEncoder(vocab_filename) + else: + encoder = text_encoder.TokenTextEncoder(vocab_filename) + if self.has_inputs: + return {"inputs": encoder, "targets": encoder} + return {"targets": encoder} + + def hparams(self, defaults, unused_model_hparams): + p = defaults + if self.is_character_level: + source_vocab_size = 256 + target_vocab_size = 256 + else: + target_vocab_size = self._encoders["targets"].vocab_size + if self.has_inputs: + source_vocab_size = self._encoders["inputs"].vocab_size + + if self.has_inputs: + p.input_modality = {"inputs": (registry.Modalities.SYMBOL, + source_vocab_size)} + p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) + if self.has_inputs: + p.input_space_id = self.input_space_id + p.target_space_id = self.target_space_id + if self.is_character_level: + p.loss_multiplier = 2.0 + + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU + ] diff --git a/tensor2tensor/data_generators/problem_hparams.py b/tensor2tensor/data_generators/problem_hparams.py index 2792c79e9..3c829eeac 100644 --- a/tensor2tensor/data_generators/problem_hparams.py +++ b/tensor2tensor/data_generators/problem_hparams.py @@ -368,21 +368,6 @@ def wiki_32k(model_hparams): return p -def lmptb_10k(model_hparams): - """Penn Tree Bank language-modeling benchmark, 10k token vocabulary.""" - p = default_problem_hparams() - p.input_modality = {} - p.target_modality = (registry.Modalities.SYMBOL, 10000) - vocabulary = text_encoder.TokenTextEncoder( - os.path.join(model_hparams.data_dir, "lmptb_10k.vocab")) - p.vocabulary = { - "targets": vocabulary, - } - p.input_space_id = 3 - p.target_space_id = 3 - return p - - def wmt_ende_bpe32k(model_hparams): """English to 
German translation benchmark.""" p = default_problem_hparams() @@ -642,7 +627,6 @@ def image_celeba(unused_model_hparams): "lm1b_characters": lm1b_characters, "lm1b_32k": lm1b_32k, "wiki_32k": wiki_32k, - "lmptb_10k": lmptb_10k, "ice_parsing_characters": wmt_parsing_characters, "ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13), "wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13), diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index f71f0d902..18aedd640 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -27,7 +27,9 @@ # Dependency imports from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import registry import tensorflow as tf @@ -48,7 +50,7 @@ def _read_words(filename): def _build_vocab(filename, vocab_path, vocab_size): """Reads a file to build a vocabulary of `vocab_size` most common words. - The vocabulary is sorted by occurence count and has one word per line. + The vocabulary is sorted by occurrence count and has one word per line. Originally from: https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py @@ -66,26 +68,47 @@ def _build_vocab(filename, vocab_path, vocab_size): f.write("\n".join(words)) -def _get_token_encoder(vocab_dir, filename): +def _get_token_encoder(vocab_dir, vocab_name, filename): """Reads from file and returns a `TokenTextEncoder` for the vocabulary.""" - vocab_name = "lmptb_10k.vocab" vocab_path = os.path.join(vocab_dir, vocab_name) - _build_vocab(filename, vocab_path, 10000) + if not tf.gfile.Exists(vocab_path): + _build_vocab(filename, vocab_path, 10000) return text_encoder.TokenTextEncoder(vocab_path) -class PTB(object): +class PTBProblem(problem.Text2TextProblem): """A class for generating PTB data.""" - def __init__(self, tmp_dir, data_dir, char=False): - assert not char, "char mode for PTB is not yet implemented" - self.char = char - self.data_dir = data_dir + @property + def has_inputs(self): + return False - url = PTB_URL - filename = os.path.basename(url) + @property + def target_space_id(self): + if self.is_character_level: + return problem.SpaceID.EN_CHR + return problem.SpaceID.EN_TOK + + @property + def num_shards(self): + return 10 + + @property + def vocab_name(self): + return "vocab.lmptb_10k" + + @property + def use_subword_tokenizer(self): + return False + + @property + def targeted_vocab_size(self): + return 10000 + + def train_generator(self, data_dir, tmp_dir, train): + filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( - tmp_dir, filename, url) + tmp_dir, filename, PTB_URL) ptb_files = [] ptb_char_files = [] with tarfile.open(compressed_filepath, "r:gz") as tgz: @@ -101,50 +124,52 @@ def __init__(self, tmp_dir, data_dir, char=False): tgz.extractall(tmp_dir, members=files) - if self.char: + if self.is_character_level: files = ptb_char_files else: files = ptb_files - files = files + train_file, valid_file = None, None for filename in files: if "train" in filename: - self.train = os.path.join(tmp_dir, filename) + train_file = os.path.join(tmp_dir, filename) elif "valid" in filename: - self.valid = os.path.join(tmp_dir, filename) + valid_file = os.path.join(tmp_dir, filename) - assert hasattr(self, "train"), "Training file not found" - assert hasattr(self, "valid"), "Validation file not found" - self.encoder = _get_token_encoder(data_dir, 
self.train) + assert train_file, "Training file not found" + assert valid_file, "Validation file not found" - def train_generator(self): - return self._generator(self.train) + if self.is_character_level: + encoder = text_encoder.ByteTextEncoder() + else: + encoder = _get_token_encoder(data_dir, self.vocab_file, train_file) - def valid_generator(self): - return self._generator(self.valid) + if train: + return self._generator(train_file, encoder) + return self._generator(valid_file, encoder) - def _generator(self, filename): + def _generator(self, filename, encoder): with tf.gfile.GFile(filename, "r") as f: for line in f: line = " ".join(line.replace("\n", EOS).split()) - tok = self.encoder.encode(line) - yield {"inputs": tok[:-1], "targets": tok[1:]} + tok = encoder.encode(line) + if tok: + yield {"inputs": [0], "targets": tok} -# Using a object "singleton" -# `train_generator` must be called before -# `valid_generator` in order to work -_ptb = {} +@registry.register_problem("lm_ptb_10k") +class LmPtb10k(PTBProblem): + """A class for generating PTB data, 10k vocab.""" + @property + def is_character_level(self): + return False -def train_generator(*args, **kwargs): - """The train data generator to be called.""" - global _ptb - _ptb = PTB(*args, **kwargs) - return _ptb.train_generator() +@registry.register_problem +class LmPtbCharacters(PTBProblem): + """A class for generating PTB data, character-level.""" -def valid_generator(): - """Validation (aka. dev) data generator.""" - global _ptb # pylint:disable=global-variable-not-assigned - return _ptb.valid_generator() + @property + def is_character_level(self): + return True diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py index 7fde9b3b4..456f36321 100644 --- a/tensor2tensor/data_generators/wmt.py +++ b/tensor2tensor/data_generators/wmt.py @@ -28,7 +28,6 @@ from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import wsj_parsing -from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf @@ -44,33 +43,13 @@ EOS = text_encoder.EOS_ID -class WMTProblem(problem.Problem): +class WMTProblem(problem.Text2TextProblem): """Base class for WMT problems.""" @property def is_character_level(self): return False - @property - def targeted_vocab_size(self): - raise NotImplementedError() # Not needed if self.is_character_level. 
- - def train_generator(self, data_dir, tmp_dir, is_training): - """Generator of the training data.""" - raise NotImplementedError() - - def dev_generator(self, data_dir, tmp_dir): - """Generator of the development data.""" - return self.train_generator(data_dir, tmp_dir, False) - - @property - def input_space_id(self): - raise NotImplementedError() - - @property - def target_space_id(self): - raise NotImplementedError() - @property def num_shards(self): return 100 @@ -80,51 +59,8 @@ def vocab_name(self): return "vocab.endefr" @property - def vocab_file(self): - return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - - def generate_data(self, data_dir, tmp_dir, task_id=-1): - generator_utils.generate_dataset_and_shuffle( - self.train_generator(data_dir, tmp_dir, True), - self.training_filepaths(data_dir, self.num_shards, shuffled=False), - self.dev_generator(data_dir, tmp_dir), - self.dev_filepaths(data_dir, 1, shuffled=False)) - - def feature_encoders(self, data_dir): - if self.is_character_level: - return { - "inputs": text_encoder.ByteTextEncoder(), - "targets": text_encoder.ByteTextEncoder(), - } - vocab_filename = os.path.join(data_dir, self.vocab_file) - subtokenizer = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": subtokenizer, - "targets": subtokenizer, - } - - def hparams(self, defaults, unused_model_hparams): - p = defaults - if self.is_character_level: - source_vocab_size = 256 - target_vocab_size = 256 - else: - source_vocab_size = self._encoders["inputs"].vocab_size - target_vocab_size = self._encoders["targets"].vocab_size - p.input_modality = {"inputs": (registry.Modalities.SYMBOL, - source_vocab_size)} - p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) - p.input_space_id = self.input_space_id - p.target_space_id = self.target_space_id - if self.is_character_level: - p.loss_multiplier = 2.0 - - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, - metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU - ] + def use_subword_tokenizer(self): + return True # Generic generators used later for multiple problems. 
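
As a usage note for the Text2TextProblem and WMTProblem base classes introduced in this patch, the sketch below shows roughly what a new subclass has to declare. It is illustrative only and not part of the patch: the problem name, vocabulary size, and placeholder generator are hypothetical, and the space IDs simply reuse the English/Czech ones that appear elsewhere in this file.

    from tensor2tensor.data_generators import problem
    from tensor2tensor.data_generators import wmt
    from tensor2tensor.utils import registry


    @registry.register_problem("my_translate_tokens_8k")  # hypothetical name
    class MyTranslateTokens8k(wmt.WMTProblem):
      """Illustrative sketch of a Text2TextProblem subclass (not in the patch)."""

      @property
      def targeted_vocab_size(self):
        return 2**13  # hypothetical 8k subword vocabulary

      @property
      def input_space_id(self):
        return problem.SpaceID.EN_TOK

      @property
      def target_space_id(self):
        return problem.SpaceID.CS_TOK

      def train_generator(self, data_dir, tmp_dir, train):
        # A real problem would build or load a vocabulary here and yield
        # {"inputs": ..., "targets": ...} token-id dicts read from its corpora;
        # this placeholder emits a single fixed example.
        del data_dir, tmp_dir, train
        yield {"inputs": [2, 3, 4, 1], "targets": [5, 6, 7, 1]}

num_shards, vocab_name, is_character_level and use_subword_tokenizer come from WMTProblem, while generate_data, feature_encoders, hparams and eval_metrics come from Text2TextProblem, so a subclass only declares what is specific to its dataset.
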
@@ -634,7 +570,7 @@ def target_space_id(self): @registry.register_problem("wmt_encs_tokens_32k") -class WMTEnCsTokens32k(problem.Problem): +class WMTEnCsTokens32k(WMTProblem): """Problem spec for WMT English-Czech translation.""" @property @@ -665,13 +601,6 @@ def input_space_id(self): def target_space_id(self): return problem.SpaceID.CS_TOK - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, - metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU - ] - @registry.register_problem("wmt_encs_characters") class WMTEnCsCharacters(WMTProblem): diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index e78e22344..454e4f321 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -115,7 +115,8 @@ def decode_record(record): with tf.name_scope("examples_in"): data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - random.shuffle(data_files) + if training: + random.shuffle(data_files) dataset = tf.contrib.data.TFRecordDataset(data_files) num_readers = min(4 if training else 1, len(data_files)) dataset = dataset.map(decode_record, num_threads=num_readers) From 01f245fdecdf9fbdfae8a610cf9246e222c0891a Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 18:21:21 -0700 Subject: [PATCH 5/6] v1.1.3 PiperOrigin-RevId: 163418167 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6be9aba04..ae028d847 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.2', + version='1.1.3', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From 7c072d7b77ada142bd577d01919a9be32900dd0c Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Thu, 27 Jul 2017 19:04:00 -0700 Subject: [PATCH 6/6] Revert usage of Datasets API PiperOrigin-RevId: 163421122 --- tensor2tensor/utils/data_reader.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 454e4f321..ba5139433 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -20,7 +20,6 @@ import math import os -import random # Dependency imports @@ -114,17 +113,18 @@ def decode_record(record): return dict(zip(decode_items, decoded)) with tf.name_scope("examples_in"): + # Read serialized examples using slim parallel_reader. data_files = tf.contrib.slim.parallel_reader.get_data_files(data_sources) - if training: - random.shuffle(data_files) - dataset = tf.contrib.data.TFRecordDataset(data_files) num_readers = min(4 if training else 1, len(data_files)) - dataset = dataset.map(decode_record, num_threads=num_readers) - if training: - dataset = dataset.shuffle(capacity) - dataset = dataset.repeat(None if training else 1) - it = dataset.make_one_shot_iterator() - return it.get_next() + _, example_serialized = tf.contrib.slim.parallel_reader.parallel_read( + data_sources, + tf.TFRecordReader, + num_epochs=None if training else 1, + shuffle=training, + capacity=2 * capacity, + min_after_dequeue=capacity, + num_readers=num_readers) + return decode_record(example_serialized) def preprocessing(examples, data_file_pattern, mode):
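
PATCH 3/6 and PATCH 6/6 switch the same examples_reader interface between two implementations: a tf.contrib.data (Datasets API) pipeline and the original slim parallel_reader pipeline that this final commit restores. For reference, here is a minimal self-contained sketch in the style of the Datasets-API variant; the file pattern, feature spec, and function name are hypothetical, and the calls follow the TF 1.2-era tf.contrib.data API used in PATCH 3/6.

    import tensorflow as tf


    def tiny_examples_reader(file_pattern, training, capacity=32):
      """Sketch of a Datasets-API reader; the feature names are made up."""
      data_files = tf.gfile.Glob(file_pattern)

      def decode_record(record):
        # Parse a serialized tf.Example into a dict of (sparse) tensors.
        features = {
            "inputs": tf.VarLenFeature(tf.int64),
            "targets": tf.VarLenFeature(tf.int64),
        }
        return tf.parse_single_example(record, features)

      num_readers = min(4 if training else 1, max(1, len(data_files)))
      dataset = tf.contrib.data.TFRecordDataset(data_files)
      dataset = dataset.map(decode_record, num_threads=num_readers)
      if training:
        dataset = dataset.shuffle(capacity)
      dataset = dataset.repeat(None if training else 1)
      return dataset.make_one_shot_iterator().get_next()

The revert keeps the decode_record helper and feeds it the serialized example coming out of parallel_read, so callers of examples_reader receive the same dictionary of decoded tensors with either backend.
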