This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Merge pull request #188 from rsepassi/push
v1.1.3
lukaszkaiser authored Jul 28, 2017
2 parents a55c4cf + 7c072d7 commit cd222d3
Showing 17 changed files with 323 additions and 322 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name='tensor2tensor',
- version='1.1.2',
+ version='1.1.3',
description='Tensor2Tensor',
author='Google Inc.',
author_email='no-reply@google.com',
7 changes: 0 additions & 7 deletions tensor2tensor/bin/t2t-datagen
@@ -43,7 +43,6 @@ from tensor2tensor.data_generators import audio
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import image
from tensor2tensor.data_generators import lm1b
- from tensor2tensor.data_generators import ptb
from tensor2tensor.data_generators import snli
from tensor2tensor.data_generators import wiki
from tensor2tensor.data_generators import wmt
@@ -176,12 +175,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
lambda: audio.timit_generator(
FLAGS.data_dir, FLAGS.tmp_dir, False, 626,
vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)),
"lmptb_10k": (
lambda: ptb.train_generator(
FLAGS.tmp_dir,
FLAGS.data_dir,
False),
ptb.valid_generator),
}

# pylint: enable=g-long-lambda
16 changes: 8 additions & 8 deletions tensor2tensor/data_generators/gene_expression.py
@@ -110,10 +110,10 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
# Collect created shard processes to start and join
processes = []

- datasets = [(self.training_filepaths, self.num_shards, "train",
- num_train_examples), (self.dev_filepaths, 1, "valid",
- num_dev_examples),
- (self.test_filepaths, 1, "test", num_test_examples)]
+ datasets = [
+ (self.training_filepaths, self.num_shards, "train", num_train_examples),
+ (self.dev_filepaths, 10, "valid", num_dev_examples),
+ (self.test_filepaths, 10, "test", num_test_examples)]
for fname_fn, nshards, key_prefix, num_examples in datasets:
outfiles = fname_fn(data_dir, nshards, shuffled=False)
all_filepaths.extend(outfiles)
@@ -125,8 +125,8 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
start_idx, end_idx))
processes.append(p)

- # 1 per training shard + dev + test
- assert len(processes) == self.num_shards + 2
+ # 1 per training shard + 10 for dev + 10 for test
+ assert len(processes) == self.num_shards + 20

# Start and wait for processes in batches
num_batches = int(
@@ -168,8 +168,8 @@ def preprocess_examples(self, examples, mode):

# Reshape targets
examples["targets"] = tf.reshape(examples["targets"],
- [-1, 1, self.num_output_predictions])
- examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1, 1])
+ [-1, self.num_output_predictions])
+ examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1])

# Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them.
# Add epsilon because some unmasked labels are actually 0.
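
As an aside, the updated dataset spec launches one writer process per output shard, so the assertion now expects num_shards + 20 processes instead of num_shards + 2. A minimal sketch of the arithmetic, assuming a hypothetical training shard count of 100 (illustration only, not part of the diff):

# Illustration only: one process per output shard, with a hypothetical count.
num_train_shards = 100  # stands in for self.num_shards
datasets = [("train", num_train_shards), ("valid", 10), ("test", 10)]
total_processes = sum(nshards for _, nshards in datasets)
assert total_processes == num_train_shards + 20  # matches the updated assertion
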
2 changes: 1 addition & 1 deletion tensor2tensor/data_generators/generator_utils.py
@@ -305,7 +305,7 @@ def generate():

# Use Tokenizer to count the word occurrences.
with tf.gfile.GFile(filepath, mode="r") as source_file:
file_byte_budget = 3.5e5 if "en" in filepath else 7e5
file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
for line in source_file:
if file_byte_budget <= 0:
break
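
As an aside (illustration only, not part of this diff), the switch from a substring test to a suffix test matters because "en" appears inside many non-English filenames. A hypothetical sketch with made-up filenames:

# Hypothetical filenames; the old substring check hands the smaller English
# byte budget to any path that merely contains "en".
def old_budget(filepath):
  return 3.5e5 if "en" in filepath else 7e5

def new_budget(filepath):
  return 3.5e5 if filepath.endswith("en") else 7e5

print(old_budget("giga-fren.release2.fr"))  # 350000.0 -- "fren" contains "en"
print(new_budget("giga-fren.release2.fr"))  # 700000.0
print(new_budget("giga-fren.release2.en"))  # 350000.0
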
114 changes: 107 additions & 7 deletions tensor2tensor/data_generators/problem.py
@@ -18,11 +18,14 @@
from __future__ import division
from __future__ import print_function

+ import os

# Dependency imports

- from tensor2tensor.data_generators import generator_utils as utils
+ from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.utils import metrics
from tensor2tensor.utils import registry

import tensorflow as tf

@@ -176,20 +179,23 @@ def eval_metrics(self):
def training_filepaths(self, data_dir, num_shards, shuffled):
file_basename = self.dataset_filename()
if not shuffled:
- file_basename += utils.UNSHUFFLED_SUFFIX
- return utils.train_data_filenames(file_basename, data_dir, num_shards)
+ file_basename += generator_utils.UNSHUFFLED_SUFFIX
+ return generator_utils.train_data_filenames(
+ file_basename, data_dir, num_shards)

def dev_filepaths(self, data_dir, num_shards, shuffled):
file_basename = self.dataset_filename()
if not shuffled:
- file_basename += utils.UNSHUFFLED_SUFFIX
- return utils.dev_data_filenames(file_basename, data_dir, num_shards)
+ file_basename += generator_utils.UNSHUFFLED_SUFFIX
+ return generator_utils.dev_data_filenames(
+ file_basename, data_dir, num_shards)

def test_filepaths(self, data_dir, num_shards, shuffled):
file_basename = self.dataset_filename()
if not shuffled:
- file_basename += utils.UNSHUFFLED_SUFFIX
- return utils.test_data_filenames(file_basename, data_dir, num_shards)
+ file_basename += generator_utils.UNSHUFFLED_SUFFIX
+ return generator_utils.test_data_filenames(
+ file_basename, data_dir, num_shards)

def __init__(self, was_reversed=False, was_copy=False):
"""Create a Problem.
@@ -323,3 +329,97 @@ def _default_hparams():
# class.
input_space_id=SpaceID.GENERIC,
target_space_id=SpaceID.GENERIC)


class Text2TextProblem(Problem):
"""Base class for text-to-text problems."""

@property
def is_character_level(self):
raise NotImplementedError()

@property
def targeted_vocab_size(self):
raise NotImplementedError() # Not needed if self.is_character_level.

def train_generator(self, data_dir, tmp_dir, is_training):
"""Generator of the training data."""
raise NotImplementedError()

def dev_generator(self, data_dir, tmp_dir):
"""Generator of the development data."""
return self.train_generator(data_dir, tmp_dir, False)

@property
def input_space_id(self):
raise NotImplementedError()

@property
def target_space_id(self):
raise NotImplementedError()

@property
def num_shards(self):
raise NotImplementedError()

@property
def vocab_name(self):
raise NotImplementedError()

@property
def vocab_file(self):
return "%s.%d" % (self.vocab_name, self.targeted_vocab_size)

@property
def use_subword_tokenizer(self):
raise NotImplementedError()

@property
def has_inputs(self):
return True # Set to False for language models.

def generate_data(self, data_dir, tmp_dir, task_id=-1):
generator_utils.generate_dataset_and_shuffle(
self.train_generator(data_dir, tmp_dir, True),
self.training_filepaths(data_dir, self.num_shards, shuffled=False),
self.dev_generator(data_dir, tmp_dir),
self.dev_filepaths(data_dir, 1, shuffled=False))

def feature_encoders(self, data_dir):
vocab_filename = os.path.join(data_dir, self.vocab_file)
if self.is_character_level:
encoder = text_encoder.ByteTextEncoder()
elif self.use_subword_tokenizer:
encoder = text_encoder.SubwordTextEncoder(vocab_filename)
else:
encoder = text_encoder.TokenTextEncoder(vocab_filename)
if self.has_inputs:
return {"inputs": encoder, "targets": encoder}
return {"targets": encoder}

def hparams(self, defaults, unused_model_hparams):
p = defaults
if self.is_character_level:
source_vocab_size = 256
target_vocab_size = 256
else:
target_vocab_size = self._encoders["targets"].vocab_size
if self.has_inputs:
source_vocab_size = self._encoders["inputs"].vocab_size

if self.has_inputs:
p.input_modality = {"inputs": (registry.Modalities.SYMBOL,
source_vocab_size)}
p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size)
if self.has_inputs:
p.input_space_id = self.input_space_id
p.target_space_id = self.target_space_id
if self.is_character_level:
p.loss_multiplier = 2.0

def eval_metrics(self):
return [
metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5,
metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY,
metrics.Metrics.APPROX_BLEU
]
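
For reference, a minimal sketch of how a dataset could subclass the new Text2TextProblem base class (illustration only, not part of this commit; the class name, vocab name, shard count, and toy generator below are hypothetical, and a real problem would also be registered with tensor2tensor's registry and build its data with generator_utils):

from tensor2tensor.data_generators import problem


class ToyText2Text(problem.Text2TextProblem):
  """Hypothetical word-level text-to-text problem."""

  @property
  def is_character_level(self):
    return False

  @property
  def targeted_vocab_size(self):
    return 2**13  # hypothetical 8k-token vocabulary

  @property
  def vocab_name(self):
    return "vocab.toy"  # vocab_file then resolves to "vocab.toy.8192"

  @property
  def use_subword_tokenizer(self):
    return False  # feature_encoders will build a TokenTextEncoder

  @property
  def num_shards(self):
    return 10  # hypothetical number of training shards

  @property
  def input_space_id(self):
    return problem.SpaceID.GENERIC

  @property
  def target_space_id(self):
    return problem.SpaceID.GENERIC

  def train_generator(self, data_dir, tmp_dir, is_training):
    # Toy generator yielding dicts of token-id lists, the format consumed by
    # generator_utils.generate_dataset_and_shuffle in generate_data above.
    for i in range(100):
      yield {"inputs": [i % 7 + 2, 3, 1], "targets": [i % 5 + 2, 4, 1]}
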
16 changes: 0 additions & 16 deletions tensor2tensor/data_generators/problem_hparams.py
@@ -368,21 +368,6 @@ def wiki_32k(model_hparams):
return p


- def lmptb_10k(model_hparams):
- """Penn Tree Bank language-modeling benchmark, 10k token vocabulary."""
- p = default_problem_hparams()
- p.input_modality = {}
- p.target_modality = (registry.Modalities.SYMBOL, 10000)
- vocabulary = text_encoder.TokenTextEncoder(
- os.path.join(model_hparams.data_dir, "lmptb_10k.vocab"))
- p.vocabulary = {
- "targets": vocabulary,
- }
- p.input_space_id = 3
- p.target_space_id = 3
- return p


def wmt_ende_bpe32k(model_hparams):
"""English to German translation benchmark."""
p = default_problem_hparams()
@@ -642,7 +627,6 @@ def image_celeba(unused_model_hparams):
"lm1b_characters": lm1b_characters,
"lm1b_32k": lm1b_32k,
"wiki_32k": wiki_32k,
"lmptb_10k": lmptb_10k,
"ice_parsing_characters": wmt_parsing_characters,
"ice_parsing_tokens": lambda p: ice_parsing_tokens(p, 2**13),
"wmt_parsing_tokens_8k": lambda p: wmt_parsing_tokens(p, 2**13),

