diff --git a/CHANGELOG.md b/CHANGELOG.md
index e59c087bf..160aeffbd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 For each item we will potentially have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [1.10.5]
+### Fixed
+ - Fixed another bug in the data iterator: `reset()` did not clear the label arrays (`nd_label`).
+
 ## [1.10.4]
 ### Fixed
  - Fixed a bug with the revised data iterator not correctly appending EOS symbols for variable-length batches.
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index 7d3299b94..d3e863112 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '1.10.4'
+__version__ = '1.10.5'
diff --git a/sockeye/data_io.py b/sockeye/data_io.py
index 332bbcfea..12479ce57 100644
--- a/sockeye/data_io.py
+++ b/sockeye/data_io.py
@@ -676,6 +676,7 @@ def reset(self):
 
         self.nd_source = []
         self.nd_target = []
+        self.nd_label = []
         self.indices = []
         for i in range(len(self.data_source)):
             # shuffle indices within each bucket
diff --git a/test/common.py b/test/common.py
index 78949aee2..f41ff9f0e 100644
--- a/test/common.py
+++ b/test/common.py
@@ -14,6 +14,7 @@
 import os
 import random
 import sys
+from contextlib import contextmanager
 from tempfile import TemporaryDirectory
 from typing import Optional, Tuple
 from unittest.mock import patch
@@ -101,6 +102,30 @@ def generate_digits_file(source_path: str,
             print(" ".join(digits), file=target_out)
 
 
+@contextmanager
+def tmp_digits_dataset(prefix: str,
+                       train_line_count: int, train_max_length: int,
+                       dev_line_count: int, dev_max_length: int,
+                       sort_target: bool = False,
+                       seed_train: int = 13, seed_dev: int = 13):
+    with TemporaryDirectory(prefix=prefix) as work_dir:
+        # Simple digits files for train/dev data
+        train_source_path = os.path.join(work_dir, "train.src")
+        train_target_path = os.path.join(work_dir, "train.tgt")
+        dev_source_path = os.path.join(work_dir, "dev.src")
+        dev_target_path = os.path.join(work_dir, "dev.tgt")
+        generate_digits_file(train_source_path, train_target_path, train_line_count, train_max_length,
+                             sort_target=sort_target, seed=seed_train)
+        generate_digits_file(dev_source_path, dev_target_path, dev_line_count, dev_max_length, sort_target=sort_target,
+                             seed=seed_dev)
+        data = {'work_dir': work_dir,
+                'source': train_source_path,
+                'target': train_target_path,
+                'validation_source': dev_source_path,
+                'validation_target': dev_target_path}
+        yield data
+
+
 _TRAIN_PARAMS_COMMON = "--use-cpu --max-seq-len {max_len} --source {train_source} --target {train_target}" \
                        " --validation-source {dev_source} --validation-target {dev_target} --output {model}"
 
diff --git a/test/integration/test_seq_copy_int.py b/test/integration/test_seq_copy_int.py
index 240d674cd..14c6cae91 100644
--- a/test/integration/test_seq_copy_int.py
+++ b/test/integration/test_seq_copy_int.py
@@ -11,12 +11,9 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-import os
-from tempfile import TemporaryDirectory
-
 import pytest
 
-from test.common import generate_digits_file, run_train_translate
+from test.common import run_train_translate, tmp_digits_dataset
 
 _TRAIN_LINE_COUNT = 100
 _DEV_LINE_COUNT = 10
@@ -81,23 +78,17 @@
 @pytest.mark.parametrize("train_params, translate_params", ENCODER_DECODER_SETTINGS)
 def test_seq_copy(train_params, translate_params):
     """Task: copy short sequences of digits"""
-    with TemporaryDirectory(prefix="test_seq_copy") as work_dir:
-        # Simple digits files for train/dev data
-        train_source_path = os.path.join(work_dir, "train.src")
-        train_target_path = os.path.join(work_dir, "train.tgt")
-        dev_source_path = os.path.join(work_dir, "dev.src")
-        dev_target_path = os.path.join(work_dir, "dev.tgt")
-        generate_digits_file(train_source_path, train_target_path, _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH)
-        generate_digits_file(dev_source_path, dev_target_path, _DEV_LINE_COUNT, _LINE_MAX_LENGTH)
+    with tmp_digits_dataset("test_seq_copy", _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH, _DEV_LINE_COUNT,
+                            _LINE_MAX_LENGTH) as data:
         # Test model configuration, including the output equivalence of batch and no-batch decoding
         translate_params_batch = translate_params + " --batch-size 2"
 
         # Ignore return values (perplexity and BLEU) for integration test
         run_train_translate(train_params, translate_params,
                             translate_params_batch,
-                            train_source_path,
-                            train_target_path,
-                            dev_source_path,
-                            dev_target_path,
+                            data['source'],
+                            data['target'],
+                            data['validation_source'],
+                            data['validation_target'],
                             max_seq_len=_LINE_MAX_LENGTH + 1,
-                            work_dir=work_dir)
+                            work_dir=data['work_dir'])
diff --git a/test/system/test_seq_copy_sys.py b/test/system/test_seq_copy_sys.py
index dc0329e83..9ce5d199b 100644
--- a/test/system/test_seq_copy_sys.py
+++ b/test/system/test_seq_copy_sys.py
@@ -11,12 +11,9 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-import os
-from tempfile import TemporaryDirectory
-
 import pytest
 
-from test.common import generate_digits_file, run_train_translate
+from test.common import tmp_digits_dataset, run_train_translate
 
 _TRAIN_LINE_COUNT = 10000
 _DEV_LINE_COUNT = 100
@@ -91,24 +88,18 @@
 ])
 def test_seq_copy(train_params, translate_params, perplexity_thresh, bleu_thresh):
     """Task: copy short sequences of digits"""
-    with TemporaryDirectory(prefix="test_seq_copy.") as work_dir:
-        # Simple digits files for train/dev data
-        train_source_path = os.path.join(work_dir, "train.src")
-        train_target_path = os.path.join(work_dir, "train.tgt")
-        dev_source_path = os.path.join(work_dir, "dev.src")
-        dev_target_path = os.path.join(work_dir, "dev.tgt")
-        generate_digits_file(train_source_path, train_target_path, _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH, seed=_SEED_TRAIN)
-        generate_digits_file(dev_source_path, dev_target_path, _DEV_LINE_COUNT, _LINE_MAX_LENGTH, seed=_SEED_DEV)
+    with tmp_digits_dataset("test_seq_copy.", _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH, _DEV_LINE_COUNT,
+                            _LINE_MAX_LENGTH, seed_train=_SEED_TRAIN, seed_dev=_SEED_DEV) as data:
         # Test model configuration
         perplexity, bleu = run_train_translate(train_params,
                                                translate_params,
                                                None,  # no second set of parameters
-                                               train_source_path,
-                                               train_target_path,
-                                               dev_source_path,
-                                               dev_target_path,
+                                               data['source'],
+                                               data['target'],
+                                               data['validation_source'],
+                                               data['validation_target'],
                                                max_seq_len=_LINE_MAX_LENGTH + 1,
-                                               work_dir=work_dir)
+                                               work_dir=data['work_dir'])
 
         assert perplexity <= perplexity_thresh
         assert bleu >= bleu_thresh
@@ -171,25 +162,17 @@ def test_seq_copy(train_params, translate_params, perplexity_thresh, bleu_thresh
 ])
 def test_seq_sort(train_params, translate_params, perplexity_thresh, bleu_thresh):
     """Task: sort short sequences of digits"""
-    with TemporaryDirectory(prefix="test_seq_sort.") as work_dir:
-        # Simple digits files for train/dev data
-        train_source_path = os.path.join(work_dir, "train.src")
-        train_target_path = os.path.join(work_dir, "train.tgt")
-        dev_source_path = os.path.join(work_dir, "dev.src")
-        dev_target_path = os.path.join(work_dir, "dev.tgt")
-        generate_digits_file(train_source_path, train_target_path, _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH,
-                             sort_target=True, seed=_SEED_TRAIN)
-        generate_digits_file(dev_source_path, dev_target_path, _DEV_LINE_COUNT, _LINE_MAX_LENGTH,
-                             sort_target=True, seed=_SEED_DEV)
+    with tmp_digits_dataset("test_seq_sort.", _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH, _DEV_LINE_COUNT,
+                            _LINE_MAX_LENGTH, sort_target=True, seed_train=_SEED_TRAIN, seed_dev=_SEED_DEV) as data:
         # Test model configuration
         perplexity, bleu = run_train_translate(train_params, translate_params,
                                                None,
-                                               train_source_path,
-                                               train_target_path,
-                                               dev_source_path,
-                                               dev_target_path,
+                                               data['source'],
+                                               data['target'],
+                                               data['validation_source'],
+                                               data['validation_target'],
                                                max_seq_len=_LINE_MAX_LENGTH + 1,
-                                               work_dir=work_dir)
+                                               work_dir=data['work_dir'])
 
         assert perplexity <= perplexity_thresh
         assert bleu >= bleu_thresh
diff --git a/test/unit/test_data_io.py b/test/unit/test_data_io.py
index b689a080a..dfa1d76c9 100644
--- a/test/unit/test_data_io.py
+++ b/test/unit/test_data_io.py
@@ -11,10 +11,13 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
+import numpy as np
 import pytest
 
-import sockeye.constants as C
-import sockeye.data_io
+from sockeye import constants as C
+from sockeye import data_io
+from sockeye import vocab
+from test.common import tmp_digits_dataset
 
 define_bucket_tests = [(50, 10, [10, 20, 30, 40, 50]),
                        (50, 20, [20, 40, 50]),
@@ -26,15 +29,20 @@
 
 @pytest.mark.parametrize("max_seq_len, step, expected_buckets", define_bucket_tests)
 def test_define_buckets(max_seq_len, step, expected_buckets):
-    buckets = sockeye.data_io.define_buckets(max_seq_len, step=step)
+    buckets = data_io.define_buckets(max_seq_len, step=step)
     assert buckets == expected_buckets
 
 
 define_parallel_bucket_tests = [(50, 50, 10, 1.0, [(10, 10), (20, 20), (30, 30), (40, 40), (50, 50)]),
-                                (50, 50, 10, 0.5, [(10, 5), (20, 10), (30, 15), (40, 20), (50, 25), (50, 30), (50, 35), (50, 40), (50, 45), (50, 50)]),
-                                (10, 10, 10, 0.1, [(10, 2), (10, 3), (10, 4), (10, 5), (10, 6), (10, 7), (10, 8), (10, 9), (10, 10)]),
+                                (50, 50, 10, 0.5,
+                                 [(10, 5), (20, 10), (30, 15), (40, 20), (50, 25), (50, 30), (50, 35), (50, 40),
+                                  (50, 45), (50, 50)]),
+                                (10, 10, 10, 0.1,
+                                 [(10, 2), (10, 3), (10, 4), (10, 5), (10, 6), (10, 7), (10, 8), (10, 9), (10, 10)]),
                                 (10, 5, 10, 0.01, [(10, 2), (10, 3), (10, 4), (10, 5)]),
-                                (50, 50, 10, 2.0, [(5, 10), (10, 20), (15, 30), (20, 40), (25, 50), (30, 50), (35, 50), (40, 50), (45, 50), (50, 50)]),
+                                (50, 50, 10, 2.0,
+                                 [(5, 10), (10, 20), (15, 30), (20, 40), (25, 50), (30, 50), (35, 50), (40, 50),
+                                  (45, 50), (50, 50)]),
                                 (5, 10, 10, 10.0, [(2, 10), (3, 10), (4, 10), (5, 10)]),
                                 (5, 10, 10, 11.0, [(2, 10), (3, 10), (4, 10), (5, 10)]),
                                 (50, 50, 50, 0.5, [(50, 25), (50, 50)]),
@@ -42,9 +50,11 @@ def test_define_buckets(max_seq_len, step, expected_buckets):
                                 (75, 75, 50, 1.5, [(33, 50), (66, 75), (75, 75)])]
 
 
-@pytest.mark.parametrize("max_seq_len_source, max_seq_len_target, bucket_width, length_ratio, expected_buckets", define_parallel_bucket_tests)
+@pytest.mark.parametrize("max_seq_len_source, max_seq_len_target, bucket_width, length_ratio, expected_buckets",
+                         define_parallel_bucket_tests)
 def test_define_parallel_buckets(max_seq_len_source, max_seq_len_target, bucket_width, length_ratio, expected_buckets):
-    buckets = sockeye.data_io.define_parallel_buckets(max_seq_len_source, max_seq_len_target, bucket_width=bucket_width, length_ratio=length_ratio)
+    buckets = data_io.define_parallel_buckets(max_seq_len_source, max_seq_len_target, bucket_width=bucket_width,
+                                              length_ratio=length_ratio)
     assert buckets == expected_buckets
 
 
@@ -60,7 +70,7 @@ def test_define_parallel_buckets(max_seq_len_source, max_seq_len_target, bucket_
 
 @pytest.mark.parametrize("buckets, length, expected_bucket", get_bucket_tests)
 def test_get_bucket(buckets, length, expected_bucket):
-    bucket = sockeye.data_io.get_bucket(length, buckets)
+    bucket = data_io.get_bucket(length, buckets)
     assert bucket == expected_bucket
 
 
@@ -70,7 +80,7 @@ def test_get_bucket(buckets, length, expected_bucket):
 
 @pytest.mark.parametrize("line, expected_tokens", get_tokens_tests)
 def test_get_tokens(line, expected_tokens):
-    tokens = list(sockeye.data_io.get_tokens(line))
+    tokens = list(data_io.get_tokens(line))
     assert tokens == expected_tokens
 
 
@@ -80,7 +90,7 @@ def test_get_tokens(line, expected_tokens):
 
 @pytest.mark.parametrize("tokens, vocab, expected_ids", tokens2ids_tests)
 def test_tokens2ids(tokens, vocab, expected_ids):
-    ids = sockeye.data_io.tokens2ids(tokens, vocab)
+    ids = data_io.tokens2ids(tokens, vocab)
     assert ids == expected_ids
 
 
@@ -88,7 +98,7 @@ def test_tokens2ids(tokens, vocab, expected_ids):
 [([(10, 10), (20, 20), (30, 30), (40, 40), (50, 50)], (50, 50)),
  ([(5, 10), (10, 20), (15, 30), (25, 50), (20, 40)], (25, 50))])
 def test_get_default_bucket_key(buckets, expected_default_bucket_key):
-    default_bucket_key = sockeye.data_io.get_default_bucket_key(buckets)
+    default_bucket_key = data_io.get_default_bucket_key(buckets)
     assert default_bucket_key == expected_default_bucket_key
 
 
@@ -104,6 +114,93 @@ def test_get_default_bucket_key(buckets, expected_default_bucket_key):
 @pytest.mark.parametrize("buckets, source_length, target_length, expected_bucket_index, expected_bucket",
                          get_parallel_bucket_tests)
 def test_get_parallel_bucket(buckets, source_length, target_length, expected_bucket_index, expected_bucket):
-    bucket_index, bucket = sockeye.data_io.get_parallel_bucket(buckets, source_length, target_length)
+    bucket_index, bucket = data_io.get_parallel_bucket(buckets, source_length, target_length)
     assert bucket_index == expected_bucket_index
     assert bucket == expected_bucket
+
+
+@pytest.mark.parametrize("source, target, expected_mean, expected_std",
+                         [([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                           [[1, 1, 1], [2, 2, 2], [3, 3, 3]], 1.0, 0.0),
+                          ([[1, 1], [2, 2], [3, 3]],
+                           [[1, 1, 1], [2, 2, 2], [3, 3, 3]], 1.5, 0.0),
+                          ([[1, 1, 1], [2, 2]],
+                           [[1, 1, 1], [2], [3, 3, 3]], 0.75, 0.25)])
+def test_length_statistics(source, target, expected_mean, expected_std):
+    mean, std = data_io.length_statistics(source, target)
+    assert np.isclose(mean, expected_mean)
+    assert np.isclose(std, expected_std)
+
+
+def test_get_training_data_iters():
+    train_line_count = 100
+    train_max_length = 30
+    dev_line_count = 20
+    dev_max_length = 30
+    expected_mean = 1.1476392401276574
+    expected_std = 0.2318455878853099
+    batch_size = 5
+    with tmp_digits_dataset("tmp_corpus",
+                            train_line_count, train_max_length, dev_line_count, dev_max_length) as data:
+        # tmp common vocab
+        vcb = vocab.build_from_paths([data['source'], data['target']])
+
+        train_iter, val_iter, config_data = data_io.get_training_data_iters(data['source'], data['target'],
+                                                                            data['validation_source'],
+                                                                            data['validation_target'],
+                                                                            vocab_source=vcb,
+                                                                            vocab_target=vcb,
+                                                                            vocab_source_path=None,
+                                                                            vocab_target_path=None,
+                                                                            batch_size=batch_size,
+                                                                            batch_by_words=False,
+                                                                            batch_num_devices=1,
+                                                                            fill_up="replicate",
+                                                                            max_seq_len_source=train_max_length,
+                                                                            max_seq_len_target=train_max_length,
+                                                                            bucketing=True,
+                                                                            bucket_width=10)
+        assert config_data.source == data['source']
+        assert config_data.target == data['target']
+        assert config_data.validation_source == data['validation_source']
+        assert config_data.validation_target == data['validation_target']
+        assert config_data.vocab_source is None
+        assert config_data.vocab_target is None
+        assert config_data.max_observed_source_seq_len == train_max_length - 1
+        assert config_data.max_observed_target_seq_len == train_max_length
+        assert np.isclose(config_data.length_ratio_mean, expected_mean)
+        assert np.isclose(config_data.length_ratio_std, expected_std)
+
+        assert train_iter.batch_size == batch_size
+        assert val_iter.batch_size == batch_size
+        assert train_iter.default_bucket_key == (train_max_length, train_max_length)
+        assert val_iter.default_bucket_key == (dev_max_length, dev_max_length)
+        assert train_iter.max_observed_source_len == config_data.max_observed_source_seq_len
+        assert train_iter.max_observed_target_len == config_data.max_observed_target_seq_len
+        assert train_iter.pad_id == vcb[C.PAD_SYMBOL]
+        assert train_iter.dtype == 'float32'
+        assert not train_iter.batch_by_words
+        assert train_iter.fill_up == 'replicate'
+
+        # test some batches
+        bos_id = vcb[C.BOS_SYMBOL]
+        expected_first_target_symbols = np.full((batch_size,), bos_id, dtype='float32')
+        for epoch in range(2):
+            while train_iter.iter_next():
+                batch = train_iter.next()
+                assert len(batch.data) == 2
+                assert len(batch.label) == 1
+                assert batch.bucket_key in train_iter.buckets
+                source = batch.data[0].asnumpy()
+                target = batch.data[1].asnumpy()
+                label = batch.label[0].asnumpy()
+                assert source.shape[0] == batch_size
+                assert target.shape[0] == batch_size
+                assert label.shape[0] == batch_size
+                # target first symbol should be BOS
+                assert np.array_equal(target[:, 0], expected_first_target_symbols)
+                # label first symbol should be 2nd target symbol
+                assert np.array_equal(label[:, 0], target[:, 1])
+                # each label sequence contains one EOS symbol
+                assert np.sum(label == vcb[C.EOS_SYMBOL]) == batch_size
+            train_iter.reset()
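
Note on the data_io.py hunk above: reset() rebuilds the per-bucket source and target arrays for each new epoch, but nd_label was never cleared, so the label list could keep growing (or go stale) across epochs and fall out of sync with the reshuffled source/target data. Below is a minimal sketch of that failure mode; ToyBucketIter is a hypothetical stand-in for illustration, not Sockeye's actual iterator:

import numpy as np

class ToyBucketIter:
    """Toy stand-in for a shuffling data iterator (illustration only)."""

    def __init__(self, sources, targets):
        self.sources = sources
        self.targets = targets
        self.nd_source, self.nd_target, self.nd_label = [], [], []
        self.reset()

    def reset(self):
        self.nd_source = []
        self.nd_target = []
        self.nd_label = []  # analogous to the line the patch adds; without it, append() grows a stale list
        for i in np.random.permutation(len(self.sources)):
            self.nd_source.append(self.sources[i])
            self.nd_target.append(self.targets[i])
            self.nd_label.append(self.targets[i][1:] + [2])  # shifted target plus an EOS id

it = ToyBucketIter(sources=[[5, 6]], targets=[[1, 5, 6]])
it.reset()  # start of a second epoch
assert len(it.nd_label) == len(it.nd_source)  # would fail if reset() forgot to clear nd_label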
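
The batch checks at the end of test_get_training_data_iters pin down the usual teacher-forcing layout: the decoder input (target) starts with BOS, the label is the target shifted left by one position, and every label row contains exactly one EOS. A self-contained illustration with made-up symbol ids (the real ids come from the vocabulary, e.g. vcb[C.BOS_SYMBOL]):

import numpy as np

BOS, EOS = 1, 2     # hypothetical ids, for illustration only
tokens = [7, 8, 9]  # one sequence of digit ids

target = np.array([BOS] + tokens)  # decoder input:  BOS t1 t2 t3
label = np.array(tokens + [EOS])   # training label: t1  t2 t3 EOS

assert target[0] == BOS                        # target starts with BOS
assert np.array_equal(label[:-1], target[1:])  # label is target shifted left by one
assert int(np.sum(label == EOS)) == 1          # exactly one EOS per label sequence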