diff --git a/dual_net.py b/dual_net.py
index 5ce6157c3..e0fe79e54 100644
--- a/dual_net.py
+++ b/dual_net.py
@@ -685,7 +685,7 @@ def freeze_graph(model_path, use_trt=False, trt_max_batch_size=8,
             max_workspace_size_bytes=1 << 29,
             precision_mode=trt_precision)
 
-    metadata = get_model_metadata({
+    metadata = make_model_metadata({
        'engine': 'tf',
        'use_trt': bool(use_trt),
    })
@@ -733,14 +733,14 @@ def freeze_graph_tpu(model_path):
     out_graph = tf.graph_util.convert_variables_to_constants(
         sess, sess.graph.as_graph_def(), output_names)
 
-    metadata = get_model_metadata({
+    metadata = make_model_metadata({
        'engine': 'tpu',
    })
 
     atomic_write_model(out_graph, metadata, model_path)
 
 
-def get_model_metadata(metadata):
+def make_model_metadata(metadata):
     for f in ['conv_width', 'fc_width', 'trunk_layers', 'use_SE', 'use_SE_bias',
               'use_swish', 'bool_features', 'input_features']:
         metadata[f] = getattr(FLAGS, f)
diff --git a/ml_perf/scripts/bootstrap.sh b/ml_perf/scripts/bootstrap.sh
index 8bc132f5f..3a760482f 100755
--- a/ml_perf/scripts/bootstrap.sh
+++ b/ml_perf/scripts/bootstrap.sh
@@ -36,7 +36,11 @@ for var_name in flag_dir golden_chunk_dir holdout_dir log_dir model_dir \
                 selfplay_dir sgf_dir work_dir; do
   dir="${!var_name}"
   if [[ "${dir}" == gs://* ]]; then
+    # `gsutil rm -f` "helpfully" returns a non-zero error code if the requested
+    # target files don't exist.
+    set +e
     gsutil -m rm -rf "${dir}"/*
+    set -e
   else
     mkdir -p "${dir}"
     rm -rf "${dir}"/*
diff --git a/ml_perf/scripts/start_selfplay.sh b/ml_perf/scripts/start_selfplay.sh
index c4e1b6d72..7333ca7a3 100755
--- a/ml_perf/scripts/start_selfplay.sh
+++ b/ml_perf/scripts/start_selfplay.sh
@@ -33,8 +33,8 @@ for device in {0..7}; do
   CUDA_VISIBLE_DEVICES="${device}" \
   ./bazel-bin/cc/concurrent_selfplay \
     --flagfile="${flag_dir}/selfplay.flags" \
-    --output_dir="${data_dir}/selfplay/\$MODEL/${device}" \
-    --holdout_dir="${data_dir}/holdout/\$MODEL/${device}" \
+    --output_dir="${selfplay_dir}/\$MODEL/${device}" \
+    --holdout_dir="${holdout_dir}/\$MODEL/${device}" \
     --model="${model_dir}/%d.pb" \
     --run_forever=1 \
     --abort_file=${abort_file} \
diff --git a/ml_perf/train_loop.py b/ml_perf/train_loop.py
index cdf9031ca..509560b84 100644
--- a/ml_perf/train_loop.py
+++ b/ml_perf/train_loop.py
@@ -61,7 +61,7 @@
                      'the training chunks as more threads are used to '
                      'compress the data. Using too many threads however could '
                      'slow down training time if each shard gets much smaller '
-                     'than around 100MB'.
+                     'than around 100MB.')
 
 flags.DEFINE_string('golden_chunk_dir', None, 'Training example directory.')
 flags.DEFINE_string('holdout_dir', None, 'Holdout example directory.')
diff --git a/train.py b/train.py
index 277d04e26..e3157e0d9 100644
--- a/train.py
+++ b/train.py
@@ -42,8 +42,9 @@
                      'once over training data.')
 
 flags.DEFINE_integer('num_examples', None,
-                     'Total number of examples passed. Used to calculate '
-                     'steps_to_train if it isn\'t set.')
+                     'Total number of input examples. This is only used if '
+                     'steps_to_train is not set. Requires that filter_amount '
+                     'is 1.0.')
 
 flags.DEFINE_integer('window_size', 500000,
                      'Number of games to include in the window')
@@ -68,6 +69,13 @@
     lambda flags: flags['use_tpu'] if flags['use_bt'] else True,
     '`use_bt` flag only valid with `use_tpu` as well')
 
+@flags.multi_flags_validator(
+    ['num_examples', 'steps_to_train', 'filter_amount'],
+    '`num_examples` requires `steps_to_train==0` and `filter_amount==1.0`')
+def _example_flags_validator(flags_dict):
+    if not flags_dict['num_examples']:
+        return True
+    return not flags_dict['steps_to_train'] and flags_dict['filter_amount'] == 1.0
+
 @flags.multi_flags_validator(
     ['use_bt', 'cbt_project', 'cbt_instance', 'cbt_table'],
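
For reference, a minimal standalone sketch of how the multi-flags validator added to train.py behaves. This is illustrative only, not part of the patch; the flag defaults and main() below are assumptions:

    # Sketch (not part of the patch) of absl's multi_flags_validator pattern
    # used by the new _example_flags_validator. Defaults here are hypothetical.
    from absl import app, flags

    FLAGS = flags.FLAGS

    flags.DEFINE_integer('num_examples', None, 'Total number of input examples.')
    flags.DEFINE_integer('steps_to_train', 0, 'Number of training steps.')
    flags.DEFINE_float('filter_amount', 1.0, 'Fraction of examples to keep.')


    @flags.multi_flags_validator(
        ['num_examples', 'steps_to_train', 'filter_amount'],
        '`num_examples` requires `steps_to_train==0` and `filter_amount==1.0`')
    def _example_flags_validator(flags_dict):
        # The validator receives a dict of the named flags' values and must
        # return True when the combination is valid; otherwise absl raises
        # IllegalFlagValueError during flag parsing.
        if not flags_dict['num_examples']:
            return True
        return (not flags_dict['steps_to_train']
                and flags_dict['filter_amount'] == 1.0)


    def main(argv):
        del argv
        # e.g. `--num_examples=1000` parses fine, while
        # `--num_examples=1000 --filter_amount=0.5` fails at parse time.
        print('num_examples =', FLAGS.num_examples)


    if __name__ == '__main__':
        app.run(main)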