diff --git a/benchmarks/horovod-resnet/execute_horovod_training.py b/benchmarks/horovod-resnet/execute_horovod_training.py index e6ac7609..4b0b9b23 100755 --- a/benchmarks/horovod-resnet/execute_horovod_training.py +++ b/benchmarks/horovod-resnet/execute_horovod_training.py @@ -26,7 +26,7 @@ from sagemaker.tensorflow import TensorFlow dir_path = os.path.dirname(os.path.realpath(__file__)) -benchmark_results_dir = os.path.join('s3://', Session().default_bucket(), 'hvd-benchmarking') +benchmark_results_dir = os.path.join("s3://", Session().default_bucket(), "hvd-benchmarking") @click.group() @@ -35,93 +35,98 @@ def cli(): def generate_report(): - results_dir = os.path.join(dir_path, 'results') + results_dir = os.path.join(dir_path, "results") if os.path.exists(results_dir): shutil.rmtree(results_dir) - subprocess.call(['aws', 's3', 'cp', '--recursive', benchmark_results_dir, results_dir]) + subprocess.call(["aws", "s3", "cp", "--recursive", benchmark_results_dir, results_dir]) jobs = {} for job_name in os.listdir(results_dir): jobs[job_name] = {} - _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split('-') + _, instance_type, instance_count, device, py_version, _, _, _, _, _, _, _ = job_name.split( + "-" + ) current_dir = os.path.join(results_dir, job_name) - model_dir = os.path.join(current_dir, 'output', 'model.tar.gz') - subprocess.call(['tar', '-xvzf', model_dir], cwd=current_dir) + model_dir = os.path.join(current_dir, "output", "model.tar.gz") + subprocess.call(["tar", "-xvzf", model_dir], cwd=current_dir) - jobs[job_name]['instance_type'] = instance_type - jobs[job_name]['instance_count'] = instance_count - jobs[job_name]['device'] = device - jobs[job_name]['py_version'] = py_version + jobs[job_name]["instance_type"] = instance_type + jobs[job_name]["instance_count"] = instance_count + jobs[job_name]["device"] = device + jobs[job_name]["py_version"] = py_version - benchmark_log = os.path.join(current_dir, 
'benchmark_run.log') + benchmark_log = os.path.join(current_dir, "benchmark_run.log") if os.path.exists(benchmark_log): with open(benchmark_log) as f: data = json.load(f) - - jobs[job_name]['dataset'] = data['dataset']['name'] - jobs[job_name]['num_cores'] = data['machine_config']['cpu_info']['num_cores'] - jobs[job_name]['cpu_info'] = data['machine_config']['cpu_info']['cpu_info'] - jobs[job_name]['mhz_per_cpu'] = data['machine_config']['cpu_info']['mhz_per_cpu'] - jobs[job_name]['gpu_count'] = data['machine_config']['gpu_info']['count'] - jobs[job_name]['gpu_model'] = data['machine_config']['gpu_info']['model'] + jobs[job_name]["dataset"] = data["dataset"]["name"] + jobs[job_name]["num_cores"] = data["machine_config"]["cpu_info"]["num_cores"] + jobs[job_name]["cpu_info"] = data["machine_config"]["cpu_info"]["cpu_info"] + jobs[job_name]["mhz_per_cpu"] = data["machine_config"]["cpu_info"]["mhz_per_cpu"] + jobs[job_name]["gpu_count"] = data["machine_config"]["gpu_info"]["count"] + jobs[job_name]["gpu_model"] = data["machine_config"]["gpu_info"]["model"] def find_value(parameter): - other_key = [k for k in parameter if k != 'name'][0] + other_key = [k for k in parameter if k != "name"][0] return parameter[other_key] - for parameter in data['run_parameters']: - jobs[job_name][parameter['name']] = find_value(parameter) + for parameter in data["run_parameters"]: + jobs[job_name][parameter["name"]] = find_value(parameter) - jobs[job_name]['model_name'] = data['model_name'] - jobs[job_name]['run_date'] = data['run_date'] - jobs[job_name]['tensorflow_version'] = data['tensorflow_version']['version'] - jobs[job_name]['tensorflow_version_git_hash'] = data['tensorflow_version']['git_hash'] + jobs[job_name]["model_name"] = data["model_name"] + jobs[job_name]["run_date"] = data["run_date"] + jobs[job_name]["tensorflow_version"] = data["tensorflow_version"]["version"] + jobs[job_name]["tensorflow_version_git_hash"] = data["tensorflow_version"][ + "git_hash" + ] return 
pd.DataFrame(jobs) -@cli.command('train') -@click.option('--framework-version', required=True, type=click.Choice(['1.11', '1.12'])) -@click.option('--device', required=True, type=click.Choice(['cpu', 'gpu'])) -@click.option('--py-versions', multiple=True, type=str) -@click.option('--training-input-mode', default='File', type=click.Choice(['File', 'Pipe'])) -@click.option('--networking-isolation/--no-networking-isolation', default=False) -@click.option('--wait/--no-wait', default=False) -@click.option('--security-groups', multiple=True, type=str) -@click.option('--subnets', multiple=True, type=str) -@click.option('--role', default='SageMakerRole', type=str) -@click.option('--instance-counts', multiple=True, type=int) -@click.option('--instance-types', multiple=True, type=str) -@click.argument('script_args', nargs=-1, type=str) -def train(framework_version, - device, - py_versions, - training_input_mode, - networking_isolation, - wait, - security_groups, - subnets, - role, - instance_counts, - instance_types, - script_args): +@cli.command("train") +@click.option("--framework-version", required=True, type=click.Choice(["1.11", "1.12"])) +@click.option("--device", required=True, type=click.Choice(["cpu", "gpu"])) +@click.option("--py-versions", multiple=True, type=str) +@click.option("--training-input-mode", default="File", type=click.Choice(["File", "Pipe"])) +@click.option("--networking-isolation/--no-networking-isolation", default=False) +@click.option("--wait/--no-wait", default=False) +@click.option("--security-groups", multiple=True, type=str) +@click.option("--subnets", multiple=True, type=str) +@click.option("--role", default="SageMakerRole", type=str) +@click.option("--instance-counts", multiple=True, type=int) +@click.option("--instance-types", multiple=True, type=str) +@click.argument("script_args", nargs=-1, type=str) +def train( + framework_version, + device, + py_versions, + training_input_mode, + networking_isolation, + wait, + security_groups, + 
subnets, + role, + instance_counts, + instance_types, + script_args, +): iterator = itertools.product(instance_types, py_versions, instance_counts) for instance_type, py_version, instance_count in iterator: base_name = job_name(instance_type, instance_count, device, py_version) - mpi_options = '-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog' + mpi_options = "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 -x TF_CPP_MIN_LOG_LEVEL=0 -x HOROVOD_TIMELINE --output-filename /opt/ml/model/hlog" estimator = TensorFlow( - entry_point=os.path.join(dir_path, 'train.sh'), + entry_point=os.path.join(dir_path, "train.sh"), role=role, - dependencies=[os.path.join(dir_path, 'train_imagenet_resnet_hvd.py')], + dependencies=[os.path.join(dir_path, "train_imagenet_resnet_hvd.py")], base_job_name=base_name, train_instance_count=instance_count, train_instance_type=instance_type, @@ -129,36 +134,34 @@ def train(framework_version, py_version=py_version, script_mode=True, hyperparameters={ - 'sagemaker_mpi_enabled': True, - 'sagemaker_mpi_num_of_processes_per_host': 8, - 'sagemaker_mpi_custom_mpi_options': mpi_options + "sagemaker_mpi_enabled": True, + "sagemaker_mpi_num_of_processes_per_host": 8, + "sagemaker_mpi_custom_mpi_options": mpi_options, }, output_path=benchmark_results_dir, security_group_ids=security_groups, - subnets=subnets + subnets=subnets, ) estimator.fit(wait=wait) if wait: - artifacts_path = os.path.join(dir_path, 'results', - estimator.latest_training_job.job_name) - model_path = os.path.join(artifacts_path, 'model.tar.gz') + artifacts_path = os.path.join( + dir_path, "results", estimator.latest_training_job.job_name + ) + model_path = os.path.join(artifacts_path, "model.tar.gz") os.makedirs(artifacts_path) - subprocess.call(['aws', 's3', 'cp', estimator.model_data, model_path]) - subprocess.call(['tar', '-xvzf', model_path], 
cwd=artifacts_path) + subprocess.call(["aws", "s3", "cp", estimator.model_data, model_path]) + subprocess.call(["tar", "-xvzf", model_path], cwd=artifacts_path) + + print("Model downloaded at %s" % model_path) - print('Model downloaded at %s' % model_path) +def job_name(instance_type, instance_count, device, python_version): + instance_typename = instance_type.replace(".", "").replace("ml", "") -def job_name(instance_type, - instance_count, - device, - python_version): - instance_typename = instance_type.replace('.', '').replace('ml', '') + return "hvd-%s-%s-%s-%s" % (instance_typename, instance_count, device, python_version) - return 'hvd-%s-%s-%s-%s' % ( - instance_typename, instance_count, device, python_version) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py index d415c62d..cf0e2486 100644 --- a/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py +++ b/benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py @@ -51,18 +51,26 @@ from operator import itemgetter from tensorflow.python.util import nest + def rank0log(logger, *args, **kwargs): if hvd.rank() == 0: if logger: - logger.info(''.join([str(x) for x in list(args)])) + logger.info("".join([str(x) for x in list(args)])) else: print(*args, **kwargs) class LayerBuilder(object): - def __init__(self, activation=None, data_format='channels_last', - training=False, use_batch_norm=False, batch_norm_config=None, - conv_initializer=None, adv_bn_init=False): + def __init__( + self, + activation=None, + data_format="channels_last", + training=False, + use_batch_norm=False, + batch_norm_config=None, + conv_initializer=None, + adv_bn_init=False, + ): self.activation = activation self.data_format = data_format self.training = training @@ -72,19 +80,22 @@ def __init__(self, activation=None, data_format='channels_last', self.adv_bn_init = adv_bn_init if self.batch_norm_config is None: 
self.batch_norm_config = { - 'decay': 0.9, - 'epsilon': 1e-4, - 'scale': True, - 'zero_debias_moving_mean': False, + "decay": 0.9, + "epsilon": 1e-4, + "scale": True, + "zero_debias_moving_mean": False, } def _conv2d(self, inputs, activation, *args, **kwargs): x = tf.layers.conv2d( - inputs, data_format=self.data_format, + inputs, + data_format=self.data_format, use_bias=not self.use_batch_norm, kernel_initializer=self.conv_initializer, activation=None if self.use_batch_norm else activation, - *args, **kwargs) + *args, + **kwargs + ) if self.use_batch_norm: x = self.batch_norm(x) x = activation(x) if activation is not None else x @@ -92,19 +103,23 @@ def _conv2d(self, inputs, activation, *args, **kwargs): def conv2d_linear_last_bn(self, inputs, *args, **kwargs): x = tf.layers.conv2d( - inputs, data_format=self.data_format, + inputs, + data_format=self.data_format, use_bias=False, kernel_initializer=self.conv_initializer, - activation=None, *args, **kwargs) + activation=None, + *args, + **kwargs + ) param_initializers = { - 'moving_mean': tf.zeros_initializer(), - 'moving_variance': tf.ones_initializer(), - 'beta': tf.zeros_initializer(), + "moving_mean": tf.zeros_initializer(), + "moving_variance": tf.ones_initializer(), + "beta": tf.zeros_initializer(), } if self.adv_bn_init: - param_initializers['gamma'] = tf.zeros_initializer() + param_initializers["gamma"] = tf.zeros_initializer() else: - param_initializers['gamma'] = tf.ones_initializer() + param_initializers["gamma"] = tf.ones_initializer() x = self.batch_norm(x, param_initializers=param_initializers) return x @@ -125,19 +140,17 @@ def pad2d(self, inputs, begin, end=None): _ = end[1] except TypeError: end = [end, end] - if self.data_format == 'channels_last': + if self.data_format == "channels_last": padding = [[0, 0], [begin[0], end[0]], [begin[1], end[1]], [0, 0]] else: padding = [[0, 0], [0, 0], [begin[0], end[0]], [begin[1], end[1]]] return tf.pad(inputs, padding) def max_pooling2d(self, inputs, *args, 
**kwargs): - return tf.layers.max_pooling2d( - inputs, data_format=self.data_format, *args, **kwargs) + return tf.layers.max_pooling2d(inputs, data_format=self.data_format, *args, **kwargs) def average_pooling2d(self, inputs, *args, **kwargs): - return tf.layers.average_pooling2d( - inputs, data_format=self.data_format, *args, **kwargs) + return tf.layers.average_pooling2d(inputs, data_format=self.data_format, *args, **kwargs) def dense_linear(self, inputs, units, **kwargs): return tf.layers.dense(inputs, units, activation=None) @@ -152,72 +165,72 @@ def activate(self, inputs, activation=None): def batch_norm(self, inputs, **kwargs): all_kwargs = dict(self.batch_norm_config) all_kwargs.update(kwargs) - data_format = 'NHWC' if self.data_format == 'channels_last' else 'NCHW' + data_format = "NHWC" if self.data_format == "channels_last" else "NCHW" return tf.contrib.layers.batch_norm( - inputs, is_training=self.training, data_format=data_format, - fused=True, **all_kwargs) + inputs, is_training=self.training, data_format=data_format, fused=True, **all_kwargs + ) def spatial_average2d(self, inputs): shape = inputs.get_shape().as_list() - if self.data_format == 'channels_last': + if self.data_format == "channels_last": n, h, w, c = shape else: n, c, h, w = shape n = -1 if n is None else n - x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1), - data_format=self.data_format) + x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1), data_format=self.data_format) return tf.reshape(x, [n, c]) def flatten2d(self, inputs): x = inputs - if self.data_format != 'channel_last': + if self.data_format != "channel_last": # Note: This ensures the output order matches that of NHWC networks x = tf.transpose(x, [0, 2, 3, 1]) input_shape = x.get_shape().as_list() num_inputs = 1 for dim in input_shape[1:]: num_inputs *= dim - return tf.reshape(x, [-1, num_inputs], name='flatten') + return tf.reshape(x, [-1, num_inputs], name="flatten") def residual2d(self, inputs, network, units=None, 
scale=1.0, activate=False): outputs = network(inputs) - c_axis = -1 if self.data_format == 'channels_last' else 1 - h_axis = 1 if self.data_format == 'channels_last' else 2 + c_axis = -1 if self.data_format == "channels_last" else 1 + h_axis = 1 if self.data_format == "channels_last" else 2 w_axis = h_axis + 1 ishape, oshape = [y.get_shape().as_list() for y in [inputs, outputs]] ichans, ochans = ishape[c_axis], oshape[c_axis] - strides = ((ishape[h_axis] - 1) // oshape[h_axis] + 1, - (ishape[w_axis] - 1) // oshape[w_axis] + 1) - with tf.name_scope('residual'): - if (ochans != ichans or strides[0] != 1 or strides[1] != 1): - inputs = self.conv2d_linear(inputs, units, 1, strides, 'SAME') + strides = ( + (ishape[h_axis] - 1) // oshape[h_axis] + 1, + (ishape[w_axis] - 1) // oshape[w_axis] + 1, + ) + with tf.name_scope("residual"): + if ochans != ichans or strides[0] != 1 or strides[1] != 1: + inputs = self.conv2d_linear(inputs, units, 1, strides, "SAME") x = inputs + scale * outputs if activate: x = self.activate(x) return x -def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, - basic=False): +def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, basic=False): num_inputs = inputs.get_shape().as_list()[1] x = inputs - with tf.name_scope('resnet_v1'): + with tf.name_scope("resnet_v1"): if depth == num_inputs: if stride == 1: shortcut = x else: shortcut = builder.max_pooling2d(x, 1, stride) else: - shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME') + shortcut = builder.conv2d_linear(x, depth, 1, stride, "SAME") if basic: x = builder.pad2d(x, 1) - x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID') - x = builder.conv2d_linear(x, depth, 3, 1, 'SAME') + x = builder.conv2d(x, depth_bottleneck, 3, stride, "VALID") + x = builder.conv2d_linear(x, depth, 3, 1, "SAME") else: - x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME') - x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME') + x = builder.conv2d(x, 
depth_bottleneck, 1, 1, "SAME") + x = builder.conv2d(x, depth_bottleneck, 3, stride, "SAME") # x = builder.conv2d_linear(x, depth, 1, 1, 'SAME') - x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME') + x = builder.conv2d_linear_last_bn(x, depth, 1, 1, "SAME") x = tf.nn.relu(x + shortcut) return x @@ -225,8 +238,8 @@ def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False): x = inputs x = builder.pad2d(x, 3) - x = builder.conv2d(x, 64, 7, 2, 'VALID') - x = builder.max_pooling2d(x, 3, 2, 'SAME') + x = builder.conv2d(x, 64, 7, 2, "VALID") + x = builder.max_pooling2d(x, 3, 2, "SAME") for i in range(layer_counts[0]): x = resnet_bottleneck_v1(builder, x, 256, 64, 1, basic) for i in range(layer_counts[1]): @@ -238,13 +251,25 @@ def inference_resnet_v1_impl(builder, inputs, layer_counts, basic=False): return builder.spatial_average2d(x) -def inference_resnet_v1(inputs, nlayer, data_format='channels_last', - training=False, conv_initializer=None, adv_bn_init=False): +def inference_resnet_v1( + inputs, + nlayer, + data_format="channels_last", + training=False, + conv_initializer=None, + adv_bn_init=False, +): """Deep Residual Networks family of models https://arxiv.org/abs/1512.03385 """ - builder = LayerBuilder(tf.nn.relu, data_format, training, use_batch_norm=True, - conv_initializer=conv_initializer, adv_bn_init=adv_bn_init) + builder = LayerBuilder( + tf.nn.relu, + data_format, + training, + use_batch_norm=True, + conv_initializer=conv_initializer, + adv_bn_init=adv_bn_init, + ) if nlayer == 18: return inference_resnet_v1_impl(builder, inputs, [2, 2, 2, 2], basic=True) elif nlayer == 34: @@ -256,83 +281,95 @@ def inference_resnet_v1(inputs, nlayer, data_format='channels_last', elif nlayer == 152: return inference_resnet_v1_impl(builder, inputs, [3, 8, 36, 3]) else: - raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" % - nlayer) + raise 
ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" % nlayer) def get_model_func(model_name): - if model_name.startswith('resnet'): - nlayer = int(model_name[len('resnet'):]) - return lambda images, *args, **kwargs: \ - inference_resnet_v1(images, nlayer, *args, **kwargs) + if model_name.startswith("resnet"): + nlayer = int(model_name[len("resnet") :]) + return lambda images, *args, **kwargs: inference_resnet_v1(images, nlayer, *args, **kwargs) else: raise ValueError("Invalid model type: %s" % model_name) def deserialize_image_record(record): feature_map = { - 'image/encoded': tf.FixedLenFeature([], tf.string, ''), - 'image/class/label': tf.FixedLenFeature([1], tf.int64, -1), - 'image/class/text': tf.FixedLenFeature([], tf.string, ''), - 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), - 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32) + "image/encoded": tf.FixedLenFeature([], tf.string, ""), + "image/class/label": tf.FixedLenFeature([1], tf.int64, -1), + "image/class/text": tf.FixedLenFeature([], tf.string, ""), + "image/object/bbox/xmin": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/ymin": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/xmax": tf.VarLenFeature(dtype=tf.float32), + "image/object/bbox/ymax": tf.VarLenFeature(dtype=tf.float32), } - with tf.name_scope('deserialize_image_record'): + with tf.name_scope("deserialize_image_record"): obj = tf.parse_single_example(record, feature_map) - imgdata = obj['image/encoded'] - label = tf.cast(obj['image/class/label'], tf.int32) - bbox = tf.stack([obj['image/object/bbox/%s' % x].values - for x in ['ymin', 'xmin', 'ymax', 'xmax']]) + imgdata = obj["image/encoded"] + label = tf.cast(obj["image/class/label"], tf.int32) + bbox = tf.stack( + [obj["image/object/bbox/%s" % x].values for x in ["ymin", "xmin", "ymax", "xmax"]] + ) bbox = 
tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1]) - text = obj['image/class/text'] + text = obj["image/class/text"] return imgdata, label, bbox, text def decode_jpeg(imgdata, channels=3): - return tf.image.decode_jpeg(imgdata, channels=channels, - fancy_upscaling=False, - dct_method='INTEGER_FAST') + return tf.image.decode_jpeg( + imgdata, channels=channels, fancy_upscaling=False, dct_method="INTEGER_FAST" + ) -def crop_and_resize_image(image, original_bbox, height, width, - distort=False, nsummary=10): - with tf.name_scope('crop_and_resize'): +def crop_and_resize_image(image, original_bbox, height, width, distort=False, nsummary=10): + with tf.name_scope("crop_and_resize"): # Evaluation is done on a center-crop of this ratio eval_crop_ratio = 0.8 if distort: - initial_shape = [int(round(height / eval_crop_ratio)), - int(round(width / eval_crop_ratio)), - 3] - bbox_begin, bbox_size, bbox = \ - tf.image.sample_distorted_bounding_box( - initial_shape, - bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), - # tf.zeros(shape=[1,0,4]), # No bounding boxes - min_object_covered=0.1, - aspect_ratio_range=[3. / 4., 4. 
/ 3.], - area_range=[0.08, 1.0], - max_attempts=100, - seed=11 * hvd.rank(), # Need to set for deterministic results - use_image_if_no_bounding_boxes=True) + initial_shape = [ + int(round(height / eval_crop_ratio)), + int(round(width / eval_crop_ratio)), + 3, + ] + bbox_begin, bbox_size, bbox = tf.image.sample_distorted_bounding_box( + initial_shape, + bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), + # tf.zeros(shape=[1,0,4]), # No bounding boxes + min_object_covered=0.1, + aspect_ratio_range=[3.0 / 4.0, 4.0 / 3.0], + area_range=[0.08, 1.0], + max_attempts=100, + seed=11 * hvd.rank(), # Need to set for deterministic results + use_image_if_no_bounding_boxes=True, + ) bbox = bbox[0, 0] # Remove batch, box_idx dims else: # Central crop ratio_y = ratio_x = eval_crop_ratio - bbox = tf.constant([0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x), - 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)]) - image = tf.image.crop_and_resize( - image[None, :, :, :], bbox[None, :], [0], [height, width])[0] + bbox = tf.constant( + [0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x), 0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)] + ) + image = tf.image.crop_and_resize(image[None, :, :, :], bbox[None, :], [0], [height, width])[ + 0 + ] return image -def parse_and_preprocess_image_record(record, counter, height, width, - brightness, contrast, saturation, hue, - distort=False, nsummary=10, increased_aug=False): +def parse_and_preprocess_image_record( + record, + counter, + height, + width, + brightness, + contrast, + saturation, + hue, + distort=False, + nsummary=10, + increased_aug=False, +): imgdata, label, bbox, text = deserialize_image_record(record) label -= 1 # Change to 0-based (don't use background class) - with tf.name_scope('preprocess_train'): + with tf.name_scope("preprocess_train"): try: image = decode_jpeg(imgdata, channels=3) except: @@ -342,24 +379,44 @@ def parse_and_preprocess_image_record(record, counter, height, width, image = 
tf.image.random_flip_left_right(image) if increased_aug: image = tf.image.random_brightness(image, max_delta=brightness) - image = distort_image_ops.random_hsv_in_yiq(image, - lower_saturation=saturation, - upper_saturation=2.0 - saturation, - max_delta_hue=hue * math.pi) + image = distort_image_ops.random_hsv_in_yiq( + image, + lower_saturation=saturation, + upper_saturation=2.0 - saturation, + max_delta_hue=hue * math.pi, + ) image = tf.image.random_contrast(image, lower=contrast, upper=2.0 - contrast) - tf.summary.image('distorted_color_image', tf.expand_dims(image, 0)) - image = tf.clip_by_value(image, 0., 255.) + tf.summary.image("distorted_color_image", tf.expand_dims(image, 0)) + image = tf.clip_by_value(image, 0.0, 255.0) image = tf.cast(image, tf.uint8) return image, label -def make_dataset(filenames, take_count, batch_size, height, width, - brightness, contrast, saturation, hue, - training=False, num_threads=10, nsummary=10, shard=False, synthetic=False, - increased_aug=False): + +def make_dataset( + filenames, + take_count, + batch_size, + height, + width, + brightness, + contrast, + saturation, + hue, + training=False, + num_threads=10, + nsummary=10, + shard=False, + synthetic=False, + increased_aug=False, +): if synthetic and training: input_shape = [height, width, 3] - input_element = nest.map_structure(lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape)) - label_element = nest.map_structure(lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1])) + input_element = nest.map_structure( + lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape) + ) + label_element = nest.map_structure( + lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]) + ) element = (input_element, label_element) ds = tf.data.Dataset.from_tensors(element).repeat() else: @@ -380,16 +437,29 @@ def make_dataset(filenames, take_count, batch_size, height, width, if training: ds = ds.shuffle(1000, seed=7 * (1 + hvd.rank())) - ds = ds.interleave( 
- tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1) + ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1) counter = tf.data.Dataset.range(sys.maxsize) ds = tf.data.Dataset.zip((ds, counter)) preproc_func = lambda record, counter_: parse_and_preprocess_image_record( - record, counter_, height, width, brightness, contrast, saturation, hue, - distort=training, nsummary=nsummary if training else 0, increased_aug=increased_aug) + record, + counter_, + height, + width, + brightness, + contrast, + saturation, + hue, + distort=training, + nsummary=nsummary if training else 0, + increased_aug=increased_aug, + ) ds = ds.map(preproc_func, num_parallel_calls=num_threads) if training: - ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+hvd.rank()))) + ds = ds.apply( + tf.data.experimental.shuffle_and_repeat( + shuffle_buffer_size, seed=5 * (1 + hvd.rank()) + ) + ) ds = ds.batch(batch_size) return ds @@ -399,18 +469,19 @@ def stage(tensors): """ stage_area = data_flow_ops.StagingArea( dtypes=[tensor.dtype for tensor in tensors], - shapes=[tensor.get_shape() for tensor in tensors]) + shapes=[tensor.get_shape() for tensor in tensors], + ) put_op = stage_area.put(tensors) get_tensors = stage_area.get() - tf.add_to_collection('STAGING_AREA_PUTS', put_op) + tf.add_to_collection("STAGING_AREA_PUTS", put_op) return put_op, get_tensors class PrefillStagingAreasHook(tf.train.SessionRunHook): def after_create_session(self, session, coord): - enqueue_ops = tf.get_collection('STAGING_AREA_PUTS') + enqueue_ops = tf.get_collection("STAGING_AREA_PUTS") for i in range(len(enqueue_ops)): - session.run(enqueue_ops[:i + 1]) + session.run(enqueue_ops[: i + 1]) class LogSessionRunHook(tf.train.SessionRunHook): @@ -421,15 +492,15 @@ def __init__(self, global_batch_size, num_records, display_every=10, logger=None self.logger = logger def after_create_session(self, session, coord): - rank0log(self.logger, ' Step Epoch 
Speed Loss FinLoss LR') - self.elapsed_secs = 0. + rank0log(self.logger, " Step Epoch Speed Loss FinLoss LR") + self.elapsed_secs = 0.0 self.count = 0 def before_run(self, run_context): self.t0 = time.time() return tf.train.SessionRunArgs( - fetches=[tf.train.get_global_step(), - 'loss:0', 'total_loss:0', 'learning_rate:0']) + fetches=[tf.train.get_global_step(), "loss:0", "total_loss:0", "learning_rate:0"] + ) def after_run(self, run_context, run_values): self.elapsed_secs += time.time() - self.t0 @@ -439,25 +510,37 @@ def after_run(self, run_context, run_values): dt = self.elapsed_secs / self.count img_per_sec = self.global_batch_size / dt epoch = global_step * self.global_batch_size / self.num_records - self.logger.info('%6i %5.1f %7.1f %6.3f %6.3f %7.5f' % - (global_step, epoch, img_per_sec, loss, total_loss, lr)) - self.elapsed_secs = 0. + self.logger.info( + "%6i %5.1f %7.1f %6.3f %6.3f %7.5f" + % (global_step, epoch, img_per_sec, loss, total_loss, lr) + ) + self.elapsed_secs = 0.0 self.count = 0 -def _fp32_trainvar_getter(getter, name, shape=None, dtype=None, - trainable=True, regularizer=None, - *args, **kwargs): +def _fp32_trainvar_getter( + getter, name, shape=None, dtype=None, trainable=True, regularizer=None, *args, **kwargs +): storage_dtype = tf.float32 if trainable else dtype - variable = getter(name, shape, dtype=storage_dtype, - trainable=trainable, - regularizer=regularizer if trainable and 'BatchNorm' not in name and 'batchnorm' not in name and 'batch_norm' not in name and 'Batch_Norm' not in name else None, - *args, **kwargs) + variable = getter( + name, + shape, + dtype=storage_dtype, + trainable=trainable, + regularizer=regularizer + if trainable + and "BatchNorm" not in name + and "batchnorm" not in name + and "batch_norm" not in name + and "Batch_Norm" not in name + else None, + *args, + **kwargs + ) if trainable and dtype != tf.float32: - cast_name = name + '/fp16_cast' + cast_name = name + "/fp16_cast" try: - cast_variable = 
tf.get_default_graph().get_tensor_by_name( - cast_name + ':0') + cast_variable = tf.get_default_graph().get_tensor_by_name(cast_name + ":0") except KeyError: cast_variable = tf.cast(variable, dtype, name=cast_name) cast_variable._ref = variable._ref @@ -465,31 +548,26 @@ def _fp32_trainvar_getter(getter, name, shape=None, dtype=None, return variable -def fp32_trainable_vars(name='fp32_vars', *args, **kwargs): +def fp32_trainable_vars(name="fp32_vars", *args, **kwargs): """A varible scope with custom variable getter to convert fp16 trainable variables with fp32 storage followed by fp16 cast. """ - return tf.variable_scope( - name, custom_getter=_fp32_trainvar_getter, *args, **kwargs) + return tf.variable_scope(name, custom_getter=_fp32_trainvar_getter, *args, **kwargs) class MixedPrecisionOptimizer(tf.train.Optimizer): """An optimizer that updates trainable variables in fp32.""" - def __init__(self, optimizer, - scale=None, - name="MixedPrecisionOptimizer", - use_locking=False): - super(MixedPrecisionOptimizer, self).__init__( - name=name, use_locking=use_locking) + def __init__(self, optimizer, scale=None, name="MixedPrecisionOptimizer", use_locking=False): + super(MixedPrecisionOptimizer, self).__init__(name=name, use_locking=use_locking) self._optimizer = optimizer self._scale = float(scale) if scale is not None else 1.0 def compute_gradients(self, loss, var_list=None, *args, **kwargs): if var_list is None: - var_list = ( - tf.trainable_variables() + - tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) + var_list = tf.trainable_variables() + tf.get_collection( + tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES + ) replaced_list = var_list @@ -503,7 +581,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs): if var is not orig_var: grad = tf.cast(grad, orig_var.dtype) if self._scale != 1.0: - grad = tf.scalar_mul(1. 
/ self._scale, grad) + grad = tf.scalar_mul(1.0 / self._scale, grad) final_gradvar.append((grad, orig_var)) return final_gradvar @@ -511,6 +589,7 @@ def compute_gradients(self, loss, var_list=None, *args, **kwargs): def apply_gradients(self, *args, **kwargs): return self._optimizer.apply_gradients(*args, **kwargs) + class LarcOptimizer(tf.train.Optimizer): """ LARC implementation ------------------- @@ -524,10 +603,17 @@ class LarcOptimizer(tf.train.Optimizer): - use_locking """ - def __init__(self, optimizer, learning_rate, eta, clip=True, epsilon=1., - name="LarcOptimizer", use_locking=False): - super(LarcOptimizer, self).__init__( - name=name, use_locking=use_locking) + def __init__( + self, + optimizer, + learning_rate, + eta, + clip=True, + epsilon=1.0, + name="LarcOptimizer", + use_locking=False, + ): + super(LarcOptimizer, self).__init__(name=name, use_locking=use_locking) self._optimizer = optimizer self._learning_rate = learning_rate self._eta = float(eta) @@ -539,16 +625,13 @@ def compute_gradients(self, *args, **kwargs): def apply_gradients(self, gradvars, *args, **kwargs): v_list = [tf.norm(tensor=v, ord=2) for _, v in gradvars] - g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 - for g, _ in gradvars] + g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 for g, _ in gradvars] v_norms = tf.stack(v_list) g_norms = tf.stack(g_list) zeds = tf.zeros_like(v_norms) # assign epsilon if weights or grads = 0, to avoid division by zero # also prevent biases to get stuck at initialization (0.) 
- cond = tf.logical_and( - tf.not_equal(v_norms, zeds), - tf.not_equal(g_norms, zeds)) + cond = tf.logical_and(tf.not_equal(v_norms, zeds), tf.not_equal(g_norms, zeds)) true_vals = tf.scalar_mul(self._eta, tf.div(v_norms, g_norms)) # true_vals = tf.scalar_mul(tf.cast(self._eta, tf.float32), tf.div(tf.cast(v_norms, tf.float32), tf.cast(g_norms, tf.float32))) false_vals = tf.fill(tf.shape(v_norms), self._epsilon) @@ -561,9 +644,10 @@ def apply_gradients(self, gradvars, *args, **kwargs): # for which learning rate is already fixed # We then have to scale the gradients instead of the learning rate. larc_local_lr = tf.minimum(tf.div(larc_local_lr, lr), ones) - gradvars = [(tf.multiply(larc_local_lr[i], g), v) - if g is not None else (None, v) - for i, (g, v) in enumerate(gradvars)] + gradvars = [ + (tf.multiply(larc_local_lr[i], g), v) if g is not None else (None, v) + for i, (g, v) in enumerate(gradvars) + ] return self._optimizer.apply_gradients(gradvars, *args, **kwargs) @@ -571,45 +655,64 @@ def get_with_default(obj, key, default_value): return obj[key] if key in obj and obj[key] is not None else default_value -def get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step, lr_decay_mode, - cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, lc_periods, lc_alpha, lc_beta): - if lr_decay_mode == 'steps': - learning_rate = tf.train.piecewise_constant(global_step, - steps, lr_steps) - elif lr_decay_mode == 'poly' or lr_decay_mode == 'poly_cycle': - cycle = lr_decay_mode == 'poly_cycle' - learning_rate = tf.train.polynomial_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - end_learning_rate=0.00001, - power=2, - cycle=cycle) - elif lr_decay_mode == 'cosine_decay_restarts': - learning_rate = tf.train.cosine_decay_restarts(lr, - global_step - warmup_it, - (decay_steps - warmup_it) * cdr_first_decay_ratio, - t_mul=cdr_t_mul, - m_mul=cdr_m_mul, - alpha=cdr_alpha) - elif lr_decay_mode == 'cosine': - learning_rate = 
tf.train.cosine_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - alpha=0.0) - elif lr_decay_mode == 'linear_cosine': - learning_rate = tf.train.linear_cosine_decay(lr, - global_step - warmup_it, - decay_steps=decay_steps - warmup_it, - num_periods=lc_periods,#0.47, - alpha=lc_alpha,#0.0, - beta=lc_beta)#0.00001) +def get_lr( + lr, + steps, + lr_steps, + warmup_it, + decay_steps, + global_step, + lr_decay_mode, + cdr_first_decay_ratio, + cdr_t_mul, + cdr_m_mul, + cdr_alpha, + lc_periods, + lc_alpha, + lc_beta, +): + if lr_decay_mode == "steps": + learning_rate = tf.train.piecewise_constant(global_step, steps, lr_steps) + elif lr_decay_mode == "poly" or lr_decay_mode == "poly_cycle": + cycle = lr_decay_mode == "poly_cycle" + learning_rate = tf.train.polynomial_decay( + lr, + global_step - warmup_it, + decay_steps=decay_steps - warmup_it, + end_learning_rate=0.00001, + power=2, + cycle=cycle, + ) + elif lr_decay_mode == "cosine_decay_restarts": + learning_rate = tf.train.cosine_decay_restarts( + lr, + global_step - warmup_it, + (decay_steps - warmup_it) * cdr_first_decay_ratio, + t_mul=cdr_t_mul, + m_mul=cdr_m_mul, + alpha=cdr_alpha, + ) + elif lr_decay_mode == "cosine": + learning_rate = tf.train.cosine_decay( + lr, global_step - warmup_it, decay_steps=decay_steps - warmup_it, alpha=0.0 + ) + elif lr_decay_mode == "linear_cosine": + learning_rate = tf.train.linear_cosine_decay( + lr, + global_step - warmup_it, + decay_steps=decay_steps - warmup_it, + num_periods=lc_periods, # 0.47, + alpha=lc_alpha, # 0.0, + beta=lc_beta, + ) # 0.00001) else: - raise ValueError('Invalid type of lr_decay_mode') + raise ValueError("Invalid type of lr_decay_mode") return learning_rate def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr): from tensorflow.python.ops import math_ops + p = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32) diff = math_ops.subtract(warmup_end_lr, warmup_lr) res = math_ops.add(warmup_lr, 
math_ops.multiply(diff, p)) @@ -618,40 +721,40 @@ def warmup_decay(warmup_lr, global_step, warmup_steps, warmup_end_lr): def cnn_model_function(features, labels, mode, params): labels = tf.reshape(labels, (-1,)) # Squash unnecessary unary dim - lr = params['lr'] - lr_steps = params['lr_steps'] - steps = params['steps'] - use_larc = params['use_larc'] - leta = params['leta'] - lr_decay_mode = params['lr_decay_mode'] - decay_steps = params['decay_steps'] - cdr_first_decay_ratio = params['cdr_first_decay_ratio'] - cdr_t_mul = params['cdr_t_mul'] - cdr_m_mul = params['cdr_m_mul'] - cdr_alpha = params['cdr_alpha'] - lc_periods = params['lc_periods'] - lc_alpha = params['lc_alpha'] - lc_beta = params['lc_beta'] - - model_name = params['model'] - num_classes = params['n_classes'] - model_dtype = get_with_default(params, 'dtype', tf.float32) - model_format = get_with_default(params, 'format', 'channels_first') - device = get_with_default(params, 'device', '/gpu:0') + lr = params["lr"] + lr_steps = params["lr_steps"] + steps = params["steps"] + use_larc = params["use_larc"] + leta = params["leta"] + lr_decay_mode = params["lr_decay_mode"] + decay_steps = params["decay_steps"] + cdr_first_decay_ratio = params["cdr_first_decay_ratio"] + cdr_t_mul = params["cdr_t_mul"] + cdr_m_mul = params["cdr_m_mul"] + cdr_alpha = params["cdr_alpha"] + lc_periods = params["lc_periods"] + lc_alpha = params["lc_alpha"] + lc_beta = params["lc_beta"] + + model_name = params["model"] + num_classes = params["n_classes"] + model_dtype = get_with_default(params, "dtype", tf.float32) + model_format = get_with_default(params, "format", "channels_first") + device = get_with_default(params, "device", "/gpu:0") model_func = get_model_func(model_name) inputs = features # TODO: Should be using feature columns? 
- is_training = (mode == tf.estimator.ModeKeys.TRAIN) - momentum = params['mom'] - weight_decay = params['wdecay'] - warmup_lr = params['warmup_lr'] - warmup_it = params['warmup_it'] - loss_scale = params['loss_scale'] + is_training = mode == tf.estimator.ModeKeys.TRAIN + momentum = params["mom"] + weight_decay = params["wdecay"] + warmup_lr = params["warmup_lr"] + warmup_it = params["warmup_it"] + loss_scale = params["loss_scale"] - adv_bn_init = params['adv_bn_init'] - conv_init = params['conv_init'] + adv_bn_init = params["adv_bn_init"] + conv_init = params["conv_init"] if mode == tf.estimator.ModeKeys.TRAIN: - with tf.device('/cpu:0'): + with tf.device("/cpu:0"): preload_op, (inputs, labels) = stage([inputs, labels]) with tf.device(device): @@ -661,73 +764,87 @@ def cnn_model_function(features, labels, mode, params): imagenet_mean = np.array([121, 115, 100], dtype=np.float32) imagenet_std = np.array([70, 68, 71], dtype=np.float32) inputs = tf.subtract(inputs, imagenet_mean) - inputs = tf.multiply(inputs, 1. 
/ imagenet_std) - if model_format == 'channels_first': + inputs = tf.multiply(inputs, 1.0 / imagenet_std) + if model_format == "channels_first": inputs = tf.transpose(inputs, [0, 3, 1, 2]) - with fp32_trainable_vars( - regularizer=tf.contrib.layers.l2_regularizer(weight_decay)): + with fp32_trainable_vars(regularizer=tf.contrib.layers.l2_regularizer(weight_decay)): top_layer = model_func( - inputs, data_format=model_format, training=is_training, - conv_initializer=conv_init, adv_bn_init=adv_bn_init) - logits = tf.layers.dense(top_layer, num_classes, - kernel_initializer=tf.random_normal_initializer(stddev=0.01)) + inputs, + data_format=model_format, + training=is_training, + conv_initializer=conv_init, + adv_bn_init=adv_bn_init, + ) + logits = tf.layers.dense( + top_layer, num_classes, kernel_initializer=tf.random_normal_initializer(stddev=0.01) + ) predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32) logits = tf.cast(logits, tf.float32) if mode == tf.estimator.ModeKeys.PREDICT: probabilities = tf.softmax(logits) predictions = { - 'class_ids': predicted_classes[:, None], - 'probabilities': probabilities, - 'logits': logits + "class_ids": predicted_classes[:, None], + "probabilities": probabilities, + "logits": logits, } return tf.estimator.EstimatorSpec(mode, predictions=predictions) - loss = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=labels) - loss = tf.identity(loss, name='loss') # For access by logger (TODO: Better way to access it?) + loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels) + loss = tf.identity( + loss, name="loss" + ) # For access by logger (TODO: Better way to access it?) 
if mode == tf.estimator.ModeKeys.EVAL: with tf.device(None): # Allow fallback to CPU if no GPU support for these ops - accuracy = tf.metrics.accuracy( - labels=labels, predictions=predicted_classes) - top5acc = tf.metrics.mean( - tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)) + accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes) + top5acc = tf.metrics.mean(tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)) newaccuracy = (hvd.allreduce(accuracy[0]), accuracy[1]) newtop5acc = (hvd.allreduce(top5acc[0]), top5acc[1]) - metrics = {'val-top1acc': newaccuracy, 'val-top5acc': newtop5acc} - return tf.estimator.EstimatorSpec( - mode, loss=loss, eval_metric_ops=metrics) + metrics = {"val-top1acc": newaccuracy, "val-top5acc": newtop5acc} + return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics) - assert (mode == tf.estimator.ModeKeys.TRAIN) + assert mode == tf.estimator.ModeKeys.TRAIN reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - total_loss = tf.add_n([loss] + reg_losses, name='total_loss') + total_loss = tf.add_n([loss] + reg_losses, name="total_loss") batch_size = tf.shape(inputs)[0] global_step = tf.train.get_global_step() - with tf.device('/cpu:0'): # Allow fallback to CPU if no GPU support for these ops - learning_rate = tf.cond(global_step < warmup_it, - lambda: warmup_decay(warmup_lr, global_step, warmup_it, - lr), - lambda: get_lr(lr, steps, lr_steps, warmup_it, decay_steps, global_step, - lr_decay_mode, - cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, - lc_periods, lc_alpha, lc_beta)) - learning_rate = tf.identity(learning_rate, 'learning_rate') - tf.summary.scalar('learning_rate', learning_rate) - - opt = tf.train.MomentumOptimizer( - learning_rate, momentum, use_nesterov=True) + with tf.device("/cpu:0"): # Allow fallback to CPU if no GPU support for these ops + learning_rate = tf.cond( + global_step < warmup_it, + lambda: warmup_decay(warmup_lr, global_step, warmup_it, lr), + 
lambda: get_lr( + lr, + steps, + lr_steps, + warmup_it, + decay_steps, + global_step, + lr_decay_mode, + cdr_first_decay_ratio, + cdr_t_mul, + cdr_m_mul, + cdr_alpha, + lc_periods, + lc_alpha, + lc_beta, + ), + ) + learning_rate = tf.identity(learning_rate, "learning_rate") + tf.summary.scalar("learning_rate", learning_rate) + + opt = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True) opt = hvd.DistributedOptimizer(opt) if use_larc: opt = LarcOptimizer(opt, learning_rate, leta, clip=True) opt = MixedPrecisionOptimizer(opt, scale=loss_scale) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or [] with tf.control_dependencies(update_ops): - gate_gradients = (tf.train.Optimizer.GATE_NONE) + gate_gradients = tf.train.Optimizer.GATE_NONE train_op = opt.minimize( - total_loss, global_step=tf.train.get_global_step(), - gate_gradients=gate_gradients) + total_loss, global_step=tf.train.get_global_step(), gate_gradients=gate_gradients + ) train_op = tf.group(preload_op, gpucopy_op, train_op) # , update_ops) return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op) @@ -741,158 +858,234 @@ def count_records(tf_record_filename): return count nfile = len(filenames) - return (count_records(filenames[0]) * (nfile - 1) + - count_records(filenames[-1])) + return count_records(filenames[0]) * (nfile - 1) + count_records(filenames[-1]) def add_bool_argument(cmdline, shortname, longname=None, default=False, help=None): if longname is None: shortname, longname = None, shortname elif default == True: - raise ValueError("""Boolean arguments that are True by default should not have short names.""") + raise ValueError( + """Boolean arguments that are True by default should not have short names.""" + ) name = longname[2:] feature_parser = cmdline.add_mutually_exclusive_group(required=False) if shortname is not None: - feature_parser.add_argument(shortname, '--' + name, dest=name, action='store_true', help=help, default=default) + 
feature_parser.add_argument( + shortname, "--" + name, dest=name, action="store_true", help=help, default=default + ) else: - feature_parser.add_argument('--' + name, dest=name, action='store_true', help=help, default=default) - feature_parser.add_argument('--no' + name, dest=name, action='store_false') + feature_parser.add_argument( + "--" + name, dest=name, action="store_true", help=help, default=default + ) + feature_parser.add_argument("--no" + name, dest=name, action="store_false") return cmdline def add_cli_args(): - cmdline = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + cmdline = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Basic options - cmdline.add_argument('-m', '--model', default='resnet50', - help="""Name of model to run: resnet[18,34,50,101,152]""") - cmdline.add_argument('--data_dir', - help="""Path to dataset in TFRecord format + cmdline.add_argument( + "-m", + "--model", + default="resnet50", + help="""Name of model to run: resnet[18,34,50,101,152]""", + ) + cmdline.add_argument( + "--data_dir", + help="""Path to dataset in TFRecord format (aka Example protobufs). Files should be - named 'train-*' and 'validation-*'.""") - add_bool_argument(cmdline, '--synthetic', help="""Whether to use synthetic data for training""") - cmdline.add_argument('-b', '--batch_size', default=256, type=int, - help="""Size of each minibatch per GPU""") - cmdline.add_argument('--num_batches', type=int, - help="""Number of batches to run. - Ignored during eval or if num epochs given""") - cmdline.add_argument('--num_epochs', type=int, - help="""Number of epochs to run. - Overrides --num_batches. 
Ignored during eval.""") - cmdline.add_argument('--log_dir', default='imagenet_resnet', - help="""Directory in which to write training + named 'train-*' and 'validation-*'.""", + ) + add_bool_argument(cmdline, "--synthetic", help="""Whether to use synthetic data for training""") + cmdline.add_argument( + "-b", "--batch_size", default=256, type=int, help="""Size of each minibatch per GPU""" + ) + cmdline.add_argument( + "--num_batches", + type=int, + help="""Number of batches to run. + Ignored during eval or if num epochs given""", + ) + cmdline.add_argument( + "--num_epochs", + type=int, + help="""Number of epochs to run. + Overrides --num_batches. Ignored during eval.""", + ) + cmdline.add_argument( + "--log_dir", + default="imagenet_resnet", + help="""Directory in which to write training summaries and checkpoints. If the log directory already contains some checkpoints, it tries to resume training from the last saved checkpoint. Pass --clear_log if you - want to clear all checkpoints and start a fresh run""") - add_bool_argument(cmdline, '--clear_log', default=False, - help="""Clear the log folder passed so a fresh run can be started""") - cmdline.add_argument('--log_name', type=str, default='hvd_train.log') - add_bool_argument(cmdline, '--local_ckpt', - help="""Performs local checkpoints (i.e. one per node)""") - cmdline.add_argument('--display_every', default=50, type=int, - help="""How often (in iterations) to print out - running information.""") - add_bool_argument(cmdline, '--eval', - help="""Evaluate the top-1 and top-5 accuracy of + want to clear all checkpoints and start a fresh run""", + ) + add_bool_argument( + cmdline, + "--clear_log", + default=False, + help="""Clear the log folder passed so a fresh run can be started""", + ) + cmdline.add_argument("--log_name", type=str, default="hvd_train.log") + add_bool_argument( + cmdline, "--local_ckpt", help="""Performs local checkpoints (i.e. 
one per node)""" + ) + cmdline.add_argument( + "--display_every", + default=50, + type=int, + help="""How often (in iterations) to print out + running information.""", + ) + add_bool_argument( + cmdline, + "--eval", + help="""Evaluate the top-1 and top-5 accuracy of the latest checkpointed model. If you want to evaluate using multiple GPUs ensure that all processes have access to all checkpoints. Either if checkpoints were saved using --local_ckpt or they were saved to a shared directory which all processes - can access.""") - cmdline.add_argument('--eval_interval', type=int, - help="""Evaluate accuracy per eval_interval number of epochs""") - add_bool_argument(cmdline, '--fp16', default=True, - help="""Train using float16 (half) precision instead - of float32.""") - cmdline.add_argument('--num_gpus', default=1, type=int, - help="""Specify total number of GPUS used to train a checkpointed model during eval. - Used only to calculate epoch number to print during evaluation""") - - cmdline.add_argument('--save_checkpoints_steps', type=int, default=1000) - cmdline.add_argument('--save_summary_steps', type=int, default=0) - add_bool_argument(cmdline, '--adv_bn_init', default=True, - help="""init gamme of the last BN of each ResMod at 0.""") - add_bool_argument(cmdline, '--adv_conv_init', default=True, - help="""init conv with MSRA initializer""") - - cmdline.add_argument('--lr', type=float, - help="""Start learning rate""") - cmdline.add_argument('--mom', default=0.90, type=float, - help="""Momentum""") - cmdline.add_argument('--wdecay', default=0.0001, type=float, - help="""Weight decay""") - cmdline.add_argument('--loss_scale', default=1024., type=float, - help="""loss scale""") - cmdline.add_argument('--warmup_lr', default=0.001, type=float, - help="""Warmup starting from this learning rate""") - cmdline.add_argument('--warmup_epochs', default=0, type=int, - help="""Number of epochs in which to warmup to given lr""") - cmdline.add_argument('--lr_decay_steps', 
default='30,60,80', type=str, - help="""epoch numbers at which lr is decayed by lr_decay_lrs. - Used when lr_decay_mode is steps""") - cmdline.add_argument('--lr_decay_lrs', default='', type=str, - help="""learning rates at specific epochs""") - cmdline.add_argument('--lr_decay_mode', default='poly', - help="""Takes either `steps` (decay by a factor at specified steps) - or `poly`(polynomial_decay with degree 2)""") - - add_bool_argument(cmdline, '--use_larc', default=False, - help="""Use Layer wise Adaptive Rate Control which helps convergence at really large batch sizes""") - cmdline.add_argument('--leta', default=0.013, type=float, - help="""The trust coefficient for LARC optimization, LARC Eta""") - - cmdline.add_argument('--cdr_first_decay_ratio', default=0.33, type=float, - help="""Cosine Decay Restart First Deacy Steps ratio""") - cmdline.add_argument('--cdr_t_mul', default=2.0, type=float, - help="""Cosine Decay Restart t_mul""") - cmdline.add_argument('--cdr_m_mul', default=0.1, type=float, - help="""Cosine Decay Restart m_mul""") - cmdline.add_argument('--cdr_alpha', default=0.0, type=float, - help="""Cosine Decay Restart alpha""") - cmdline.add_argument('--lc_periods', default=0.47, type=float, - help="""Linear Cosine num of periods""") - cmdline.add_argument('--lc_alpha', default=0.0, type=float, - help="""linear Cosine alpha""") - cmdline.add_argument('--lc_beta', default=0.00001, type=float, - help="""Liner Cosine Beta""") - - add_bool_argument(cmdline, '--increased_aug', default=False, - help="""Increase augmentations helpful when training with large number of GPUs such as 128 or 256""") - cmdline.add_argument('--contrast', default=0.6, type=float, - help="""contrast factor""") - cmdline.add_argument('--saturation', default=0.6, type=float, - help="""saturation factor""") - cmdline.add_argument('--hue', default=0.13, type=float, - help="""hue max delta factor, hue delta = hue * math.pi""") - cmdline.add_argument('--brightness', default=0.3, 
type=float, - help="""Brightness factor""") + can access.""", + ) + cmdline.add_argument( + "--eval_interval", type=int, help="""Evaluate accuracy per eval_interval number of epochs""" + ) + add_bool_argument( + cmdline, + "--fp16", + default=True, + help="""Train using float16 (half) precision instead + of float32.""", + ) + cmdline.add_argument( + "--num_gpus", + default=1, + type=int, + help="""Specify total number of GPUS used to train a checkpointed model during eval. + Used only to calculate epoch number to print during evaluation""", + ) + + cmdline.add_argument("--save_checkpoints_steps", type=int, default=1000) + cmdline.add_argument("--save_summary_steps", type=int, default=0) + add_bool_argument( + cmdline, + "--adv_bn_init", + default=True, + help="""init gamme of the last BN of each ResMod at 0.""", + ) + add_bool_argument( + cmdline, "--adv_conv_init", default=True, help="""init conv with MSRA initializer""" + ) + + cmdline.add_argument("--lr", type=float, help="""Start learning rate""") + cmdline.add_argument("--mom", default=0.90, type=float, help="""Momentum""") + cmdline.add_argument("--wdecay", default=0.0001, type=float, help="""Weight decay""") + cmdline.add_argument("--loss_scale", default=1024.0, type=float, help="""loss scale""") + cmdline.add_argument( + "--warmup_lr", default=0.001, type=float, help="""Warmup starting from this learning rate""" + ) + cmdline.add_argument( + "--warmup_epochs", + default=0, + type=int, + help="""Number of epochs in which to warmup to given lr""", + ) + cmdline.add_argument( + "--lr_decay_steps", + default="30,60,80", + type=str, + help="""epoch numbers at which lr is decayed by lr_decay_lrs. 
+ Used when lr_decay_mode is steps""", + ) + cmdline.add_argument( + "--lr_decay_lrs", default="", type=str, help="""learning rates at specific epochs""" + ) + cmdline.add_argument( + "--lr_decay_mode", + default="poly", + help="""Takes either `steps` (decay by a factor at specified steps) + or `poly`(polynomial_decay with degree 2)""", + ) + + add_bool_argument( + cmdline, + "--use_larc", + default=False, + help="""Use Layer wise Adaptive Rate Control which helps convergence at really large batch sizes""", + ) + cmdline.add_argument( + "--leta", + default=0.013, + type=float, + help="""The trust coefficient for LARC optimization, LARC Eta""", + ) + + cmdline.add_argument( + "--cdr_first_decay_ratio", + default=0.33, + type=float, + help="""Cosine Decay Restart First Deacy Steps ratio""", + ) + cmdline.add_argument( + "--cdr_t_mul", default=2.0, type=float, help="""Cosine Decay Restart t_mul""" + ) + cmdline.add_argument( + "--cdr_m_mul", default=0.1, type=float, help="""Cosine Decay Restart m_mul""" + ) + cmdline.add_argument( + "--cdr_alpha", default=0.0, type=float, help="""Cosine Decay Restart alpha""" + ) + cmdline.add_argument( + "--lc_periods", default=0.47, type=float, help="""Linear Cosine num of periods""" + ) + cmdline.add_argument("--lc_alpha", default=0.0, type=float, help="""linear Cosine alpha""") + cmdline.add_argument("--lc_beta", default=0.00001, type=float, help="""Liner Cosine Beta""") + + add_bool_argument( + cmdline, + "--increased_aug", + default=False, + help="""Increase augmentations helpful when training with large number of GPUs such as 128 or 256""", + ) + cmdline.add_argument("--contrast", default=0.6, type=float, help="""contrast factor""") + cmdline.add_argument("--saturation", default=0.6, type=float, help="""saturation factor""") + cmdline.add_argument( + "--hue", + default=0.13, + type=float, + help="""hue max delta factor, hue delta = hue * math.pi""", + ) + cmdline.add_argument("--brightness", default=0.3, type=float, 
help="""Brightness factor""") return cmdline def sort_and_load_ckpts(log_dir): ckpts = [] for f in os.listdir(log_dir): - m = re.match(r'model.ckpt-([0-9]+).index', f) + m = re.match(r"model.ckpt-([0-9]+).index", f) if m is None: continue fullpath = os.path.join(log_dir, f) - ckpts.append({'step': int(m.group(1)), - 'path': os.path.splitext(fullpath)[0], - 'mtime': os.stat(fullpath).st_mtime, - }) - ckpts.sort(key=itemgetter('step')) + ckpts.append( + { + "step": int(m.group(1)), + "path": os.path.splitext(fullpath)[0], + "mtime": os.stat(fullpath).st_mtime, + } + ) + ckpts.sort(key=itemgetter("step")) return ckpts def main(): gpu_thread_count = 2 - os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' - os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) - os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' - os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' + os.environ["TF_GPU_THREAD_MODE"] = "gpu_private" + os.environ["TF_GPU_THREAD_COUNT"] = str(gpu_thread_count) + os.environ["TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT"] = "1" + os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1" hvd.init() - config = tf.ConfigProto() config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.force_gpu_compatible = True # Force pinned memory @@ -914,7 +1107,7 @@ def main(): FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir if FLAGS.eval: - FLAGS.log_name = 'eval_' + FLAGS.log_name + FLAGS.log_name = "eval_" + FLAGS.log_name if hvd.rank() != 0: return if FLAGS.local_ckpt: @@ -930,7 +1123,7 @@ def main(): os.makedirs(FLAGS.log_dir) barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) tf.Session(config=config).run(barrier) - + logger = logging.getLogger(FLAGS.log_name) logger.setLevel(logging.INFO) # INFO, ERROR # file handler which logs debug messages @@ -939,7 +1132,7 @@ def main(): ch.setLevel(logging.INFO) # add formatter to the handlers # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - 
formatter = logging.Formatter('%(message)s') + formatter = logging.Formatter("%(message)s") ch.setFormatter(formatter) logger.addHandler(ch) if not hvd.rank(): @@ -948,23 +1141,25 @@ def main(): fh.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) - + height, width = 224, 224 global_batch_size = FLAGS.batch_size * hvd.size() - rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__)) + rank0log(logger, "PY" + str(sys.version) + "TF" + str(tf.__version__)) rank0log(logger, "Horovod size: ", hvd.size()) if FLAGS.data_dir: - filename_pattern = os.path.join(FLAGS.data_dir, '%s-*') - train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train')) - eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation')) + filename_pattern = os.path.join(FLAGS.data_dir, "%s-*") + train_filenames = sorted(tf.gfile.Glob(filename_pattern % "train")) + eval_filenames = sorted(tf.gfile.Glob(filename_pattern % "validation")) num_training_samples = get_num_records(train_filenames) rank0log(logger, "Using data from: ", FLAGS.data_dir) if not FLAGS.eval: - rank0log(logger, 'Found ', num_training_samples, ' training samples') + rank0log(logger, "Found ", num_training_samples, " training samples") else: if not FLAGS.synthetic: - raise ValueError('data_dir missing. Please pass --synthetic if you want to run on synthetic data. Else please pass --data_dir') + raise ValueError( + "data_dir missing. Please pass --synthetic if you want to run on synthetic data. 
Else please pass --data_dir" + ) train_filenames = eval_filenames = [] num_training_samples = 1281167 training_samples_per_rank = num_training_samples // hvd.size() @@ -979,9 +1174,9 @@ def main(): nstep_per_epoch = num_training_samples // global_batch_size decay_steps = nstep - if FLAGS.lr_decay_mode == 'steps': - steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',')] - lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(',')] + if FLAGS.lr_decay_mode == "steps": + steps = [int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(",")] + lr_steps = [float(x) for x in FLAGS.lr_decay_lrs.split(",")] else: steps = [] lr_steps = [] @@ -997,11 +1192,11 @@ def main(): if not FLAGS.save_summary_steps: # default to save one checkpoint per epoch FLAGS.save_summary_steps = nstep_per_epoch - + if not FLAGS.eval: - rank0log(logger, 'Using a learning rate of ', FLAGS.lr) - rank0log(logger, 'Checkpointing every ' + str(FLAGS.save_checkpoints_steps) + ' steps') - rank0log(logger, 'Saving summary every ' + str(FLAGS.save_summary_steps) + ' steps') + rank0log(logger, "Using a learning rate of ", FLAGS.lr) + rank0log(logger, "Checkpointing every " + str(FLAGS.save_checkpoints_steps) + " steps") + rank0log(logger, "Saving summary every " + str(FLAGS.save_summary_steps) + " steps") warmup_it = nstep_per_epoch * FLAGS.warmup_epochs @@ -1009,62 +1204,74 @@ def main(): model_fn=cnn_model_function, model_dir=FLAGS.log_dir, params={ - 'model': FLAGS.model, - 'decay_steps': decay_steps, - 'n_classes': 1000, - 'dtype': tf.float16 if FLAGS.fp16 else tf.float32, - 'format': 'channels_first', - 'device': '/gpu:0', - 'lr': FLAGS.lr, - 'mom': FLAGS.mom, - 'wdecay': FLAGS.wdecay, - 'use_larc': FLAGS.use_larc, - 'leta': FLAGS.leta, - 'steps': steps, - 'lr_steps': lr_steps, - 'lr_decay_mode': FLAGS.lr_decay_mode, - 'warmup_it': warmup_it, - 'warmup_lr': FLAGS.warmup_lr, - 'cdr_first_decay_ratio': FLAGS.cdr_first_decay_ratio, - 'cdr_t_mul': FLAGS.cdr_t_mul, - 'cdr_m_mul': 
FLAGS.cdr_m_mul, - 'cdr_alpha': FLAGS.cdr_alpha, - 'lc_periods': FLAGS.lc_periods, - 'lc_alpha': FLAGS.lc_alpha, - 'lc_beta': FLAGS.lc_beta, - 'loss_scale': FLAGS.loss_scale, - 'adv_bn_init': FLAGS.adv_bn_init, - 'conv_init': tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None + "model": FLAGS.model, + "decay_steps": decay_steps, + "n_classes": 1000, + "dtype": tf.float16 if FLAGS.fp16 else tf.float32, + "format": "channels_first", + "device": "/gpu:0", + "lr": FLAGS.lr, + "mom": FLAGS.mom, + "wdecay": FLAGS.wdecay, + "use_larc": FLAGS.use_larc, + "leta": FLAGS.leta, + "steps": steps, + "lr_steps": lr_steps, + "lr_decay_mode": FLAGS.lr_decay_mode, + "warmup_it": warmup_it, + "warmup_lr": FLAGS.warmup_lr, + "cdr_first_decay_ratio": FLAGS.cdr_first_decay_ratio, + "cdr_t_mul": FLAGS.cdr_t_mul, + "cdr_m_mul": FLAGS.cdr_m_mul, + "cdr_alpha": FLAGS.cdr_alpha, + "lc_periods": FLAGS.lc_periods, + "lc_alpha": FLAGS.lc_alpha, + "lc_beta": FLAGS.lc_beta, + "loss_scale": FLAGS.loss_scale, + "adv_bn_init": FLAGS.adv_bn_init, + "conv_init": tf.variance_scaling_initializer() if FLAGS.adv_conv_init else None, }, config=tf.estimator.RunConfig( # tf_random_seed=31 * (1 + hvd.rank()), session_config=config, save_summary_steps=FLAGS.save_summary_steps if do_checkpoint else None, save_checkpoints_steps=FLAGS.save_checkpoints_steps if do_checkpoint else None, - keep_checkpoint_max=None)) + keep_checkpoint_max=None, + ), + ) if not FLAGS.eval: num_preproc_threads = 5 rank0log(logger, "Using preprocessing threads per GPU: ", num_preproc_threads) - training_hooks = [hvd.BroadcastGlobalVariablesHook(0), - PrefillStagingAreasHook()] + training_hooks = [hvd.BroadcastGlobalVariablesHook(0), PrefillStagingAreasHook()] if hvd.rank() == 0: training_hooks.append( - LogSessionRunHook(global_batch_size, - num_training_samples, - FLAGS.display_every, logger)) + LogSessionRunHook( + global_batch_size, num_training_samples, FLAGS.display_every, logger + ) + ) try: start_time = 
time.time() classifier.train( input_fn=lambda: make_dataset( train_filenames, training_samples_per_rank, - FLAGS.batch_size, height, width, - FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, - training=True, num_threads=num_preproc_threads, - shard=True, synthetic=FLAGS.synthetic, increased_aug=FLAGS.increased_aug), + FLAGS.batch_size, + height, + width, + FLAGS.brightness, + FLAGS.contrast, + FLAGS.saturation, + FLAGS.hue, + training=True, + num_threads=num_preproc_threads, + shard=True, + synthetic=FLAGS.synthetic, + increased_aug=FLAGS.increased_aug, + ), max_steps=nstep, - hooks=training_hooks) + hooks=training_hooks, + ) rank0log(logger, "Finished in ", time.time() - start_time) except KeyboardInterrupt: print("Keyboard interrupt") @@ -1075,45 +1282,62 @@ def main(): tf.Session(config=config).run(barrier) time.sleep(5) # a little extra margin... if FLAGS.num_gpus == 1: - rank0log(logger, """If you are evaluating checkpoints of a multi-GPU run on a single GPU, + rank0log( + logger, + """If you are evaluating checkpoints of a multi-GPU run on a single GPU, ensure you set --num_gpus to the number of GPUs it was trained on. 
- This will ensure that the epoch number is accurately displayed in the below logs.""") + This will ensure that the epoch number is accurately displayed in the below logs.""", + ) try: ckpts = sort_and_load_ckpts(FLAGS.log_dir) for i, c in enumerate(ckpts): if i < len(ckpts) - 1: - if (not FLAGS.eval_interval) or \ - (i % FLAGS.eval_interval != 0): + if (not FLAGS.eval_interval) or (i % FLAGS.eval_interval != 0): continue eval_result = classifier.evaluate( input_fn=lambda: make_dataset( eval_filenames, - get_num_records(eval_filenames), FLAGS.batch_size, - height, width, - FLAGS.brightness, FLAGS.contrast, FLAGS.saturation, FLAGS.hue, - training=False, shard=True, increased_aug=False), - checkpoint_path=c['path']) - c['epoch'] = math.ceil(c['step'] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus))) - c['top1'] = eval_result['val-top1acc'] - c['top5'] = eval_result['val-top5acc'] - c['loss'] = eval_result['loss'] - rank0log(logger, ' step epoch top1 top5 loss checkpoint_time(UTC)') + get_num_records(eval_filenames), + FLAGS.batch_size, + height, + width, + FLAGS.brightness, + FLAGS.contrast, + FLAGS.saturation, + FLAGS.hue, + training=False, + shard=True, + increased_aug=False, + ), + checkpoint_path=c["path"], + ) + c["epoch"] = math.ceil( + c["step"] / (num_training_samples / (FLAGS.batch_size * FLAGS.num_gpus)) + ) + c["top1"] = eval_result["val-top1acc"] + c["top5"] = eval_result["val-top5acc"] + c["loss"] = eval_result["loss"] + rank0log(logger, " step epoch top1 top5 loss checkpoint_time(UTC)") barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32)) for i, c in enumerate(ckpts): tf.Session(config=config).run(barrier) - if 'top1' not in c: + if "top1" not in c: continue - rank0log(logger,'{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}' - .format(c['step'], - c['epoch'], - c['top1'] * 100, - c['top5'] * 100, - c['loss'], - time=time.strftime('%Y-%m-%d %H:%M:%S', - time.localtime(c['mtime'])))) + rank0log( + logger, + "{:5d} {:5.1f} {:5.3f} {:6.2f} 
{:6.2f} {time}".format( + c["step"], + c["epoch"], + c["top1"] * 100, + c["top5"] * 100, + c["loss"], + time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(c["mtime"])), + ), + ) rank0log(logger, "Finished evaluation") except KeyboardInterrupt: logger.error("Keyboard interrupt") -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/benchmarks/tf_benchmarks/execute_tensorflow_training.py b/benchmarks/tf_benchmarks/execute_tensorflow_training.py index b4f15304..88380a97 100755 --- a/benchmarks/tf_benchmarks/execute_tensorflow_training.py +++ b/benchmarks/tf_benchmarks/execute_tensorflow_training.py @@ -26,13 +26,13 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) _DEFAULT_HYPERPARAMETERS = { - 'batch_size': 32, - 'model': 'resnet32', - 'num_epochs': 10, - 'data_format': 'NHWC', - 'summary_verbosity': 1, - 'save_summaries_steps': 10, - 'data_name': 'cifar10' + "batch_size": 32, + "model": "resnet32", + "num_epochs": 10, + "data_format": "NHWC", + "summary_verbosity": 1, + "save_summaries_steps": 10, + "data_name": "cifar10", } @@ -44,67 +44,73 @@ class ScriptModeTensorFlow(Framework): create_model = TensorFlow.create_model - def __init__(self, py_version='py3', **kwargs): + def __init__(self, py_version="py3", **kwargs): super(ScriptModeTensorFlow, self).__init__(**kwargs) self.py_version = py_version self.image_name = None - self.framework_version = '1.10.0' + self.framework_version = "1.10.0" def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('-t', '--instance-types', nargs='+', help=' Set flag', required=True) - parser.add_argument('-r', '--role', required=True) - parser.add_argument('-w', '--wait', action='store_true') - parser.add_argument('--region', default='us-west-2') - parser.add_argument('--py-versions', nargs='+', help=' Set flag', default=['py3']) - parser.add_argument('--checkpoint-path', - default=os.path.join(default_bucket(), 'benchmarks', 'checkpoints'), - help='The S3 location where the model 
checkpoints and tensorboard events are saved after training') + parser.add_argument( + "-t", "--instance-types", nargs="+", help=" Set flag", required=True + ) + parser.add_argument("-r", "--role", required=True) + parser.add_argument("-w", "--wait", action="store_true") + parser.add_argument("--region", default="us-west-2") + parser.add_argument("--py-versions", nargs="+", help=" Set flag", default=["py3"]) + parser.add_argument( + "--checkpoint-path", + default=os.path.join(default_bucket(), "benchmarks", "checkpoints"), + help="The S3 location where the model checkpoints and tensorboard events are saved after training", + ) return parser.parse_known_args() def main(args, script_args): for instance_type, py_version in itertools.product(args.instance_types, args.py_versions): - base_name = '%s-%s-%s' % (py_version, instance_type[3:5], instance_type[6:]) + base_name = "%s-%s-%s" % (py_version, instance_type[3:5], instance_type[6:]) model_dir = os.path.join(args.checkpoint_path, base_name) job_hps = create_hyperparameters(model_dir, script_args) - print('hyperparameters:') + print("hyperparameters:") print(job_hps) estimator = ScriptModeTensorFlow( - entry_point='tf_cnn_benchmarks.py', - role='SageMakerRole', - source_dir=os.path.join(dir_path, 'tf_cnn_benchmarks'), + entry_point="tf_cnn_benchmarks.py", + role="SageMakerRole", + source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"), base_job_name=base_name, train_instance_count=1, hyperparameters=job_hps, train_instance_type=instance_type, ) - input_dir = 's3://sagemaker-sample-data-%s/spark/mnist/train/' % args.region - estimator.fit({'train': input_dir}, wait=args.wait) + input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region + estimator.fit({"train": input_dir}, wait=args.wait) print("To use TensorBoard, execute the following command:") - cmd = 'S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s' + cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 
AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s" print(cmd % (args.region, args.checkpoint_path)) def create_hyperparameters(model_dir, script_args): job_hps = _DEFAULT_HYPERPARAMETERS.copy() - job_hps.update({'train_dir': model_dir, 'eval_dir': model_dir}) + job_hps.update({"train_dir": model_dir, "eval_dir": model_dir}) - script_arg_keys_without_dashes = [key[2:] if key.startswith('--') else key[1:] for key in script_args[::2]] + script_arg_keys_without_dashes = [ + key[2:] if key.startswith("--") else key[1:] for key in script_args[::2] + ] script_arg_values = script_args[1::2] job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values))) return job_hps -if __name__ == '__main__': +if __name__ == "__main__": args, script_args = get_args() - main(args, script_args) \ No newline at end of file + main(args, script_args) diff --git a/scripts/build_all.py b/scripts/build_all.py index 9cd23fc0..7c7fc220 100644 --- a/scripts/build_all.py +++ b/scripts/build_all.py @@ -16,70 +16,88 @@ import os import subprocess -VERSION = '1.13.1' -REPO = 'sagemaker-tensorflow-scriptmode' -PY2_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa -PY3_CPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa -PY2_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl' # noqa -PY3_GPU_BINARY = 'https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl' # noqa -DEV_ACCOUNT = '142577830533' -REGION = 'us-west-2' +VERSION = "1.13.1" +REPO = "sagemaker-tensorflow-scriptmode" +PY2_CPU_BINARY = 
"https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa +PY3_CPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa +PY2_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa +PY3_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa +DEV_ACCOUNT = "142577830533" +REGION = "us-west-2" def _parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--account', type=str, default=DEV_ACCOUNT) - parser.add_argument('--region', type=str, default=REGION) - parser.add_argument('--version', type=str, default=VERSION) - parser.add_argument('--py2-cpu-binary', type=str, default=PY2_CPU_BINARY) - parser.add_argument('--py3-cpu-binary', type=str, default=PY3_CPU_BINARY) - parser.add_argument('--py2-gpu-binary', type=str, default=PY2_GPU_BINARY) - parser.add_argument('--py3-gpu-binary', type=str, default=PY3_GPU_BINARY) - parser.add_argument('--repo', type=str, default=REPO) + parser.add_argument("--account", type=str, default=DEV_ACCOUNT) + parser.add_argument("--region", type=str, default=REGION) + parser.add_argument("--version", type=str, default=VERSION) + parser.add_argument("--py2-cpu-binary", type=str, default=PY2_CPU_BINARY) + parser.add_argument("--py3-cpu-binary", type=str, default=PY3_CPU_BINARY) + parser.add_argument("--py2-gpu-binary", type=str, default=PY2_GPU_BINARY) + parser.add_argument("--py3-gpu-binary", type=str, default=PY3_GPU_BINARY) + parser.add_argument("--repo", type=str, default=REPO) return parser.parse_args() args = _parse_args() binaries = { - 'py2-cpu': args.py2_cpu_binary, - 'py3-cpu': args.py3_cpu_binary, - 
'py2-gpu': args.py2_gpu_binary, - 'py3-gpu': args.py3_gpu_binary + "py2-cpu": args.py2_cpu_binary, + "py3-cpu": args.py3_cpu_binary, + "py2-gpu": args.py2_gpu_binary, + "py3-gpu": args.py3_gpu_binary, } -build_dir = os.path.join('docker', args.version) +build_dir = os.path.join("docker", args.version) # Run docker-login so we can pull the cached image login_cmd = subprocess.check_output( - 'aws ecr get-login --no-include-email --registry-id {}'.format(args.account).split()) -print('Executing docker login command: {}'.format(login_cmd)) + "aws ecr get-login --no-include-email --registry-id {}".format(args.account).split() +) +print("Executing docker login command: {}".format(login_cmd)) subprocess.check_call(login_cmd.split()) -for arch in ['cpu', 'gpu']: - for py_version in ['2', '3']: +for arch in ["cpu", "gpu"]: + for py_version in ["2", "3"]: - binary_url = binaries['py{}-{}'.format(py_version, arch)] + binary_url = binaries["py{}-{}".format(py_version, arch)] binary_file = os.path.basename(binary_url) - cmd = 'wget -O {}/{} {}'.format(build_dir, binary_file, binary_url) - print('Downloading binary file: {}'.format(cmd)) + cmd = "wget -O {}/{} {}".format(build_dir, binary_file, binary_url) + print("Downloading binary file: {}".format(cmd)) subprocess.check_call(cmd.split()) - tag = '{}-{}-py{}'.format(args.version, arch, py_version) - prev_image_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(args.account, args.region, args.repo, tag) - dockerfile = os.path.join(build_dir, 'Dockerfile.{}'.format(arch)) - - tar_file_name = subprocess.check_output('ls {}/sagemaker_tensorflow_container*'.format(build_dir), - shell=True).strip().decode('ascii') - print('framework_support_installable is {}'.format(os.path.basename(tar_file_name))) - - build_cmd = 'docker build -f {} --cache-from {} --build-arg framework_support_installable={} ' \ - '--build-arg py_version={} --build-arg framework_installable={} ' \ - '-t {}:{} {}'.format(dockerfile, prev_image_uri, 
os.path.basename(tar_file_name), py_version, - binary_file, args.repo, tag, build_dir) - print('Building docker image: {}'.format(build_cmd)) + tag = "{}-{}-py{}".format(args.version, arch, py_version) + prev_image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format( + args.account, args.region, args.repo, tag + ) + dockerfile = os.path.join(build_dir, "Dockerfile.{}".format(arch)) + + tar_file_name = ( + subprocess.check_output( + "ls {}/sagemaker_tensorflow_container*".format(build_dir), shell=True + ) + .strip() + .decode("ascii") + ) + print("framework_support_installable is {}".format(os.path.basename(tar_file_name))) + + build_cmd = ( + "docker build -f {} --cache-from {} --build-arg framework_support_installable={} " + "--build-arg py_version={} --build-arg framework_installable={} " + "-t {}:{} {}".format( + dockerfile, + prev_image_uri, + os.path.basename(tar_file_name), + py_version, + binary_file, + args.repo, + tag, + build_dir, + ) + ) + print("Building docker image: {}".format(build_cmd)) subprocess.check_call(build_cmd.split()) - print('Deleting binary file {}'.format(binary_file)) - subprocess.check_call('rm {}'.format(os.path.join(build_dir, binary_file)).split()) + print("Deleting binary file {}".format(binary_file)) + subprocess.check_call("rm {}".format(os.path.join(build_dir, binary_file)).split()) diff --git a/scripts/publish_all.py b/scripts/publish_all.py index 2c78e8a7..9d8498cd 100644 --- a/scripts/publish_all.py +++ b/scripts/publish_all.py @@ -15,38 +15,40 @@ import argparse import subprocess -DEV_ACCOUNT = '142577830533' -VERSION = '1.13.1' -REGION = 'us-west-2' -REPO = 'sagemaker-tensorflow-scriptmode' +DEV_ACCOUNT = "142577830533" +VERSION = "1.13.1" +REGION = "us-west-2" +REPO = "sagemaker-tensorflow-scriptmode" def _parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--account', type=str, default=DEV_ACCOUNT) - parser.add_argument('--version', type=str, default=VERSION) - parser.add_argument('--repo', type=str, 
default=REPO) - parser.add_argument('--region', type=str, default=REGION) + parser.add_argument("--account", type=str, default=DEV_ACCOUNT) + parser.add_argument("--version", type=str, default=VERSION) + parser.add_argument("--repo", type=str, default=REPO) + parser.add_argument("--region", type=str, default=REGION) return parser.parse_args() args = _parse_args() -for arch in ['cpu', 'gpu']: - for py_version in ['2', '3']: - source = '{}:{}-{}-py{}'.format(args.repo, args.version, arch, py_version) - dest = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(args.account, args.region, source) - tag_cmd = 'docker tag {} {}'.format(source, dest) - print('Tagging image: {}'.format(tag_cmd)) +for arch in ["cpu", "gpu"]: + for py_version in ["2", "3"]: + source = "{}:{}-{}-py{}".format(args.repo, args.version, arch, py_version) + dest = "{}.dkr.ecr.{}.amazonaws.com/{}".format(args.account, args.region, source) + tag_cmd = "docker tag {} {}".format(source, dest) + print("Tagging image: {}".format(tag_cmd)) subprocess.check_call(tag_cmd.split()) login_cmd = subprocess.check_output( - 'aws ecr get-login --no-include-email --registry-id {} --region {}' - .format(args.account, args.region).split()) - print('Executing docker login command: {}'.format(login_cmd)) + "aws ecr get-login --no-include-email --registry-id {} --region {}".format( + args.account, args.region + ).split() + ) + print("Executing docker login command: {}".format(login_cmd)) subprocess.check_call(login_cmd.split()) - push_cmd = 'docker push {}'.format(dest) - print('Pushing image: {}'.format(push_cmd)) + push_cmd = "docker push {}".format(dest) + print("Pushing image: {}".format(push_cmd)) subprocess.check_call(push_cmd.split()) diff --git a/setup.py b/setup.py index c99f772a..cd7af9fa 100644 --- a/setup.py +++ b/setup.py @@ -26,47 +26,60 @@ def read(fname): def read_version(): - return read('VERSION').strip() + return read("VERSION").strip() -test_dependencies = ['tox', 'flake8', 'pytest', 'pytest-cov', 
'pytest-xdist', 'mock', - 'sagemaker==1.51.3', 'tensorflow<2.4', 'docker-compose', 'boto3==1.10.50', - 'six==1.13.0', 'python-dateutil>=2.1,<2.8.1', 'botocore==1.13.50', - 'requests-mock', 'awscli==1.16.314'] +test_dependencies = [ + "tox", + "flake8", + "pytest", + "pytest-cov", + "pytest-xdist", + "mock", + "sagemaker==1.51.3", + "tensorflow<2.4", + "docker-compose", + "boto3==1.10.50", + "six==1.13.0", + "python-dateutil>=2.1,<2.8.1", + "botocore==1.13.50", + "requests-mock", + "awscli==1.16.314", +] if sys.version_info.major > 2: - test_dependencies.append('sagemaker-experiments==0.1.7') + test_dependencies.append("sagemaker-experiments==0.1.7") setup( - name='sagemaker_tensorflow_training', + name="sagemaker_tensorflow_training", version=read_version(), - description='Open source library for creating ' - 'TensorFlow containers to run on Amazon SageMaker.', - - packages=find_packages(where='src', exclude=('test',)), - package_dir={'': 'src'}, - py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], - - long_description=read('README.rst'), - author='Amazon Web Services', - url='https://github.com/aws/sagemaker-tensorflow-containers', - license='Apache License 2.0', - + description="Open source library for creating " + "TensorFlow containers to run on Amazon SageMaker.", + packages=find_packages(where="src", exclude=("test",)), + package_dir={"": "src"}, + py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], + long_description=read("README.rst"), + author="Amazon Web Services", + url="https://github.com/aws/sagemaker-tensorflow-containers", + license="Apache License 2.0", classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Natural Language :: English", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', + "Programming 
Language :: Python :: 2.7", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", ], - - install_requires=['sagemaker-training>=3.5.2', 'numpy', 'scipy', 'sklearn', - 'pandas', 'Pillow', 'h5py'], - extras_require={ - 'test': test_dependencies, - 'benchmark': ['click'], - }, + install_requires=[ + "sagemaker-training>=3.5.2", + "numpy", + "scipy", + "sklearn", + "pandas", + "Pillow", + "h5py", + ], + extras_require={"test": test_dependencies, "benchmark": ["click"], }, ) diff --git a/src/sagemaker_tensorflow_container/deep_learning_container.py b/src/sagemaker_tensorflow_container/deep_learning_container.py index 6dd437e1..965fb1ac 100644 --- a/src/sagemaker_tensorflow_container/deep_learning_container.py +++ b/src/sagemaker_tensorflow_container/deep_learning_container.py @@ -51,10 +51,24 @@ def _retrieve_instance_region(): Retrieve instance region from instance metadata service """ region = None - valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2', - 'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1', - 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', - 'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] + valid_regions = [ + "ap-northeast-1", + "ap-northeast-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-south-1", + "ca-central-1", + "eu-central-1", + "eu-north-1", + "eu-west-1", + "eu-west-2", + "eu-west-3", + "sa-east-1", + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", + ] url = "http://169.254.169.254/latest/dynamic/instance-identity/document" response = requests_helper(url, timeout=0.1) @@ -62,8 +76,8 @@ def _retrieve_instance_region(): if response is not None: response_json = json.loads(response.text) - if response_json['region'] in valid_regions: - region = response_json['region'] + if response_json["region"] in valid_regions: + region = response_json["region"] return region @@ -77,8 +91,9 @@ def query_bucket(): region = _retrieve_instance_region() if instance_id 
is not None and region is not None: - url = "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com/dlc-containers.txt?x-instance-id={1}"\ - .format(region, instance_id) + url = "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com/dlc-containers.txt?x-instance-id={1}".format( # noqa: E501 + region, instance_id + ) response = requests_helper(url, timeout=0.2) logging.debug("Query bucket finished: {}".format(response)) @@ -107,5 +122,5 @@ def main(): query_bucket() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/sagemaker_tensorflow_container/s3_utils.py b/src/sagemaker_tensorflow_container/s3_utils.py index 0137ef25..15902c55 100644 --- a/src/sagemaker_tensorflow_container/s3_utils.py +++ b/src/sagemaker_tensorflow_container/s3_utils.py @@ -20,23 +20,23 @@ def configure(model_dir, job_region): - os.environ['S3_REGION'] = _s3_region(job_region, model_dir) + os.environ["S3_REGION"] = _s3_region(job_region, model_dir) # setting log level to WARNING - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' - os.environ['S3_USE_HTTPS'] = '1' + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" + os.environ["S3_USE_HTTPS"] = "1" def _s3_region(job_region, model_dir): - if model_dir and model_dir.startswith('s3://'): - s3 = boto3.client('s3', region_name=job_region) + if model_dir and model_dir.startswith("s3://"): + s3 = boto3.client("s3", region_name=job_region) # We get the AWS region of the checkpoint bucket, which may be different from # the region this container is currently running in. 
parsed_url = urlparse(model_dir) bucket_name = parsed_url.netloc - bucket_location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint'] + bucket_location = s3.get_bucket_location(Bucket=bucket_name)["LocationConstraint"] return bucket_location or job_region else: diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py index bc41b6b2..a1bf5fef 100644 --- a/src/sagemaker_tensorflow_container/training.py +++ b/src/sagemaker_tensorflow_container/training.py @@ -26,8 +26,8 @@ logger = logging.getLogger(__name__) -SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled' -MODEL_DIR = '/opt/ml/model' +SAGEMAKER_PARAMETER_SERVER_ENABLED = "sagemaker_parameter_server_enabled" +MODEL_DIR = "/opt/ml/model" def _is_host_master(hosts, current_host): @@ -56,50 +56,46 @@ def _build_tf_config(hosts, current_host, ps_task=False): ps = hosts if len(hosts) > 1 else None def host_addresses(hosts, port=2222): - return ['{}:{}'.format(host, port) for host in hosts] + return ["{}:{}".format(host, port) for host in hosts] - tf_config = { - 'cluster': { - 'master': host_addresses(masters) - }, - 'environment': 'cloud' - } + tf_config = {"cluster": {"master": host_addresses(masters)}, "environment": "cloud"} if ps: - tf_config['cluster']['ps'] = host_addresses(ps, port='2223') + tf_config["cluster"]["ps"] = host_addresses(ps, port="2223") if workers: - tf_config['cluster']['worker'] = host_addresses(workers) + tf_config["cluster"]["worker"] = host_addresses(workers) if ps_task: if ps is None: raise ValueError( - 'Cannot have a ps task if there are no parameter servers in the cluster') - task_type = 'ps' + "Cannot have a ps task if there are no parameter servers in the cluster" + ) + task_type = "ps" task_index = ps.index(current_host) elif _is_host_master(hosts, current_host): - task_type = 'master' + task_type = "master" task_index = 0 else: - task_type = 'worker' + task_type = "worker" task_index = 
workers.index(current_host) - tf_config['task'] = {'index': task_index, 'type': task_type} + tf_config["task"] = {"index": task_index, "type": task_type} return tf_config def _run_ps(env, cluster): - logger.info('Running distributed training job with parameter servers') + logger.info("Running distributed training job with parameter servers") cluster_spec = tf.train.ClusterSpec(cluster) task_index = env.hosts.index(env.current_host) # Force parameter server to run on cpu. Running multiple TensorFlow processes on the same # GPU is not safe: # https://stackoverflow.com/questions/46145100/is-it-unsafe-to-run-multiple-tensorflow-processes-on-the-same-gpu - no_gpu_config = tf.compat.v1.ConfigProto(device_count={'GPU': 0}) + no_gpu_config = tf.compat.v1.ConfigProto(device_count={"GPU": 0}) server = tf.distribute.Server( - cluster_spec, job_name='ps', task_index=task_index, config=no_gpu_config + cluster_spec, job_name="ps", task_index=task_index, config=no_gpu_config ) multiprocessing.Process(target=lambda: server.join()).start() @@ -107,20 +103,27 @@ def _run_ps(env, cluster): def _run_worker(env, cmd_args, tf_config): env_vars = env.to_env_vars() - env_vars['TF_CONFIG'] = json.dumps(tf_config) - - entry_point.run(env.module_dir, env.user_entry_point, cmd_args, env_vars) + env_vars["TF_CONFIG"] = json.dumps(tf_config) + + entry_point.run( + uri=env.module_dir, + user_entry_point=env.user_entry_point, + args=cmd_args, + env_vars=env_vars, + capture_error=True, + ) def _wait_until_master_is_down(master): while True: try: subprocess.check_call( - ['curl', '{}:2222'.format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - logger.info('master {} is still up, waiting for it to exit'.format(master)) + ["curl", "{}:2222".format(master)], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + logger.info("master {} is still up, waiting for it to exit".format(master)) time.sleep(10) except subprocess.CalledProcessError: - logger.info('master {} is down, stopping parameter 
server'.format(master)) + logger.info("master {} is down, stopping parameter server".format(master)) return @@ -131,15 +134,16 @@ def train(env, cmd_args): env (sagemaker_training.environment.Environment): Instance of Environment class """ parameter_server_enabled = env.additional_framework_parameters.get( - SAGEMAKER_PARAMETER_SERVER_ENABLED, False) + SAGEMAKER_PARAMETER_SERVER_ENABLED, False + ) if len(env.hosts) > 1 and parameter_server_enabled: tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host) - logger.info('Running distributed training job with parameter servers') - logger.info('Launching parameter server process') - _run_ps(env, tf_config['cluster']) - logger.info('Launching worker process') + logger.info("Running distributed training job with parameter servers") + logger.info("Launching parameter server process") + _run_ps(env, tf_config["cluster"]) + logger.info("Launching worker process") _run_worker(env, cmd_args, tf_config) if not _is_host_master(env.hosts, env.current_host): @@ -147,18 +151,21 @@ def train(env, cmd_args): else: - mpi_enabled = env.additional_framework_parameters.get('sagemaker_mpi_enabled') + mpi_enabled = env.additional_framework_parameters.get("sagemaker_mpi_enabled") if mpi_enabled: runner_type = runner.MPIRunnerType else: runner_type = runner.ProcessRunnerType - entry_point.run(env.module_dir, - env.user_entry_point, - cmd_args, - env.to_env_vars(), - runner_type=runner_type) + entry_point.run( + uri=env.module_dir, + user_entry_point=env.user_entry_point, + args=cmd_args, + env_vars=env.to_env_vars(), + capture_error=True, + runner_type=runner_type, + ) def _log_model_missing_warning(model_dir): @@ -168,31 +175,39 @@ def _log_model_missing_warning(model_dir): if filenames: file_exists = True for f in filenames: - if 'saved_model.pb' in f or 'saved_model.pbtxt' in f: + if "saved_model.pb" in f or "saved_model.pbtxt" in f: pb_file_exists = True path, direct_parent_dir = os.path.split(dirpath) if not 
str.isdigit(direct_parent_dir): - logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving containers. ' - 'The SavedModel bundle is under directory \"{}\", not a numeric name.' - .format(direct_parent_dir)) + logger.warn( + "Your model will NOT be servable with SageMaker TensorFlow Serving containers. " + 'The SavedModel bundle is under directory "{}", not a numeric name.'.format( + direct_parent_dir + ) + ) if not file_exists: - logger.warn('No model artifact is saved under path {}.' - ' Your training job will not save any model files to S3.\n' - 'For details of how to construct your training script see:\n' - 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script' - .format(model_dir)) + logger.warn( + "No model artifact is saved under path {}." + " Your training job will not save any model files to S3.\n" + "For details of how to construct your training script see:\n" + "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format( + model_dir + ) + ) elif not pb_file_exists: - logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving container. ' - 'The model artifact was not saved in the TensorFlow SavedModel directory structure:\n' - 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory') + logger.warn( + "Your model will NOT be servable with SageMaker TensorFlow Serving container. 
" + "The model artifact was not saved in the TensorFlow SavedModel directory structure:\n" + "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory" + ) def _model_dir_with_training_job(model_dir, job_name): - if model_dir.startswith('/opt/ml'): + if model_dir.startswith("/opt/ml"): return model_dir else: - return '{}/{}/model'.format(model_dir, job_name) + return "{}/{}/model".format(model_dir, job_name) def main(): @@ -205,11 +220,11 @@ def main(): # If the training job is part of the multiple training jobs for tuning, we need to append the training job name to # model_dir in case they read from/write to the same object - if '_tuning_objective_metric' in hyperparameters: - model_dir = _model_dir_with_training_job(hyperparameters.get('model_dir'), env.job_name) - logger.info('Appending the training job name to model_dir: {}'.format(model_dir)) - user_hyperparameters['model_dir'] = model_dir + if "_tuning_objective_metric" in hyperparameters: + model_dir = _model_dir_with_training_job(hyperparameters.get("model_dir"), env.job_name) + logger.info("Appending the training job name to model_dir: {}".format(model_dir)) + user_hyperparameters["model_dir"] = model_dir - s3_utils.configure(user_hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION')) + s3_utils.configure(user_hyperparameters.get("model_dir"), os.environ.get("SAGEMAKER_REGION")) train(env, mapping.to_cmd_args(user_hyperparameters)) _log_model_missing_warning(MODEL_DIR) diff --git a/test/conftest.py b/test/conftest.py index 0a27fa9c..6069a08a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -23,180 +23,182 @@ logger = logging.getLogger(__name__) -logging.getLogger('boto').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) -logging.getLogger('factory.py').setLevel(logging.INFO) -logging.getLogger('auth.py').setLevel(logging.INFO) -logging.getLogger('connectionpool.py').setLevel(logging.INFO) 
+logging.getLogger("boto").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) +logging.getLogger("factory.py").setLevel(logging.INFO) +logging.getLogger("auth.py").setLevel(logging.INFO) +logging.getLogger("connectionpool.py").setLevel(logging.INFO) DIR_PATH = os.path.dirname(os.path.realpath(__file__)) # these regions have some p2 and p3 instances, but not enough for automated testing NO_P2_REGIONS = [ - 'ca-central-1', - 'eu-central-1', - 'eu-west-2', - 'us-west-1', - 'eu-west-3', - 'eu-north-1', - 'sa-east-1', - 'ap-east-1', - 'me-south-1' + "ca-central-1", + "eu-central-1", + "eu-west-2", + "us-west-1", + "eu-west-3", + "eu-north-1", + "sa-east-1", + "ap-east-1", + "me-south-1", ] NO_P3_REGIONS = [ - 'ap-southeast-1', - 'ap-southeast-2', - 'ap-south-1', - 'ca-central-1', - 'eu-central-1', - 'eu-west-2', - 'us-west-1' - 'eu-west-3', - 'eu-north-1', - 'sa-east-1', - 'ap-east-1', - 'me-south-1' + "ap-southeast-1", + "ap-southeast-2", + "ap-south-1", + "ca-central-1", + "eu-central-1", + "eu-west-2", + "us-west-1", "eu-west-3", + "eu-north-1", + "sa-east-1", + "ap-east-1", + "me-south-1", ] def pytest_addoption(parser): - parser.addoption('--build-image', '-B', action='store_true') - parser.addoption('--push-image', '-P', action='store_true') - parser.addoption('--dockerfile-type', '-T', choices=['dlc.cpu', 'dlc.gpu', 'tf'], - default='tf') - parser.addoption('--dockerfile', '-D', default=None) - parser.addoption('--docker-base-name', default='sagemaker-tensorflow-training') - parser.addoption('--tag', default=None) - parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default='2.2.0') - parser.addoption('--processor', default='cpu', choices=['cpu', 'gpu', 'cpu,gpu']) - parser.addoption('--py-version', default='3', choices=['2', '3', '2,3']) - parser.addoption('--account-id', default='142577830533') - parser.addoption('--instance-type', default=None) + parser.addoption("--build-image", "-B", 
action="store_true") + parser.addoption("--push-image", "-P", action="store_true") + parser.addoption("--dockerfile-type", "-T", choices=["dlc.cpu", "dlc.gpu", "tf"], default="tf") + parser.addoption("--dockerfile", "-D", default=None) + parser.addoption("--docker-base-name", default="sagemaker-tensorflow-training") + parser.addoption("--tag", default=None) + parser.addoption("--region", default="us-west-2") + parser.addoption("--framework-version", default="2.2.0") + parser.addoption("--processor", default="cpu", choices=["cpu", "gpu", "cpu,gpu"]) + parser.addoption("--py-version", default="3", choices=["2", "3", "2,3"]) + parser.addoption("--account-id", default="142577830533") + parser.addoption("--instance-type", default=None) def pytest_generate_tests(metafunc): - if 'py_version' in metafunc.fixturenames: - py_version_params = ['py' + v for v in metafunc.config.getoption('--py-version').split(',')] - metafunc.parametrize('py_version', py_version_params, scope='session') + if "py_version" in metafunc.fixturenames: + py_version_params = ["py" + v for v in metafunc.config.getoption("--py-version").split(",")] + metafunc.parametrize("py_version", py_version_params, scope="session") - if 'processor' in metafunc.fixturenames: - processor_params = metafunc.config.getoption('--processor').split(',') - metafunc.parametrize('processor', processor_params, scope='session') + if "processor" in metafunc.fixturenames: + processor_params = metafunc.config.getoption("--processor").split(",") + metafunc.parametrize("processor", processor_params, scope="session") -@pytest.fixture(scope='session', name='dockerfile_type') +@pytest.fixture(scope="session", name="dockerfile_type") def fixture_dockerfile_type(request): - return request.config.getoption('--dockerfile-type') + return request.config.getoption("--dockerfile-type") -@pytest.fixture(scope='session', name='dockerfile') +@pytest.fixture(scope="session", name="dockerfile") def fixture_dockerfile(request, dockerfile_type): - 
dockerfile = request.config.getoption('--dockerfile') - return dockerfile if dockerfile else 'Dockerfile.{}'.format(dockerfile_type) + dockerfile = request.config.getoption("--dockerfile") + return dockerfile if dockerfile else "Dockerfile.{}".format(dockerfile_type) -@pytest.fixture(scope='session', name='build_image', autouse=True) +@pytest.fixture(scope="session", name="build_image", autouse=True) def fixture_build_image(request, framework_version, dockerfile, image_uri, region): - build_image = request.config.getoption('--build-image') + build_image = request.config.getoption("--build-image") if build_image: - return image_utils.build_image(framework_version=framework_version, - dockerfile=dockerfile, - image_uri=image_uri, - region=region, - cwd=os.path.join(DIR_PATH, '..')) + return image_utils.build_image( + framework_version=framework_version, + dockerfile=dockerfile, + image_uri=image_uri, + region=region, + cwd=os.path.join(DIR_PATH, ".."), + ) return image_uri -@pytest.fixture(scope='session', name='push_image', autouse=True) +@pytest.fixture(scope="session", name="push_image", autouse=True) def fixture_push_image(request, image_uri, region, account_id): - push_image = request.config.getoption('--push-image') + push_image = request.config.getoption("--push-image") if push_image: return image_utils.push_image(image_uri, region, account_id) return None -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def docker_base_name(request): - return request.config.getoption('--docker-base-name') + return request.config.getoption("--docker-base-name") -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def region(request): - return request.config.getoption('--region') + return request.config.getoption("--region") -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def framework_version(request): - return request.config.getoption('--framework-version') + return request.config.getoption("--framework-version") 
-@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def tag(request, framework_version, processor, py_version): - provided_tag = request.config.getoption('--tag') - default_tag = '{}-{}-py{}'.format(framework_version, processor, py_version) + provided_tag = request.config.getoption("--tag") + default_tag = "{}-{}-py{}".format(framework_version, processor, py_version) return provided_tag if provided_tag is not None else default_tag -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def sagemaker_session(region): return Session(boto_session=boto3.Session(region_name=region)) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def sagemaker_local_session(region): return LocalSession(boto_session=boto3.Session(region_name=region)) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def account_id(request): - return request.config.getoption('--account-id') + return request.config.getoption("--account-id") @pytest.fixture def instance_type(request, processor): - provided_instance_type = request.config.getoption('--instance-type') - default_instance_type = 'ml.c4.xlarge' if processor == 'cpu' else 'ml.p2.xlarge' + provided_instance_type = request.config.getoption("--instance-type") + default_instance_type = "ml.c4.xlarge" if processor == "cpu" else "ml.p2.xlarge" return provided_instance_type if provided_instance_type is not None else default_instance_type @pytest.fixture(autouse=True) def skip_by_device_type(request, processor): - is_gpu = (processor == 'gpu') - if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \ - (request.node.get_closest_marker('skip_cpu') and not is_gpu): - pytest.skip('Skipping because running on \'{}\' instance'.format(processor)) + is_gpu = processor == "gpu" + if (request.node.get_closest_marker("skip_gpu") and is_gpu) or ( + request.node.get_closest_marker("skip_cpu") and not is_gpu + ): + pytest.skip("Skipping because running on '{}' instance".format(processor)) 
@pytest.fixture(autouse=True) def skip_gpu_instance_restricted_regions(region, instance_type): - if (region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) or \ - (region in NO_P3_REGIONS and instance_type.startswith('ml.p3')): - pytest.skip('Skipping GPU test in region {}'.format(region)) + if (region in NO_P2_REGIONS and instance_type.startswith("ml.p2")) or ( + region in NO_P3_REGIONS and instance_type.startswith("ml.p3") + ): + pytest.skip("Skipping GPU test in region {}".format(region)) @pytest.fixture(autouse=True) def skip_py2_containers(request, tag): - if request.node.get_closest_marker('skip_py2_containers'): - if 'py2' in tag: - pytest.skip('Skipping python2 container with tag {}'.format(tag)) + if request.node.get_closest_marker("skip_py2_containers"): + if "py2" in tag: + pytest.skip("Skipping python2 container with tag {}".format(tag)) @pytest.fixture(autouse=True) def skip_by_dockerfile_type(request, dockerfile_type): - is_generic = (dockerfile_type == 'tf') - if request.node.get_closest_marker('skip_generic') and is_generic: - pytest.skip('Skipping because running generic image without mpi and horovod') + is_generic = dockerfile_type == "tf" + if request.node.get_closest_marker("skip_generic") and is_generic: + pytest.skip("Skipping because running generic image without mpi and horovod") -@pytest.fixture(name='docker_registry', scope='session') +@pytest.fixture(name="docker_registry", scope="session") def fixture_docker_registry(account_id, region): - return '{}.dkr.ecr.{}.amazonaws.com'.format(account_id, region) if account_id else None + return "{}.dkr.ecr.{}.amazonaws.com".format(account_id, region) if account_id else None -@pytest.fixture(name='image_uri', scope='session') +@pytest.fixture(name="image_uri", scope="session") def fixture_image_uri(docker_registry, docker_base_name, tag): if docker_registry: - return '{}/{}:{}'.format(docker_registry, docker_base_name, tag) - return '{}:{}'.format(docker_base_name, tag) + return 
"{}/{}:{}".format(docker_registry, docker_base_name, tag) + return "{}:{}".format(docker_base_name, tag) diff --git a/test/integration/__init__.py b/test/integration/__init__.py index a979028a..905d6d70 100644 --- a/test/integration/__init__.py +++ b/test/integration/__init__.py @@ -15,8 +15,8 @@ import logging import os -logging.getLogger('boto3').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) +logging.getLogger("boto3").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources') +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") DEFAULT_TIMEOUT = 120 diff --git a/test/integration/local/test_horovod.py b/test/integration/local/test_horovod.py index 2d98e37c..e9e4a784 100644 --- a/test/integration/local/test_horovod.py +++ b/test/integration/local/test_horovod.py @@ -20,7 +20,7 @@ from sagemaker.tensorflow import TensorFlow -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") @pytest.mark.skip_cpu @@ -29,51 +29,55 @@ def test_distributed_training_horovod_gpu( sagemaker_local_session, image_uri, tmpdir, framework_version ): _test_distributed_training_horovod( - 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, 'local_gpu' + 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, "local_gpu" ) @pytest.mark.skip_gpu @pytest.mark.skip_generic -@pytest.mark.parametrize( - 'instances, processes', [(1, 2), (2, 1), (2, 2), (5, 2)] -) +@pytest.mark.parametrize("instances, processes", [(1, 2), (2, 1), (2, 2), (5, 2)]) def test_distributed_training_horovod_cpu( instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version ): _test_distributed_training_horovod( - instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version, 'local' + instances, 
processes, sagemaker_local_session, image_uri, tmpdir, framework_version, "local" ) def _test_distributed_training_horovod( instances, processes, session, image_uri, tmpdir, framework_version, instance_type ): - output_path = 'file://%s' % tmpdir + output_path = "file://%s" % tmpdir estimator = TensorFlow( - entry_point=os.path.join(RESOURCE_PATH, 'hvdbasic', 'train_hvd_basic.py'), - role='SageMakerRole', + entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"), + role="SageMakerRole", train_instance_type=instance_type, sagemaker_session=session, train_instance_count=instances, image_name=image_uri, output_path=output_path, framework_version=framework_version, - hyperparameters={'sagemaker_mpi_enabled': True, - 'sagemaker_network_interface_name': 'eth0', - 'sagemaker_mpi_num_of_processes_per_host': processes}) + hyperparameters={ + "sagemaker_mpi_enabled": True, + "sagemaker_network_interface_name": "eth0", + "sagemaker_mpi_num_of_processes_per_host": processes, + }, + ) - estimator.fit('file://{}'.format(os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) + estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed"))) tmp = str(tmpdir) - extract_files(output_path.replace('file://', ''), tmp) + extract_files(output_path.replace("file://", ""), tmp) size = instances * processes for rank in range(size): local_rank = rank % processes - assert read_json('local-rank-%s-rank-%s' % (local_rank, rank), tmp) == { - 'local-rank': local_rank, 'rank': rank, 'size': size} + assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == { + "local-rank": local_rank, + "rank": rank, + "size": size, + } def read_json(file, tmp): @@ -82,14 +86,14 @@ def read_json(file, tmp): def assert_files_exist_in_tar(output_path, files): - if output_path.startswith('file://'): + if output_path.startswith("file://"): output_path = output_path[7:] - model_file = os.path.join(output_path, 'model.tar.gz') + model_file = 
os.path.join(output_path, "model.tar.gz") with tarfile.open(model_file) as tar: for f in files: tar.getmember(f) def extract_files(output_path, tmpdir): - with tarfile.open(os.path.join(output_path, 'model.tar.gz')) as tar: + with tarfile.open(os.path.join(output_path, "model.tar.gz")) as tar: tar.extractall(tmpdir) diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py index 2b7b5521..35a676a6 100644 --- a/test/integration/local/test_training.py +++ b/test/integration/local/test_training.py @@ -18,102 +18,109 @@ import pytest from sagemaker.tensorflow import TensorFlow -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') -TF_CHECKPOINT_FILES = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'] +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") +TF_CHECKPOINT_FILES = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"] @pytest.fixture # noqa: F811 def py_full_version(py_version): # noqa: F811 - if py_version == '2': - return '2.7' + if py_version == "2": + return "2.7" else: - return '3.6' + return "3.6" @pytest.mark.skip_gpu def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist.py'), - instance_type='local', - instance_count=1, - sagemaker_local_session=sagemaker_local_session, - image_uri=image_uri, - framework_version=framework_version, - output_path=output_path, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data'))) - _assert_files_exist_in_tar(output_path, ['my_model.h5']) + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist.py"), + instance_type="local", + instance_count=1, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + 
training_data_path="file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data")), + ) + _assert_files_exist_in_tar(output_path, ["my_model.h5"]) @pytest.mark.skip_gpu -def test_distributed_training_cpu_no_ps(sagemaker_local_session, - image_uri, - tmpdir, - framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'), - instance_type='local', - instance_count=2, - sagemaker_local_session=sagemaker_local_session, - image_uri=image_uri, - framework_version=framework_version, - output_path=output_path, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) +def test_distributed_training_cpu_no_ps( + sagemaker_local_session, image_uri, tmpdir, framework_version +): + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"), + instance_type="local", + instance_count=2, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + training_data_path="file://{}".format( + os.path.join(RESOURCE_PATH, "mnist", "data-distributed") + ), + ) _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) @pytest.mark.skip_gpu -def test_distributed_training_cpu_ps(sagemaker_local_session, - image_uri, - tmpdir, - framework_version): - output_path = 'file://{}'.format(tmpdir) - run_tf_training(script=os.path.join(RESOURCE_PATH, 'mnist', 'mnist_estimator.py'), - instance_type='local', - instance_count=2, - sagemaker_local_session=sagemaker_local_session, - image_uri=image_uri, - framework_version=framework_version, - output_path=output_path, - hyperparameters={'sagemaker_parameter_server_enabled': True}, - training_data_path='file://{}'.format( - os.path.join(RESOURCE_PATH, 'mnist', 'data-distributed'))) +def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, 
framework_version): + output_path = "file://{}".format(tmpdir) + run_tf_training( + script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"), + instance_type="local", + instance_count=2, + sagemaker_local_session=sagemaker_local_session, + image_uri=image_uri, + framework_version=framework_version, + output_path=output_path, + hyperparameters={"sagemaker_parameter_server_enabled": True}, + training_data_path="file://{}".format( + os.path.join(RESOURCE_PATH, "mnist", "data-distributed") + ), + ) _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) -def run_tf_training(script, - instance_type, - instance_count, - sagemaker_local_session, - image_uri, - framework_version, - training_data_path, - output_path=None, - hyperparameters=None): +def run_tf_training( + script, + instance_type, + instance_count, + sagemaker_local_session, + image_uri, + framework_version, + training_data_path, + output_path=None, + hyperparameters=None, +): hyperparameters = hyperparameters or {} - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_count=instance_count, - train_instance_type=instance_type, - sagemaker_session=sagemaker_local_session, - image_name=image_uri, - model_dir='/opt/ml/model', - output_path=output_path, - hyperparameters=hyperparameters, - base_job_name='test-tf', - framework_version=framework_version, - py_version='py3') + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_count=instance_count, + train_instance_type=instance_type, + sagemaker_session=sagemaker_local_session, + image_name=image_uri, + model_dir="/opt/ml/model", + output_path=output_path, + hyperparameters=hyperparameters, + base_job_name="test-tf", + framework_version=framework_version, + py_version="py3", + ) estimator.fit(training_data_path) def _assert_files_exist_in_tar(output_path, files): - if output_path.startswith('file://'): + if output_path.startswith("file://"): output_path = output_path[7:] - model_file = 
os.path.join(output_path, 'model.tar.gz') + model_file = os.path.join(output_path, "model.tar.gz") with tarfile.open(model_file) as tar: for f in files: tar.getmember(f) diff --git a/test/integration/sagemaker/recordio_utils.py b/test/integration/sagemaker/recordio_utils.py index e7aa28e3..9863d0a8 100644 --- a/test/integration/sagemaker/recordio_utils.py +++ b/test/integration/sagemaker/recordio_utils.py @@ -28,7 +28,7 @@ # This file can be used in script mode to generate a single file or be used # as a module to generate files via build_record_file. -_kmagic = 0xced7230a +_kmagic = 0xCED7230A padding = {} for amount in range(4): @@ -41,9 +41,9 @@ def write_recordio(f, data, header_flag=0): """Writes a single data point as a RecordIO record to the given file.""" length = len(data) - f.write(struct.pack('I', _kmagic)) + f.write(struct.pack("I", _kmagic)) header = (header_flag << 29) | length - f.write(struct.pack('I', header)) + f.write(struct.pack("I", header)) pad = (((length + 3) >> 2) << 2) - length f.write(data) f.write(padding[pad]) @@ -55,8 +55,8 @@ def write_recordio_multipart(f, data): stride = int(length / 3) data_start = data[0:stride] - data_middle = data[stride:2 * stride] - data_end = data[2 * stride:] + data_middle = data[stride : 2 * stride] + data_end = data[2 * stride :] write_recordio(f, data_start, 1) write_recordio(f, data_middle, 2) @@ -72,7 +72,7 @@ def label_feature(value): def write_numpy_array(f, feature_name, label, arr, multipart=False): - feature = {'labels': label_feature(label), feature_name: string_feature(arr)} + feature = {"labels": label_feature(label), feature_name: string_feature(arr)} example = tf.train.Example(features=tf.train.Features(feature=feature)) if multipart: write_recordio_multipart(f, example.SerializeToString()) @@ -80,7 +80,9 @@ def write_numpy_array(f, feature_name, label, arr, multipart=False): write_recordio(f, example.SerializeToString()) -def build_record_file(filename, num_records, dimension, classes=2, 
data_feature_name='data', multipart=False): +def build_record_file( + filename, num_records, dimension, classes=2, data_feature_name="data", multipart=False +): """Builds a recordio encoded file of TF protobuf Example objects. Each object is a labeled numpy array. Each example has two field - a single int64 'label' field and a single bytes list field, containing a serialized numpy array. @@ -99,46 +101,54 @@ def build_record_file(filename, num_records, dimension, classes=2, data_feature_ data_feature_name - the name to give the numpy array in the Example object dimension - the size of each numpy array. """ - with open(filename, 'wb') as f: + with open(filename, "wb") as f: for i in range(num_records): cur_class = i % classes loc = int(cur_class - (classes / 2)) - write_numpy_array(f, data_feature_name, cur_class, np.random.normal(loc=loc, size=(dimension,)), multipart) + write_numpy_array( + f, + data_feature_name, + cur_class, + np.random.normal(loc=loc, size=(dimension,)), + multipart, + ) -def build_single_record_file(filename, dimension, classes=2, data_feature_name='data'): +def build_single_record_file(filename, dimension, classes=2, data_feature_name="data"): cur_class = randint(0, classes - 1) loc = int(cur_class - (classes / 2)) arr = np.random.normal(loc=loc, size=(dimension,)) - feature = {'labels': label_feature(cur_class), data_feature_name: string_feature(arr)} + feature = {"labels": label_feature(cur_class), data_feature_name: string_feature(arr)} example = tf.train.Example(features=tf.train.Features(feature=feature)) - with open(filename, 'wb') as f: + with open(filename, "wb") as f: f.write(example.SerializeToString()) def validate_record_file(filename, dimension): - data = open(filename, 'rb').read() - magic_number, length = struct.unpack('II', data[0:8]) - encoded = data[8:8 + length] + data = open(filename, "rb").read() + magic_number, length = struct.unpack("II", data[0:8]) + encoded = data[8 : 8 + length] features = { - 'data': 
tf.io.FixedLenFeature([], tf.string), - 'labels': tf.io.FixedLenFeature([], tf.int64), + "data": tf.io.FixedLenFeature([], tf.string), + "labels": tf.io.FixedLenFeature([], tf.int64), } parsed = tf.io.parse_single_example(encoded, features) - array = tf.io.decode_raw(parsed['data'], tf.float64) + array = tf.io.decode_raw(parsed["data"], tf.float64) assert array.shape[0] == dimension -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate synthetic multi-class training data") - parser.add_argument('--dimension', default=65536, type=int) - parser.add_argument('--classes', default=2, type=int) - parser.add_argument('--num-records', default=4, type=int) - parser.add_argument('--data-feature-name', default='data') - parser.add_argument('filename', type=str) + parser.add_argument("--dimension", default=65536, type=int) + parser.add_argument("--classes", default=2, type=int) + parser.add_argument("--num-records", default=4, type=int) + parser.add_argument("--data-feature-name", default="data") + parser.add_argument("filename", type=str) args = parser.parse_args() - build_record_file(args.filename, args.num_records, args.dimension, args.classes, args.data_feature_name) + build_record_file( + args.filename, args.num_records, args.dimension, args.classes, args.data_feature_name + ) validate_record_file(args.filename, args.dimension) diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 11567c37..c466f573 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -26,104 +26,117 @@ @pytest.mark.deploy_test def test_mnist(sagemaker_session, image_uri, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_type=instance_type, 
- train_instance_count=1, - sagemaker_session=sagemaker_session, - image_name=image_uri, - framework_version=framework_version, - script_mode=True) + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data'), - key_prefix='scriptmode/mnist') - estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist')) + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-sagemaker-mnist")) _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) def test_distributed_mnist_no_ps(sagemaker_session, image_uri, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_count=2, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=image_uri, - framework_version=framework_version, - script_mode=True) + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_count=2, + train_instance_type=instance_type, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 
'mnist', 'data'), - key_prefix='scriptmode/mnist') - estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist')) + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py') - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - hyperparameters={'sagemaker_parameter_server_enabled': True}, - train_instance_count=2, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=image_uri, - framework_version=framework_version, - script_mode=True) + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist_estimator.py") + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + hyperparameters={"sagemaker_parameter_server_enabled": True}, + train_instance_count=2, + train_instance_type=instance_type, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data-distributed'), - key_prefix='scriptmode/mnist-distributed') - estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist')) + path=os.path.join(resource_path, "mnist", "data-distributed"), + key_prefix="scriptmode/mnist-distributed", + ) + estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) _assert_checkpoint_exists(sagemaker_session.boto_region_name, estimator.model_dir, 0) 
_assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) def test_tuning(sagemaker_session, image_uri, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources') - script = os.path.join(resource_path, 'mnist', 'mnist.py') - - estimator = TensorFlow(entry_point=script, - role='SageMakerRole', - train_instance_type=instance_type, - train_instance_count=1, - sagemaker_session=sagemaker_session, - image_name=image_uri, - framework_version=framework_version, - script_mode=True) - - hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)} - objective_metric_name = 'accuracy' - metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}] - - tuner = HyperparameterTuner(estimator, - objective_metric_name, - hyperparameter_ranges, - metric_definitions, - max_jobs=2, - max_parallel_jobs=2) + resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") + script = os.path.join(resource_path, "mnist", "mnist.py") + + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + sagemaker_session=sagemaker_session, + image_name=image_uri, + framework_version=framework_version, + script_mode=True, + ) + + hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)} + objective_metric_name = "accuracy" + metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}] + + tuner = HyperparameterTuner( + estimator, + objective_metric_name, + hyperparameter_ranges, + metric_definitions, + max_jobs=2, + max_parallel_jobs=2, + ) with timeout(minutes=20): inputs = estimator.sagemaker_session.upload_data( - path=os.path.join(resource_path, 'mnist', 'data'), - key_prefix='scriptmode/mnist') + path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" + ) - tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32) + 
tuning_job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32) tuner.fit(inputs, job_name=tuning_job_name) tuner.wait() def _assert_checkpoint_exists(region, model_dir, checkpoint_number): - _assert_s3_file_exists(region, os.path.join(model_dir, 'graph.pbtxt')) - _assert_s3_file_exists(region, - os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number))) - _assert_s3_file_exists(region, - os.path.join(model_dir, 'model.ckpt-{}.meta'.format(checkpoint_number))) + _assert_s3_file_exists(region, os.path.join(model_dir, "graph.pbtxt")) + _assert_s3_file_exists( + region, os.path.join(model_dir, "model.ckpt-{}.index".format(checkpoint_number)) + ) + _assert_s3_file_exists( + region, os.path.join(model_dir, "model.ckpt-{}.meta".format(checkpoint_number)) + ) def _assert_s3_file_exists(region, s3_url): parsed_url = urlparse(s3_url) - s3 = boto3.resource('s3', region_name=region) - s3.Object(parsed_url.netloc, parsed_url.path.lstrip('/')).load() + s3 = boto3.resource("s3", region_name=region) + s3.Object(parsed_url.netloc, parsed_url.path.lstrip("/")).load() diff --git a/test/integration/sagemaker/test_tuning_model_dir.py b/test/integration/sagemaker/test_tuning_model_dir.py index e78b913e..c113c1cb 100644 --- a/test/integration/sagemaker/test_tuning_model_dir.py +++ b/test/integration/sagemaker/test_tuning_model_dir.py @@ -19,26 +19,32 @@ from sagemaker.utils import unique_name_from_base -def test_model_dir_with_training_job_name(sagemaker_session, image_uri, instance_type, framework_version): - resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources') - script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py') +def test_model_dir_with_training_job_name( + sagemaker_session, image_uri, instance_type, framework_version +): + resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources") + script = os.path.join(resource_path, "tuning_model_dir", "entry.py") - estimator = TensorFlow(entry_point=script, 
- role='SageMakerRole', - train_instance_type=instance_type, - train_instance_count=1, - image_name=image_uri, - framework_version=framework_version, - py_version='py3', - sagemaker_session=sagemaker_session) + estimator = TensorFlow( + entry_point=script, + role="SageMakerRole", + train_instance_type=instance_type, + train_instance_count=1, + image_name=image_uri, + framework_version=framework_version, + py_version="py3", + sagemaker_session=sagemaker_session, + ) - tuner = HyperparameterTuner(estimator=estimator, - objective_metric_name='accuracy', - hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)}, - metric_definitions=[{'Name': 'accuracy', 'Regex': 'accuracy=([01])'}], - max_jobs=1, - max_parallel_jobs=1) + tuner = HyperparameterTuner( + estimator=estimator, + objective_metric_name="accuracy", + hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)}, + metric_definitions=[{"Name": "accuracy", "Regex": "accuracy=([01])"}], + max_jobs=1, + max_parallel_jobs=1, + ) # User script has logic to check for the correct model_dir - tuner.fit(job_name=unique_name_from_base('test-tf-model-dir', max_length=32)) + tuner.fit(job_name=unique_name_from_base("test-tf-model-dir", max_length=32)) tuner.wait() diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py index d4738d32..1ff4278c 100644 --- a/test/integration/sagemaker/timeout.py +++ b/test/integration/sagemaker/timeout.py @@ -16,7 +16,7 @@ import logging import signal -LOGGER = logging.getLogger('timeout') +LOGGER = logging.getLogger("timeout") class TimeoutError(Exception): @@ -39,7 +39,7 @@ def timeout(seconds=0, minutes=0, hours=0): limit = seconds + 60 * minutes + 3600 * hours def handler(signum, frame): - raise TimeoutError('timed out after {} seconds'.format(limit)) + raise TimeoutError("timed out after {} seconds".format(limit)) try: signal.signal(signal.SIGALRM, handler) diff --git a/test/resources/hvdbasic/train_hvd_basic.py 
b/test/resources/hvdbasic/train_hvd_basic.py index cc068678..24a35a8b 100644 --- a/test/resources/hvdbasic/train_hvd_basic.py +++ b/test/resources/hvdbasic/train_hvd_basic.py @@ -4,8 +4,10 @@ hvd.init() -with open(os.path.join('/opt/ml/model/local-rank-%s-rank-%s' % (hvd.local_rank(), hvd.rank())), 'w+') as f: - basic_info = {'local-rank': hvd.local_rank(), 'rank': hvd.rank(), 'size': hvd.size()} +with open( + os.path.join("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank())), "w+" +) as f: + basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} print(basic_info) json.dump(basic_info, f) diff --git a/test/resources/hvdbasic/train_hvd_env_vars.py b/test/resources/hvdbasic/train_hvd_env_vars.py index 31be37e4..da67367c 100644 --- a/test/resources/hvdbasic/train_hvd_env_vars.py +++ b/test/resources/hvdbasic/train_hvd_env_vars.py @@ -4,16 +4,16 @@ hvd.init() -with open('/opt/ml/model/local-rank-%s-rank-%s' % (hvd.local_rank(), hvd.rank()), 'w+') as f: - basic_info = {'local-rank': hvd.local_rank(), 'rank': hvd.rank(), 'size': hvd.size()} +with open("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank()), "w+") as f: + basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} print(basic_info) json.dump(basic_info, f) -val = os.environ.get('AWS_CONTAINER_CREDENTIALS_RELATIVE_URI') -host = os.environ.get('SM_CURRENT_HOST') +val = os.environ.get("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") +host = os.environ.get("SM_CURRENT_HOST") assert val is not None assert host is not None -print('host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}'.format(host, val)) +print("host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}".format(host, val)) diff --git a/test/resources/mnist/horovod_mnist.py b/test/resources/mnist/horovod_mnist.py index c5c0b242..f2bf4e8f 100644 --- a/test/resources/mnist/horovod_mnist.py +++ b/test/resources/mnist/horovod_mnist.py @@ -18,76 +18,76 @@ hvd.init() # Horovod: pin 
GPU to be used to process local rank (one GPU per process) -gpus = tf.config.experimental.list_physical_devices('GPU') +gpus = tf.config.experimental.list_physical_devices("GPU") for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_memory_growth(gpu, True) if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") -(mnist_images, mnist_labels), _ = \ - tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank()) +(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( + path="mnist-%d.npz" % hvd.rank() +) dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64)) + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) ) dataset = dataset.repeat().shuffle(10000).batch(128) -mnist_model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), - tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='softmax') -]) +mnist_model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), + tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation="softmax"), + ] +) loss = tf.losses.SparseCategoricalCrossentropy() # Horovod: adjust learning rate based on number of GPUs. 
opt = tf.optimizers.Adam(0.001 * hvd.size()) -checkpoint_dir = './checkpoints' +checkpoint_dir = "./checkpoints" checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) @tf.function def training_step(images, labels, first_batch): - with tf.GradientTape() as tape: - probs = mnist_model(images, training=True) - loss_value = loss(labels, probs) + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) - # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape) + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) - grads = tape.gradient(loss_value, mnist_model.trainable_variables) - opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # - # Note: broadcast should be done after the first gradient step to ensure optimizer - # initialization. - if first_batch: - hvd.broadcast_variables(mnist_model.variables, root_rank=0) - hvd.broadcast_variables(opt.variables(), root_rank=0) + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) - return loss_value + return loss_value # Horovod: adjust number of steps based on number of GPUs. 
for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())): - loss_value = training_step(images, labels, batch == 0) + loss_value = training_step(images, labels, batch == 0) - if batch % 10 == 0 and hvd.local_rank() == 0: - print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + if batch % 10 == 0 and hvd.local_rank() == 0: + print("Step #%d\tLoss: %.6f" % (batch, loss_value)) # Horovod: save checkpoints only on worker 0 to prevent other workers from # corrupting it. if hvd.rank() == 0: - # Export the keras model as Tensorflow SavedModelBundle - mnist_model.save( - os.path.join('/opt/ml/model/mnist/1'), - save_format='tf') + # Export the keras model as Tensorflow SavedModelBundle + mnist_model.save(os.path.join("/opt/ml/model/mnist/1"), save_format="tf") diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py index 86dbba8c..e1c2b275 100644 --- a/test/resources/mnist/mnist.py +++ b/test/resources/mnist/mnist.py @@ -7,50 +7,49 @@ import tensorflow as tf - def _parse_args(): parser = argparse.ArgumentParser() # hyperparameters sent by the client are passed as command-line arguments to the script. 
- parser.add_argument('--epochs', type=int, default=1) + parser.add_argument("--epochs", type=int, default=1) # Data, model, and output directories - parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) - parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) + parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) return parser.parse_known_args() def _load_training_data(base_dir): - x_train = np.load(os.path.join(base_dir, 'train', 'x_train.npy')) - y_train = np.load(os.path.join(base_dir, 'train', 'y_train.npy')) + x_train = np.load(os.path.join(base_dir, "train", "x_train.npy")) + y_train = np.load(os.path.join(base_dir, "train", "y_train.npy")) return x_train, y_train def _load_testing_data(base_dir): - x_test = np.load(os.path.join(base_dir, 'test', 'x_test.npy')) - y_test = np.load(os.path.join(base_dir, 'test', 'y_test.npy')) + x_test = np.load(os.path.join(base_dir, "test", "x_test.npy")) + y_test = np.load(os.path.join(base_dir, "test", "y_test.npy")) return x_test, y_test args, unknown = _parse_args() -model = tf.keras.models.Sequential([ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(512, activation=tf.nn.relu), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation=tf.nn.softmax) -]) +model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, 
activation=tf.nn.softmax), + ] +) -model.compile(optimizer='adam', - loss='sparse_categorical_crossentropy', - metrics=['accuracy']) +model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) x_train, y_train = _load_training_data(args.train) x_test, y_test = _load_testing_data(args.train) model.fit(x_train, y_train, epochs=args.epochs) model.evaluate(x_test, y_test) if args.current_host == args.hosts[0]: - model.save(os.path.join('/opt/ml/model', 'my_model.h5')) + model.save(os.path.join("/opt/ml/model", "my_model.h5")) diff --git a/test/resources/mnist/mnist_estimator.py b/test/resources/mnist/mnist_estimator.py index 904e48c2..82fb75ac 100644 --- a/test/resources/mnist/mnist_estimator.py +++ b/test/resources/mnist/mnist_estimator.py @@ -12,124 +12,122 @@ import argparse import json + def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.compat.v1.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.compat.v1.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. 
- # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.compat.v1.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.compat.v1.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.compat.v1.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.compat.v1.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.compat.v1.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`. 
- "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001) - train_op = optimizer.minimize( - loss=loss, - global_step=tf.compat.v1.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.compat.v1.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + """Model function for CNN.""" + # Input Layer + # Reshape X to 4-D tensor: [batch_size, width, height, channels] + # MNIST images are 28x28 pixels, and have one color channel + input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) + + # Convolutional Layer #1 + # Computes 32 features using a 5x5 filter with ReLU activation. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 28, 28, 1] + # Output Tensor Shape: [batch_size, 28, 28, 32] + conv1 = tf.compat.v1.layers.conv2d( + inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #1 + # First max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 28, 28, 32] + # Output Tensor Shape: [batch_size, 14, 14, 32] + pool1 = tf.compat.v1.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) + + # Convolutional Layer #2 + # Computes 64 features using a 5x5 filter. + # Padding is added to preserve width and height. 
+ # Input Tensor Shape: [batch_size, 14, 14, 32] + # Output Tensor Shape: [batch_size, 14, 14, 64] + conv2 = tf.compat.v1.layers.conv2d( + inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #2 + # Second max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 14, 14, 64] + # Output Tensor Shape: [batch_size, 7, 7, 64] + pool2 = tf.compat.v1.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) + + # Flatten tensor into a batch of vectors + # Input Tensor Shape: [batch_size, 7, 7, 64] + # Output Tensor Shape: [batch_size, 7 * 7 * 64] + pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) + + # Dense Layer + # Densely connected layer with 1024 neurons + # Input Tensor Shape: [batch_size, 7 * 7 * 64] + # Output Tensor Shape: [batch_size, 1024] + dense = tf.compat.v1.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) + + # Add dropout operation; 0.6 probability that element will be kept + dropout = tf.compat.v1.layers.dropout( + inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN + ) + + # Logits layer + # Input Tensor Shape: [batch_size, 1024] + # Output Tensor Shape: [batch_size, 10] + logits = tf.compat.v1.layers.dense(inputs=dropout, units=10) + + predictions = { + # Generate predictions (for PREDICT and EVAL mode) + "classes": tf.argmax(input=logits, axis=1), + # Add `softmax_tensor` to the graph. It is used for PREDICT and by the + # `logging_hook`. 
+ "probabilities": tf.nn.softmax(logits, name="softmax_tensor"), + } + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) + + # Calculate Loss (for both TRAIN and EVAL modes) + loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) + + # Configure the Training Op (for TRAIN mode) + if mode == tf.estimator.ModeKeys.TRAIN: + optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001) + train_op = optimizer.minimize(loss=loss, global_step=tf.compat.v1.train.get_global_step()) + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) + + # Add evaluation metrics (for EVAL mode) + eval_metric_ops = { + "accuracy": tf.compat.v1.metrics.accuracy(labels=labels, predictions=predictions["classes"]) + } + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + def _load_training_data(base_dir): - x_train = np.load(os.path.join(base_dir, 'train_data.npy')) - y_train = np.load(os.path.join(base_dir, 'train_labels.npy')) + x_train = np.load(os.path.join(base_dir, "train_data.npy")) + y_train = np.load(os.path.join(base_dir, "train_labels.npy")) return x_train, y_train + def _load_testing_data(base_dir): - x_test = np.load(os.path.join(base_dir, 'eval_data.npy')) - y_test = np.load(os.path.join(base_dir, 'eval_labels.npy')) + x_test = np.load(os.path.join(base_dir, "eval_data.npy")) + y_test = np.load(os.path.join(base_dir, "eval_labels.npy")) return x_test, y_test + def _parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING']) - parser.add_argument('--model_dir', type=str) - parser.add_argument('--max-steps', type=int, default=200) - parser.add_argument('--save-checkpoint-steps', type=int, default=200) - parser.add_argument('--throttle-secs', type=int, default=60) - parser.add_argument('--hosts', type=list, 
default=json.loads(os.environ['SM_HOSTS'])) - parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST']) - parser.add_argument('--batch-size', type=int, default=100) - parser.add_argument('--export-model-during-training', type=bool, default=False) + parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) + parser.add_argument("--model_dir", type=str) + parser.add_argument("--max-steps", type=int, default=200) + parser.add_argument("--save-checkpoint-steps", type=int, default=200) + parser.add_argument("--throttle-secs", type=int, default=60) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) + parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) + parser.add_argument("--batch-size", type=int, default=100) + parser.add_argument("--export-model-during-training", type=bool, default=False) return parser.parse_known_args() + def serving_input_fn(): - inputs = {'x': tf.compat.v1.placeholder(tf.float32, [None, 784])} + inputs = {"x": tf.compat.v1.placeholder(tf.float32, [None, 784])} return tf.estimator.export.ServingInputReceiver(inputs, inputs) + if __name__ == "__main__": args, unknown = _parse_args() for arg in vars(args): @@ -144,14 +142,13 @@ def serving_input_fn(): # Saving a checkpoint after every step run_config = tf.estimator.RunConfig(save_checkpoints_steps=args.save_checkpoint_steps) mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config) + model_fn=cnn_model_fn, model_dir=args.model_dir, config=run_config + ) # Set up logging for predictions # Log the values in the "Softmax" tensor with label "probabilities" tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.estimator.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=50 - ) + logging_hook = tf.estimator.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50) # Train the model train_input_fn = 
tf.compat.v1.estimator.inputs.numpy_input_fn( @@ -159,20 +156,24 @@ def serving_input_fn(): y=train_labels, batch_size=args.batch_size, num_epochs=None, - shuffle=True) + shuffle=True, + ) - exporter = tf.compat.v1.estimator.LatestExporter('Servo', serving_input_receiver_fn=serving_input_fn) \ - if args.export_model_during_training else None + exporter = ( + tf.compat.v1.estimator.LatestExporter("Servo", serving_input_receiver_fn=serving_input_fn) + if args.export_model_during_training + else None + ) # Evaluate the model and print results eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) + x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False + ) train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=args.max_steps) - eval_spec = tf.estimator.EvalSpec(eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter) + eval_spec = tf.estimator.EvalSpec( + eval_input_fn, throttle_secs=args.throttle_secs, exporters=exporter + ) tf.estimator.train_and_evaluate(mnist_classifier, train_spec, eval_spec) if args.current_host == args.hosts[0]: - mnist_classifier.export_saved_model('/opt/ml/model', serving_input_fn) + mnist_classifier.export_saved_model("/opt/ml/model", serving_input_fn) diff --git a/test/resources/tuning_model_dir/entry.py b/test/resources/tuning_model_dir/entry.py index 0bce7165..09d44abc 100644 --- a/test/resources/tuning_model_dir/entry.py +++ b/test/resources/tuning_model_dir/entry.py @@ -16,11 +16,13 @@ import os parser = argparse.ArgumentParser() -parser.add_argument('--model_dir', type=str) -parser.add_argument('--arbitrary_value', type=int, default=0) +parser.add_argument("--model_dir", type=str) +parser.add_argument("--arbitrary_value", type=int, default=0) args = parser.parse_args() -assert os.environ['TRAINING_JOB_NAME'] in args.model_dir, 'model_dir not unique to training job: %s' % args.model_dir +assert os.environ["TRAINING_JOB_NAME"] in 
args.model_dir, ( + "model_dir not unique to training job: %s" % args.model_dir +) # For the "hyperparameter tuning" to work -print('accuracy=1') +print("accuracy=1") diff --git a/test/unit/test_s3_utils.py b/test/unit/test_s3_utils.py index 03de70a3..2bd63bf8 100644 --- a/test/unit/test_s3_utils.py +++ b/test/unit/test_s3_utils.py @@ -19,30 +19,30 @@ from sagemaker_tensorflow_container import s3_utils -BUCKET_REGION = 'us-west-2' -JOB_REGION = 'us-west-1' -JOB_BUKCET = 'sagemaker-us-west-2-000-00-1' -PREFIX = 'sagemaker/something' -MODEL_DIR = 's3://{}/{}'.format(JOB_BUKCET, PREFIX) +BUCKET_REGION = "us-west-2" +JOB_REGION = "us-west-1" +JOB_BUKCET = "sagemaker-us-west-2-000-00-1" +PREFIX = "sagemaker/something" +MODEL_DIR = "s3://{}/{}".format(JOB_BUKCET, PREFIX) -@patch('boto3.client') +@patch("boto3.client") def test_configure(client): s3 = MagicMock() client.return_value = s3 - loc = {'LocationConstraint': BUCKET_REGION} + loc = {"LocationConstraint": BUCKET_REGION} s3.get_bucket_location.return_value = loc s3_utils.configure(MODEL_DIR, JOB_REGION) - assert os.environ['S3_REGION'] == BUCKET_REGION - assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1' - assert os.environ['S3_USE_HTTPS'] == '1' + assert os.environ["S3_REGION"] == BUCKET_REGION + assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" + assert os.environ["S3_USE_HTTPS"] == "1" def test_configure_local_dir(): - s3_utils.configure('/opt/ml/model', JOB_REGION) + s3_utils.configure("/opt/ml/model", JOB_REGION) - assert os.environ['S3_REGION'] == JOB_REGION - assert os.environ['TF_CPP_MIN_LOG_LEVEL'] == '1' - assert os.environ['S3_USE_HTTPS'] == '1' + assert os.environ["S3_REGION"] == JOB_REGION + assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" + assert os.environ["S3_USE_HTTPS"] == "1" diff --git a/test/unit/test_training.py b/test/unit/test_training.py index 8aec142a..0262458a 100644 --- a/test/unit/test_training.py +++ b/test/unit/test_training.py @@ -22,27 +22,27 @@ from sagemaker_tensorflow_container 
import training -MODULE_DIR = 's3://my/bucket' -MODULE_NAME = 'script_name' -LOG_LEVEL = 'Debug' -HOST1 = 'host1' -HOST2 = 'host2' +MODULE_DIR = "s3://my/bucket" +MODULE_NAME = "script_name" +LOG_LEVEL = "Debug" +HOST1 = "host1" +HOST2 = "host2" HOST_LIST = [HOST1, HOST2] CURRENT_HOST = HOST1 -CMD_ARGS = {'some_key': 'some_value'} +CMD_ARGS = {"some_key": "some_value"} CLUSTER_WITH_PS = { - 'master': ['{}:2222'.format(HOST1)], - 'worker': ['{}:2222'.format(HOST2)], - 'ps': ['{}:2223'.format(HOST1), '{}:2223'.format(HOST2)] + "master": ["{}:2222".format(HOST1)], + "worker": ["{}:2222".format(HOST2)], + "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)], } -MASTER_TASK = {'index': 0, 'type': 'master'} -WORKER_TASK = {'index': 0, 'type': 'worker'} -PS_TASK_1 = {'index': 0, 'type': 'ps'} -PS_TASK_2 = {'index': 1, 'type': 'ps'} -MODEL_DIR = 's3://bucket/prefix' -MODEL_DIR_CMD_LIST = ['--model_dir', MODEL_DIR] -REGION = 'us-west-2' -RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources') +MASTER_TASK = {"index": 0, "type": "master"} +WORKER_TASK = {"index": 0, "type": "worker"} +PS_TASK_1 = {"index": 0, "type": "ps"} +PS_TASK_2 = {"index": 1, "type": "ps"} +MODEL_DIR = "s3://bucket/prefix" +MODEL_DIR_CMD_LIST = ["--model_dir", MODEL_DIR] +REGION = "us-west-2" +RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") @pytest.fixture @@ -50,9 +50,7 @@ def distributed_training_env(): env = simple_training_env() env.hosts = HOST_LIST - env.additional_framework_parameters = { - training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True - } + env.additional_framework_parameters = {training.SAGEMAKER_PARAMETER_SERVER_ENABLED: True} return env @@ -65,189 +63,244 @@ def simple_training_env(): env = MagicMock() env.module_dir = MODULE_DIR env.user_entry_point = MODULE_NAME - env.hyperparameters = {'model_dir': MODEL_DIR} + env.hyperparameters = {"model_dir": MODEL_DIR} env.log_level = LOG_LEVEL env.additional_framework_parameters = {} env.hosts = 
CURRENT_HOST env.current_host = CURRENT_HOST env.to_env_vars = lambda: {} - env.job_name = 'test-training-job' + env.job_name = "test-training-job" return env def test_is_host_master(): assert training._is_host_master(HOST_LIST, CURRENT_HOST) is True - assert training._is_host_master(HOST_LIST, 'host2') is False - assert training._is_host_master(HOST_LIST, 'somehost') is False + assert training._is_host_master(HOST_LIST, "host2") is False + assert training._is_host_master(HOST_LIST, "somehost") is False -@patch('sagemaker_training.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_single_machine(run_module, single_machine_training_env): training.train(single_machine_training_env, MODEL_DIR_CMD_LIST) - run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - single_machine_training_env.to_env_vars(), - runner_type=runner.ProcessRunnerType) + run_module.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=single_machine_training_env.to_env_vars(), + capture_error=True, + runner_type=runner.ProcessRunnerType, + ) -@patch('sagemaker_training.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_train_horovod(run_module, single_machine_training_env): - single_machine_training_env.additional_framework_parameters['sagemaker_mpi_enabled'] = True + single_machine_training_env.additional_framework_parameters["sagemaker_mpi_enabled"] = True training.train(single_machine_training_env, MODEL_DIR_CMD_LIST) - run_module.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - single_machine_training_env.to_env_vars(), - runner_type=runner.MPIRunnerType) + run_module.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=single_machine_training_env.to_env_vars(), + capture_error=True, + runner_type=runner.MPIRunnerType, + ) @pytest.mark.skip_on_pipeline -@pytest.mark.skipif(sys.version_info.major != 3, - 
reason="Skip this for python 2 because of dict key order mismatch") -@patch('tensorflow.train.ClusterSpec') -@patch('tensorflow.distribute.Server') -@patch('sagemaker_training.entry_point.run') -@patch('multiprocessing.Process', lambda target: target()) -@patch('time.sleep', MagicMock()) +@pytest.mark.skipif( + sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch" +) +@patch("tensorflow.train.ClusterSpec") +@patch("tensorflow.distribute.Server") +@patch("sagemaker_training.entry_point.run") +@patch("multiprocessing.Process", lambda target: target()) +@patch("time.sleep", MagicMock()) def test_train_distributed_master(run, tf_server, cluster_spec, distributed_training_env): training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - cluster_spec.assert_called_with({'worker': ['host2:2222'], - 'master': ['host1:2222'], - 'ps': ['host1:2223', 'host2:2223']}) + cluster_spec.assert_called_with( + {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]} + ) tf_server.assert_called_with( - cluster_spec(), job_name='ps', task_index=0, config=tf.compat.v1.ConfigProto(device_count={'GPU': 0}) + cluster_spec(), + job_name="ps", + task_index=0, + config=tf.compat.v1.ConfigProto(device_count={"GPU": 0}), ) tf_server().join.assert_called_with() - tf_config = '{"cluster": {' \ - '"master": ["host1:2222"], ' \ - '"ps": ["host1:2223", "host2:2223"], ' \ - '"worker": ["host2:2222"]}, ' \ - '"environment": "cloud", ' \ - '"task": {"index": 0, "type": "master"}}' + tf_config = ( + '{"cluster": {' + '"master": ["host1:2222"], ' + '"ps": ["host1:2223", "host2:2223"], ' + '"worker": ["host2:2222"]}, ' + '"environment": "cloud", ' + '"task": {"index": 0, "type": "master"}}' + ) - run.assert_called_with('s3://my/bucket', 'script_name', MODEL_DIR_CMD_LIST, - {'TF_CONFIG': tf_config}) + run.assert_called_with( + uri="s3://my/bucket", + user_entry_point="script_name", + args=MODEL_DIR_CMD_LIST, + 
env_vars={"TF_CONFIG": tf_config}, + capture_error=True, + ) @pytest.mark.skip_on_pipeline -@pytest.mark.skipif(sys.version_info.major != 3, - reason="Skip this for python 2 because of dict key order mismatch") -@patch('tensorflow.train.ClusterSpec') -@patch('tensorflow.distribute.Server') -@patch('sagemaker_training.entry_point.run') -@patch('multiprocessing.Process', lambda target: target()) -@patch('time.sleep', MagicMock()) +@pytest.mark.skipif( + sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch" +) +@patch("tensorflow.train.ClusterSpec") +@patch("tensorflow.distribute.Server") +@patch("sagemaker_training.entry_point.run") +@patch("multiprocessing.Process", lambda target: target()) +@patch("time.sleep", MagicMock()) def test_train_distributed_worker(run, tf_server, cluster_spec, distributed_training_env): distributed_training_env.current_host = HOST2 training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - cluster_spec.assert_called_with({'worker': ['host2:2222'], - 'master': ['host1:2222'], - 'ps': ['host1:2223', 'host2:2223']}) + cluster_spec.assert_called_with( + {"worker": ["host2:2222"], "master": ["host1:2222"], "ps": ["host1:2223", "host2:2223"]} + ) tf_server.assert_called_with( - cluster_spec(), job_name='ps', task_index=1, config=tf.compat.v1.ConfigProto(device_count={'GPU': 0}) + cluster_spec(), + job_name="ps", + task_index=1, + config=tf.compat.v1.ConfigProto(device_count={"GPU": 0}), ) tf_server().join.assert_called_with() - tf_config = '{"cluster": {' \ - '"master": ["host1:2222"], ' \ - '"ps": ["host1:2223", "host2:2223"], ' \ - '"worker": ["host2:2222"]}, ' \ - '"environment": "cloud", ' \ - '"task": {"index": 0, "type": "worker"}}' + tf_config = ( + '{"cluster": {' + '"master": ["host1:2222"], ' + '"ps": ["host1:2223", "host2:2223"], ' + '"worker": ["host2:2222"]}, ' + '"environment": "cloud", ' + '"task": {"index": 0, "type": "worker"}}' + ) - run.assert_called_with('s3://my/bucket', 
'script_name', MODEL_DIR_CMD_LIST, - {'TF_CONFIG': tf_config}) + run.assert_called_with( + uri="s3://my/bucket", + user_entry_point="script_name", + args=MODEL_DIR_CMD_LIST, + env_vars={"TF_CONFIG": tf_config}, + capture_error=True, + ) -@patch('sagemaker_training.entry_point.run') +@patch("sagemaker_training.entry_point.run") def test_train_distributed_no_ps(run, distributed_training_env): distributed_training_env.additional_framework_parameters[ - training.SAGEMAKER_PARAMETER_SERVER_ENABLED] = False + training.SAGEMAKER_PARAMETER_SERVER_ENABLED + ] = False distributed_training_env.current_host = HOST2 training.train(distributed_training_env, MODEL_DIR_CMD_LIST) - run.assert_called_with(MODULE_DIR, MODULE_NAME, MODEL_DIR_CMD_LIST, - distributed_training_env.to_env_vars(), runner_type=runner.ProcessRunnerType) + run.assert_called_with( + uri=MODULE_DIR, + user_entry_point=MODULE_NAME, + args=MODEL_DIR_CMD_LIST, + env_vars=distributed_training_env.to_env_vars(), + capture_error=True, + runner_type=runner.ProcessRunnerType, + ) def test_build_tf_config(): assert training._build_tf_config(HOST_LIST, HOST1) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': MASTER_TASK + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": MASTER_TASK, } assert training._build_tf_config(HOST_LIST, HOST1, ps_task=True) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': PS_TASK_1 + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": PS_TASK_1, } assert training._build_tf_config(HOST_LIST, HOST2) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': WORKER_TASK + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": WORKER_TASK, } assert training._build_tf_config(HOST_LIST, HOST2, ps_task=True) == { - 'cluster': CLUSTER_WITH_PS, - 'environment': 'cloud', - 'task': PS_TASK_2} + "cluster": CLUSTER_WITH_PS, + "environment": "cloud", + "task": PS_TASK_2, + } def test_build_tf_config_error(): with 
pytest.raises(ValueError) as error: training._build_tf_config([HOST1], HOST1, ps_task=True) - assert 'Cannot have a ps task if there are no parameter servers in the cluster' in str(error.value) + assert "Cannot have a ps task if there are no parameter servers in the cluster" in str( + error.value + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_no_model(logger): - path = os.path.join(RESOURCE_PATH, 'test_dir_empty') + path = os.path.join(RESOURCE_PATH, "test_dir_empty") if not os.path.exists(path): os.mkdir(path) training._log_model_missing_warning(path) - logger.warn.assert_called_with('No model artifact is saved under path {}.' - ' Your training job will not save any model files to S3.\n' - 'For details of how to construct your training script see:\n' - 'https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script' # noqa - .format(path)) + logger.warn.assert_called_with( + "No model artifact is saved under path {}." + " Your training job will not save any model files to S3.\n" + "For details of how to construct your training script see:\n" + "https://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script".format( # noqa + path + ) + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_wrong_format(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_model')) - logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving container. 
' - 'The model artifact was not saved in the TensorFlow ' - 'SavedModel directory structure:\n' - 'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory') + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_model")) + logger.warn.assert_called_with( + "Your model will NOT be servable with SageMaker TensorFlow Serving container. " + "The model artifact was not saved in the TensorFlow " + "SavedModel directory structure:\n" + "https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory" + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_wrong_parent_dir(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_wrong_parent_dir')) - logger.warn.assert_called_with('Your model will NOT be servable with SageMaker TensorFlow Serving containers. ' - 'The SavedModel bundle is under directory \"{}\", not a numeric name.' - .format('not-digit')) + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_wrong_parent_dir")) + logger.warn.assert_called_with( + "Your model will NOT be servable with SageMaker TensorFlow Serving containers. 
" + 'The SavedModel bundle is under directory "{}", not a numeric name.'.format("not-digit") + ) -@patch('sagemaker_tensorflow_container.training.logger') +@patch("sagemaker_tensorflow_container.training.logger") def test_log_model_missing_warning_correct(logger): - training._log_model_missing_warning(os.path.join(RESOURCE_PATH, 'test_dir_correct_model')) + training._log_model_missing_warning(os.path.join(RESOURCE_PATH, "test_dir_correct_model")) logger.warn.assert_not_called() -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_training.environment.Environment') -@patch('sagemaker_training.environment.read_hyperparameters', return_value={}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch("sagemaker_training.environment.read_hyperparameters", return_value={}) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() read_hyperparameters.assert_called_once_with() training_env.assert_called_once_with(hyperparameters={}) @@ -255,46 +308,71 @@ def test_main(configure_s3_env, read_hyperparameters, training_env, configure_s3_env.assert_called_once() -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') 
-@patch('sagemaker_training.environment.Environment') -@patch('sagemaker_training.environment.read_hyperparameters', return_value={'model_dir': MODEL_DIR}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main_simple_training_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch("sagemaker_training.environment.read_hyperparameters", return_value={"model_dir": MODEL_DIR}) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_simple_training_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() configure_s3_env.assert_called_once_with(MODEL_DIR, REGION) -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_training.environment.Environment') -@patch('sagemaker_training.environment.read_hyperparameters', return_value={'model_dir': MODEL_DIR, - '_tuning_objective_metric': 'auc'}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main_tuning_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch( + "sagemaker_training.environment.read_hyperparameters", + return_value={"model_dir": MODEL_DIR, 
"_tuning_objective_metric": "auc"}, +) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_tuning_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() - expected_model_dir = '{}/{}/model'.format(MODEL_DIR, single_machine_training_env.job_name) + expected_model_dir = "{}/{}/model".format(MODEL_DIR, single_machine_training_env.job_name) configure_s3_env.assert_called_once_with(expected_model_dir, REGION) -@patch('sagemaker_tensorflow_container.training.logger') -@patch('sagemaker_tensorflow_container.training.train') -@patch('logging.Logger.setLevel') -@patch('sagemaker_training.environment.Environment') -@patch('sagemaker_training.environment.read_hyperparameters', return_value={'model_dir': '/opt/ml/model', - '_tuning_objective_metric': 'auc'}) -@patch('sagemaker_tensorflow_container.s3_utils.configure') -def test_main_tuning_mpi_model_dir(configure_s3_env, read_hyperparameters, training_env, - set_level, train, logger, single_machine_training_env): +@patch("sagemaker_tensorflow_container.training.logger") +@patch("sagemaker_tensorflow_container.training.train") +@patch("logging.Logger.setLevel") +@patch("sagemaker_training.environment.Environment") +@patch( + "sagemaker_training.environment.read_hyperparameters", + return_value={"model_dir": "/opt/ml/model", "_tuning_objective_metric": "auc"}, +) +@patch("sagemaker_tensorflow_container.s3_utils.configure") +def test_main_tuning_mpi_model_dir( + configure_s3_env, + read_hyperparameters, + training_env, + set_level, + train, + logger, + single_machine_training_env, +): training_env.return_value = single_machine_training_env - os.environ['SAGEMAKER_REGION'] = REGION + os.environ["SAGEMAKER_REGION"] = REGION training.main() - 
configure_s3_env.assert_called_once_with('/opt/ml/model', REGION) + configure_s3_env.assert_called_once_with("/opt/ml/model", REGION) diff --git a/test/utils/image_utils.py b/test/utils/image_utils.py index c7158269..9fe5b590 100644 --- a/test/utils/image_utils.py +++ b/test/utils/image_utils.py @@ -16,35 +16,48 @@ import subprocess import sys -CYAN_COLOR = '\033[36m' -END_COLOR = '\033[0m' -DLC_AWS_ID = '763104351884' +CYAN_COLOR = "\033[36m" +END_COLOR = "\033[0m" +DLC_AWS_ID = "763104351884" -def build_image(framework_version, dockerfile, image_uri, region, cwd='.'): - _check_call('python setup.py sdist') +def build_image(framework_version, dockerfile, image_uri, region, cwd="."): + _check_call("python setup.py sdist") - if 'dlc' in dockerfile: + if "dlc" in dockerfile: ecr_login(region, DLC_AWS_ID) - dockerfile_location = os.path.join('test', 'container', framework_version, dockerfile) + dockerfile_location = os.path.join("test", "container", framework_version, dockerfile) subprocess.check_call( - ['docker', 'build', '-t', image_uri, '-f', dockerfile_location, '--build-arg', - 'region={}'.format(region), cwd], cwd=cwd) - print('created image {}'.format(image_uri)) + [ + "docker", + "build", + "-t", + image_uri, + "-f", + dockerfile_location, + "--build-arg", + "region={}".format(region), + cwd, + ], + cwd=cwd, + ) + print("created image {}".format(image_uri)) return image_uri def push_image(ecr_image, region, aws_id): ecr_login(region, aws_id) - _check_call('docker push {}'.format(ecr_image)) + _check_call("docker push {}".format(ecr_image)) def ecr_login(region, aws_id): - login = _check_call('aws ecr get-login --registry-ids {} '.format(aws_id) - + '--no-include-email --region {}'.format(region)) - _check_call(login.decode('utf-8').rstrip('\n')) + login = _check_call( + "aws ecr get-login --registry-ids {} ".format(aws_id) + + "--no-include-email --region {}".format(region) + ) + _check_call(login.decode("utf-8").rstrip("\n")) def _check_call(cmd, *popenargs, 
**kwargs): @@ -55,5 +68,5 @@ def _check_call(cmd, *popenargs, **kwargs): def _print_cmd(cmd): - print('executing docker command: {}{}{}'.format(CYAN_COLOR, ' '.join(cmd), END_COLOR)) + print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) sys.stdout.flush()