Fix/ci (#419)
* deprecation: drop py2 support, Update python and other CI

* update image uris

* update tf requirement

* test only py39

* fix tox

* correct docker image URI

* Trigger CI

* fix tf dependency

* test with tf 2.8

* test with tf 2.5 and py37

* change everything to 2.5.0

* fix tox.ini

* fix: pin protobuf version

* fix: dlc name

* fix: don't touch docker dir

* revert changes in docker/

* fix: revert more changes

* test: use p3 instances

* fix: protobuf version

* update: bump pip version

* reduce protobuf version

* fix: use py37

* update: use py38

* install tfio, separate tf builds

* fix: install ssh

* no-op to retrigger cb

* Revert "no-op to retrigger cb"

This reverts commit 9582aee.

* update: use python 3.8 for ec2 env

* trigger ci

* install ssh in gen gpu

* fix nvidia gpg

* use mnist custom; update instance type; reduce reruns

* fix gpg issue again

* reduce horovod tests; fix test_mnist flake8 issues

* fix: use p3.16 and skip gen

* protobuf and use p3.2xl

Co-authored-by: Satish Pasumarthi <spasuma@amazon.com>
nish21 and satishpasumarthi authored Jun 3, 2022
1 parent 777d9fc commit a58d124
Showing 23 changed files with 257 additions and 119 deletions.
2 changes: 1 addition & 1 deletion .coveragerc_py36 → .coveragerc_py38
@@ -17,4 +17,4 @@ partial_branches =

show_missing = True

fail_under = 90
fail_under = 90
10 changes: 5 additions & 5 deletions .coveragerc_py27 → .coveragerc_py39
@@ -5,16 +5,16 @@ timid = True
[report]
exclude_lines =
pragma: no cover
pragma: py2 no cover
if six.PY3
elif six.PY3
pragma: py3 no cover
if six.PY2
elif six.PY2

partial_branches =
pragma: no cover
pragma: py2 no cover
pragma: py3 no cover
if six.PY3
elif six.PY3

show_missing = True

fail_under = 75
fail_under = 90
6 changes: 3 additions & 3 deletions buildspec-dlc-cpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.3.1'
FRAMEWORK_VERSION: '2.7.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
ECR_REPO: 'sagemaker-test'

@@ -27,11 +27,11 @@ phases:
- TEST_OPTS=" --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"

# run local CPU integration tests (build and push the image to ECR repo)
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image $TEST_OPTS"
- test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/local --build-image --push-image $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-cpu-tests.yml"

# run sagemaker CPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $CPU_INSTANCE_TYPE $TEST_OPTS"
- test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/sagemaker -n auto --reruns 1 --reruns-delay 15 --instance-type $CPU_INSTANCE_TYPE $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-cpu-tests.yml"
finally:
# remove ECR image
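Note: once the buildspec variables are substituted, the local DLC CPU test command above expands to roughly the following (a sketch — the account ID and the DLC_CPU_TAG value are illustrative, since the tag definition sits outside the visible hunk):

    # illustrative expansion; account id, tag, and region are placeholders
    IGNORE_COVERAGE=- tox -e py38 -- test/integration/local --build-image --push-image \
        --dockerfile-type dlc.cpu --region us-west-2 --docker-base-name sagemaker-test \
        --account-id 123456789012 --framework-version 2.7.1 --processor cpu \
        --tag 2.7.1-dlc-cpu-<BUILD_ID>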
14 changes: 7 additions & 7 deletions buildspec-dlc-gpu-tests.yml
@@ -2,13 +2,13 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.3.1'
GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
FRAMEWORK_VERSION: '2.7.1'
GPU_INSTANCE_TYPE: 'ml.p3.2xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'
SETUP_CMDS: '#!/bin/bash\npython3 -m pip install --upgrade pip==21.3.1\npython3 -m pip install -U .\npython3 -m pip install -U .[test]'

phases:
pre_build:
@@ -32,7 +32,7 @@ phases:
- TEST_OPTS=" --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"

# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
- python3.6 setup.py sdist
- python3 setup.py sdist
- build_dir="test/container/$FRAMEWORK_VERSION"
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
@@ -48,12 +48,12 @@ phases:

# run dlc gpu local tests on remote host
- printf "$SETUP_CMDS" > $SETUP_FILE
- dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local $TEST_OPTS"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- dlc_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/local $TEST_OPTS"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --python-version 3.8"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-gpu-tests.yml"

# run GPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $GPU_INSTANCE_TYPE $TEST_OPTS"
- test_cmd="IGNORE_COVERAGE=- tox -e py38 -- test/integration/sagemaker -n auto --reruns 1 --reruns-delay 15 --instance-type $GPU_INSTANCE_TYPE $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-gpu-tests.yml"
finally:
# shut down remote GPU instance
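Note: printf expands the \n escapes in SETUP_CMDS when writing $SETUP_FILE, so the setup_cmds.sh executed on the remote GPU host should read approximately:

    # expected contents of setup_cmds.sh (expansion of SETUP_CMDS above)
    #!/bin/bash
    python3 -m pip install --upgrade pip==21.3.1
    python3 -m pip install -U .
    python3 -m pip install -U .[test]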
22 changes: 3 additions & 19 deletions buildspec-gen-cpu-tests.yml
@@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.3.1'
FRAMEWORK_VERSION: '2.7.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
ECR_REPO: 'sagemaker-test'

@@ -17,22 +17,6 @@ phases:

build:
commands:
# no-op tests to prioritize dlc tests
- TOX_PARALLEL_NO_SPINNER=1
- PY_COLORS=0

# define tags
- GEN_CPU_TAG="$FRAMEWORK_VERSION-gen-cpu-$BUILD_ID"

# establish common test options
- TEST_OPTS=" --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GEN_CPU_TAG"

# run local CPU integration tests (build and push the image to ECR repo)
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-cpu-tests.yml"

# run CPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $CPU_INSTANCE_TYPE $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-cpu-tests.yml"
finally:
# remove ECR image
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GEN_CPU_TAG
- PY_COLORS=0
46 changes: 5 additions & 41 deletions buildspec-gen-gpu-tests.yml
@@ -2,13 +2,13 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.3.1'
GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
FRAMEWORK_VERSION: '2.7.1'
GPU_INSTANCE_TYPE: 'ml.p3.16xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
DLC_ACCOUNT: '763104351884'
SETUP_FILE: 'setup_cmds.sh'
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'
SETUP_CMDS: '#!/bin/bash\npython3 -m pip install --upgrade pip==21.3.1\npython3 -m pip install -U .\npython3 -m pip install -U .[test]'

phases:
pre_build:
@@ -22,42 +22,6 @@ phases:

build:
commands:
# no-op tests to prioritize dlc tests
- TOX_PARALLEL_NO_SPINNER=1
- PY_COLORS=0

# define tags
- GEN_GPU_TAG="$FRAMEWORK_VERSION-gen-gpu-$BUILD_ID"

# establish common test options
- TEST_OPTS=" --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GEN_GPU_TAG"

# build Generic GPU image on build host instead of GPU instance
- python3.6 setup.py sdist
- build_dir="test/container/$FRAMEWORK_VERSION"
- docker build -f "$build_dir/Dockerfile.tf" -t $PREPROD_IMAGE:$GEN_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
# push Generic GPU image to ECR
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
- docker push $PREPROD_IMAGE:$GEN_GPU_TAG

# launch remote GPU instance
- prefix='ml.'
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
- create-key-pair
- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

# run generic gpu local tests on remote host
- printf "$SETUP_CMDS" > $SETUP_FILE
- generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local $TEST_OPTS"
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-gpu-tests.yml"

# run GPU sagemaker integration tests
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $GPU_INSTANCE_TYPE $TEST_OPTS"
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-gpu-tests.yml"
finally:
# shut down remote GPU instance
- cleanup-gpu-instances
- cleanup-key-pairs

# remove ECR image
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GEN_GPU_TAG
- PY_COLORS=0
2 changes: 1 addition & 1 deletion buildspec-release.yml
@@ -12,7 +12,7 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
tox -e py36,py37 --parallel all -- test/unit
tox -e py38 --parallel all -- test/unit

# publish the release to github
- git-release --publish
2 changes: 1 addition & 1 deletion buildspec.yml
@@ -10,4 +10,4 @@ phases:
- tox -e flake8,twine

# run unit tests
- tox -e py36,py37 --parallel all test/unit
- tox -e py38 --parallel all test/unit
Binary file not shown.
12 changes: 7 additions & 5 deletions setup.py
@@ -45,6 +45,7 @@ def read_version():
"botocore==1.19.34",
"requests-mock",
"awscli==1.18.194",
"protobuf>=3.20,<3.21"
]

if sys.version_info.major > 2:
@@ -53,26 +54,27 @@ def read_version():
setup(
name="sagemaker_tensorflow_training",
version=read_version(),
description="Open source library for creating "
"TensorFlow containers to run on Amazon SageMaker.",
description="Open source library for using "
"TensorFlow to train models on on Amazon SageMaker.",
packages=find_packages(where="src", exclude=("test",)),
package_dir={"": "src"},
py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
long_description=read("README.rst"),
author="Amazon Web Services",
url="https://github.com/aws/sagemaker-tensorflow-containers",
url="https://github.com/aws/sagemaker-tensorflow-training-toolkit",
license="Apache License 2.0",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Natural Language :: English",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
],
install_requires=[
"sagemaker-training>=3.7.1",
"sagemaker-training>=4.1.0",
"numpy",
"scipy",
"sklearn",
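Note: a quick sanity check that the new protobuf pin coexists with the TensorFlow build used in CI could look like this (a sketch — assumes TensorFlow is already installed in the active environment):

    # assumes TensorFlow is already present; prints the resolved versions
    python3 -m pip install -U ".[test]"
    python3 -c "import google.protobuf, tensorflow as tf; print(tf.__version__, google.protobuf.__version__)"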
6 changes: 3 additions & 3 deletions test/conftest.py
@@ -62,12 +62,12 @@
def pytest_addoption(parser):
parser.addoption("--build-image", "-B", action="store_true")
parser.addoption("--push-image", "-P", action="store_true")
parser.addoption("--dockerfile-type", "-T", choices=["dlc.cpu", "dlc.gpu", "tf"], default="tf")
parser.addoption("--dockerfile-type", "-T", choices=["dlc.cpu", "dlc.gpu", "tf.gpu", "tf.cpu"], default="tf.cpu")
parser.addoption("--dockerfile", "-D", default=None)
parser.addoption("--docker-base-name", default="sagemaker-tensorflow-training")
parser.addoption("--tag", default=None)
parser.addoption("--region", default="us-west-2")
parser.addoption("--framework-version", default="2.2.0")
parser.addoption("--framework-version", default="2.5.0")
parser.addoption("--processor", default="cpu", choices=["cpu", "gpu", "cpu,gpu"])
parser.addoption("--py-version", default="3", choices=["2", "3", "2,3"])
parser.addoption("--account-id", default="142577830533")
@@ -158,7 +158,7 @@ def account_id(request):
@pytest.fixture
def instance_type(request, processor):
provided_instance_type = request.config.getoption("--instance-type")
default_instance_type = "ml.c4.xlarge" if processor == "cpu" else "ml.p2.xlarge"
default_instance_type = "ml.c4.xlarge" if processor == "cpu" else "ml.p3.2xlarge"
return provided_instance_type if provided_instance_type is not None else default_instance_type


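Note: with the new --dockerfile-type choices, a local run of the generic CPU integration tests could be invoked as below (hypothetical example — the option names and defaults come from conftest.py above; the specific combination is illustrative):

    # hypothetical invocation; option values are illustrative
    IGNORE_COVERAGE=- tox -e py38 -- test/integration/local --build-image \
        --dockerfile-type tf.cpu --framework-version 2.7.1 --processor cpu \
        --docker-base-name sagemaker-tensorflow-training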
6 changes: 0 additions & 6 deletions test/container/2.2.0/Dockerfile.dlc.cpu

This file was deleted.

6 changes: 0 additions & 6 deletions test/container/2.2.0/Dockerfile.dlc.gpu

This file was deleted.

7 changes: 0 additions & 7 deletions test/container/2.3.1/Dockerfile.tf

This file was deleted.

test/container/2.7.1/Dockerfile.dlc.cpu (file path inferred from the diff contents)
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.3.1-cpu-py37
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-cpu-py38-ubuntu20.04-sagemaker

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
test/container/2.7.1/Dockerfile.dlc.gpu (file path inferred from the diff contents)
@@ -1,5 +1,5 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu110-ubuntu18.04
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-gpu-py38-cu112-ubuntu20.04-sagemaker

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
test/container/2.7.1/Dockerfile.tf.cpu (file path inferred from the diff contents)
@@ -1,7 +1,9 @@
FROM tensorflow/tensorflow:2.3.0-gpu
FROM tensorflow/tensorflow:2.7.1

ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
RUN pip install --no-cache-dir tensorflow-io
RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
13 changes: 13 additions & 0 deletions test/container/2.7.1/Dockerfile.tf.gpu
@@ -0,0 +1,13 @@
FROM tensorflow/tensorflow:2.7.1-gpu

ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
RUN pip install --no-cache-dir tensorflow-io
RUN apt-key del 7fa2af80 \
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
&& apt-get update \
&& apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
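Note: because this Dockerfile copies the sdist tarball from dist/, building the generic GPU test image locally would look roughly like the following (a sketch — the image tag is illustrative):

    # run from the repository root; the tag is illustrative
    python3 setup.py sdist
    docker build -f test/container/2.7.1/Dockerfile.tf.gpu \
        -t sagemaker-tensorflow-training:2.7.1-gen-gpu .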
2 changes: 1 addition & 1 deletion test/integration/local/test_horovod.py
@@ -35,7 +35,7 @@ def test_distributed_training_horovod_gpu(

@pytest.mark.skip_gpu
@pytest.mark.skip_generic
@pytest.mark.parametrize("instances, processes", [(1, 2), (2, 1), (2, 2), (5, 2)])
@pytest.mark.parametrize("instances, processes", [(2, 2)])
def test_distributed_training_horovod_cpu(
instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version
):
10 changes: 5 additions & 5 deletions test/integration/local/test_training.py
@@ -27,7 +27,7 @@ def py_full_version(py_version): # noqa: F811
if py_version == "2":
return "2.7"
else:
return "3.6"
return "3.8"


@pytest.mark.skip_gpu
@@ -46,13 +46,13 @@ def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version
_assert_files_exist_in_tar(output_path, ["my_model.h5"])


@pytest.mark.skip_gpu
@pytest.mark.skip
def test_distributed_training_cpu_no_ps(
sagemaker_local_session, image_uri, tmpdir, framework_version
):
output_path = "file://{}".format(tmpdir)
run_tf_training(
script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"),
script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"),
instance_type="local",
instance_count=2,
sagemaker_local_session=sagemaker_local_session,
@@ -66,11 +66,11 @@ def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version):
_assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES)


@pytest.mark.skip_gpu
@pytest.mark.skip
def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version):
output_path = "file://{}".format(tmpdir)
run_tf_training(
script=os.path.join(RESOURCE_PATH, "mnist", "mnist_estimator.py"),
script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"),
instance_type="local",
instance_count=2,
sagemaker_local_session=sagemaker_local_session,