CI #887
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Do not edit this file! It has been generated by .github/gen-workflow-ci.py
name: CI

on:
  schedule:
    # run a build on master every day at 10am
    # (this does not publish test results or cancel concurrent builds)
    - cron: '0 10 * * *'
  push:
    # only pushes to master, hotfix branches and version tags are considered;
    # otherwise modify job.config.outputs.push
    branches:
      - 'master'
      - 'hotfix-*'
    tags:
      - 'v*.*.*'
  pull_request:
    # only pull requests into master are considered
    branches:
      - master
  workflow_dispatch:

# no token permissions are granted at workflow level
permissions: {}
concurrency:
  cancel-in-progress: true
  # The group below decides which concurrent builds cancel each other:
  # - no concurrent builds on a branch (pull_request)
  # - no concurrent builds on the same commit on master (push)
  # - no concurrent builds on the same commit on a tag (push)
  # - concurrent runs on the same commit on master and its tag are allowed (push)
  # - concurrent runs on the same commit on master (push) and a scheduled
  #   build (schedule) are allowed
  #
  # A pull_request event only runs on a branch commit, a push event only on a
  # master or tag commit, and a schedule event only on the master HEAD commit.
  #
  # github.ref is something like refs/heads/master, refs/tags/v0.22.1 or the
  # branch; including it keeps runs on master and on a tag that share the same
  # commit from cancelling each other.
  # github.head_ref is the branch of the pull request. On master it is empty,
  # so the commit SHA is used instead: individual commits to master are never
  # cancelled, while a branch can only have one concurrent build.
  #
  # The event name is included so that scheduled and master builds can run
  # concurrently.
  group: ci-${{ github.event_name }}-${{ github.ref }}-${{ github.head_ref || github.sha }}
jobs:
  # Uploads the file at github.event_path as an artifact called "Event File".
  event_file:
    runs-on: ubuntu-latest
    name: 'Event File'
    steps:
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          path: ${{ github.event_path }}
          name: 'Event File'
setup-py:
  # Builds a source distribution with setup.py and pip-installs the resulting
  # tarball, with all HOROVOD_WITHOUT_* switches below set to 1.
  name: "setup.py"
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted: an unquoted version is parsed as a YAML float, which would
        # silently turn e.g. 3.10 into 3.1 on a later version bump
        python-version: '3.8'
    - name: Test setup.py
      env:
        # NOTE(review): presumably these exclude the respective frameworks and
        # controllers from the build - confirm against setup.py
        HOROVOD_WITHOUT_TENSORFLOW: '1'
        HOROVOD_WITHOUT_PYTORCH: '1'
        HOROVOD_WITHOUT_MXNET: '1'
        HOROVOD_WITHOUT_GLOO: '1'
        HOROVOD_WITHOUT_MPI: '1'
      run: |
        python -m pip install --upgrade pip
        python -m pip install setuptools wheel
        python setup.py sdist
        pip -v install dist/horovod-*.tar.gz
init-workflow:
  # Verifies that the generated workflow file is current, decides whether
  # builds and tests are needed, derives Buildkite metadata (exposed as job
  # outputs) and, for pull requests, uploads a small "PR Meta" artifact.
  name: "Init Workflow"
  runs-on: ubuntu-latest
  outputs:
    # false only for scheduled runs in repositories other than horovod/horovod
    run-at-all: ${{ github.event_name != 'schedule' || github.repository == 'horovod/horovod' }}
    # if we don't get a clear 'false', we fall back to building and testing
    run-builds-and-tests: ${{ steps.tests.outputs.needed != 'false' }}
    buildkite-branch-label: "${{ steps.config-buildkite.outputs.branch-label }}"
    buildkite-message: "${{ steps.config-buildkite.outputs.message }}"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted: an unquoted version is parsed as a YAML float, which would
        # silently turn e.g. 3.10 into 3.1 on a later version bump
        python-version: '3.8'
    - name: Pip install dependencies
      run: pip install -r .github/requirements.txt
    - name: Check ci.yaml is up-to-date
      run: |
        # regenerate the workflow; any diff means the committed file is stale
        python .github/gen-workflow-ci.py
        if [[ $(git diff .github/workflows/ci.yaml | wc -l) -gt 0 ]]
        then
          echo "::error::Workflow file .github/workflows/ci.yaml is out-dated, please run .github/gen-workflow-ci.py and commit changes"
          exit 1
        fi
      shell: bash
    - name: Check if tests are needed
      id: tests
      env:
        # NOTE(review): presumably read by get-changed-code-files.py - confirm
        GITHUB_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        GITHUB_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        if [[ "${{ github.event_name }}" == "pull_request" ]]
        then
          changes="$(python .github/get-changed-code-files.py)"
          if [[ -z "$changes" ]]
          then
            echo "No code changes, no need to build and test"
            echo "needed=false" >> "$GITHUB_OUTPUT"
          else
            echo "Code changes, we need to build and test:"
            echo "$changes"
            echo "needed=true" >> "$GITHUB_OUTPUT"
          fi
        else
          echo "This is not part of a pull request, we need to build and test"
          echo "needed=true" >> "$GITHUB_OUTPUT"
        fi
    - name: Configure Buildkite Build
      id: config-buildkite
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Event-derived values are passed through the environment instead of
        # being expanded into the script text: the pull request head ref is
        # chosen by the PR author, so inlining it would allow shell injection.
        EVENT_NAME: ${{ github.event_name }}
        BRANCH_REF: ${{ github.event.pull_request.head.ref || github.ref }}
        PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        # strip the refs/heads/ or refs/tags/ prefix, if any
        branch="${BRANCH_REF}"
        branch="${branch#"refs/heads/"}"
        branch="${branch#"refs/tags/"}"
        branch_label="${branch}"
        if [[ "${EVENT_NAME}" == "schedule" ]]
        then
          # we add this label to the branch used by Buildkite to avoid it cancelling one of concurrent schedule and push builds on master
          branch_label="${branch} (schedule)"
        fi
        echo "branch-label=${branch_label}" >> "$GITHUB_OUTPUT"
        if [[ "${EVENT_NAME}" == "pull_request" ]]
        then
          # first line of the head commit's message
          message="$(gh api "https://api.github.com/repos/horovod/horovod/commits/${PR_HEAD_SHA}" -q .commit.message | head -n1)"
          echo "message=${message}" >> "$GITHUB_OUTPUT"
        fi
    - name: Provide PR meta
      if: github.event_name == 'pull_request'
      run: |
        # writes a one-line JSON object with the merge, base and head SHAs
        rm -f pr.json
        echo -n "{" >> pr.json
        echo -n " \"merge_sha\": \"${{ github.sha }}\"," >> pr.json
        echo -n " \"base_sha\": \"${{ github.event.pull_request.base.sha }}\"," >> pr.json
        echo -n " \"head_sha\": \"${{ github.event.pull_request.head.sha }}\" " >> pr.json
        echo -n "}" >> pr.json
        cat pr.json
    - name: Upload PR meta
      uses: actions/upload-artifact@v4
      if: github.event_name == 'pull_request'
      with:
        name: PR Meta
        path: pr.json
build-and-test:
  # Runs only when init-workflow reported both run-at-all and
  # run-builds-and-tests as 'true'. One matrix job per docker compose image.
  name: 'Build and Test (${{ matrix.image }})'
  runs-on: ubuntu-latest
  needs:
    - init-workflow
  if: >-
    needs.init-workflow.outputs.run-at-all == 'true'
    && needs.init-workflow.outputs.run-builds-and-tests == 'true'
  strategy:
    fail-fast: false
    max-parallel: 10
    matrix:
      # Each entry names an image, the build timeout passed to the Build step
      # (in minutes), and one `<Test_Name>: true` flag per test that the
      # flag-gated steps of this job should run for that image.
      include:
        - image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_8_1-mxnet1_5_1_p0-pyspark3_4_0
          build_timeout: 30
          Elastic_Spark_TensorFlow_Tests_2: true
          Elastic_Tests_2: true
          Gloo_Cluster_PyTests: true
          Gloo_Keras_MNIST: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_MNIST: true
          Single_Keras_MNIST: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Keras_MNIST: true
          Spark_Keras_Rossmann_Estimator: true
          Spark_Keras_Rossmann_Run: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_Torch_MNIST: true
        - image: test-cpu-gloo-py3_7-tf2_11_0-keras2_11_0-torch1_13_1-mxnet1_9_1-pyspark2_4_8
          build_timeout: 30
          Elastic_Spark_TensorFlow_Tests_1: true
          Elastic_Spark_Torch_Tests: true
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_7_0_p2-pyspark3_4_0
          build_timeout: 30
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Data_Service: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0
          build_timeout: 30
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_3_2
          build_timeout: 30
          Elastic_Spark_TensorFlow_Tests_1: true
          Elastic_Spark_Torch_Tests: true
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 30
          Elastic_Spark_TensorFlow_Tests_1: true
          Elastic_Spark_Torch_Tests: true
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-mpich-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 30
          MPI_Cluster_PyTests: true
          MPI_MXNet_MNIST_horovodrun: true
          MPI_Parallel_PyTests: true
          MPI_PyTorch_MNIST_api: true
          MPI_PyTorch_MNIST_horovodrun: true
          MPI_Single_PyTests: true
          MPI_TensorFlow_2_0_Keras_MNIST_api: true
          MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          MPI_TensorFlow_2_0_MNIST_api: true
          MPI_TensorFlow_2_0_MNIST_horovodrun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
        - image: test-cpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 30
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          MPI_Cluster_PyTests: true
          MPI_MXNet_MNIST_horovodrun: true
          MPI_Parallel_PyTests: true
          MPI_PyTorch_MNIST_api: true
          MPI_PyTorch_MNIST_horovodrun: true
          MPI_Single_PyTests: true
          MPI_TensorFlow_2_0_Keras_MNIST_api: true
          MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          MPI_TensorFlow_2_0_MNIST_api: true
          MPI_TensorFlow_2_0_MNIST_horovodrun: true
          Run_PyTests_test_interactiverun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        - image: test-cpu-openmpi-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 30
          MPI_Cluster_PyTests: true
          MPI_MXNet_MNIST_horovodrun: true
          MPI_Parallel_PyTests: true
          MPI_PyTorch_MNIST_api: true
          MPI_PyTorch_MNIST_horovodrun: true
          MPI_Single_PyTests: true
          MPI_TensorFlow_2_0_Keras_MNIST_api: true
          MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          MPI_TensorFlow_2_0_MNIST_api: true
          MPI_TensorFlow_2_0_MNIST_horovodrun: true
          Run_PyTests_test_interactiverun: true
          Single_MXNet_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        # The remaining images set no test flags at all, so no flag-gated test
        # step runs for them; they carry a longer build timeout.
        - image: test-gpu-gloo-py3_8-tf1_15_5-keras2_2_4-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0
          build_timeout: 40
        - image: test-gpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_8_0_p0-pyspark3_4_0
          build_timeout: 40
        - image: test-gpu-gloo-py3_8-tf2_11_1-keras2_11_0-torch1_13_1-mxnet1_8_0_p0-pyspark3_4_0
          build_timeout: 40
        - image: test-gpu-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 40
        - image: test-mixed-openmpi-gloo-py3_8-tf2_12_0-keras2_12_0-torch2_0_0-mxnet1_9_1-pyspark3_4_0
          build_timeout: 40
steps:
  - name: Clean up disk space
    # Deleting these paths frees 38 GB of disk space:
    #   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
    # but that sometimes takes 3-4 minutes, so only some sub-paths are deleted
    # that are known to be quick (10s) and still free 20 GB.
    run: |
      # print `df -h` inside a collapsible log group titled "$1"
      report_disk_space() {
        echo "::group::$1"
        df -h
        echo "::endgroup::"
      }

      report_disk_space "Disk space before clean up"
      # The glob in the first path is escaped so that $dir holds the pattern
      # itself; it is expanded where $dir is used unquoted below.
      for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
                 /usr/share/dotnet/shared \
                 /usr/local/lib/android/sdk/ndk \
                 /usr/local/lib/android/sdk/build-tools \
                 /opt/ghc
      do
        echo "::group::Deleting $dir"
        # size report is best effort only
        sudo du -hsc $dir | tail -n1 || true
        sudo rm -rf $dir
        echo "::endgroup::"
      done
      report_disk_space "Disk space after clean up"
- name: Checkout
  uses: actions/checkout@v4
  with:
    # also fetch all (nested) git submodules
    submodules: recursive
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    # quoted: an unquoted version is parsed as a YAML float, which would
    # silently turn e.g. 3.10 into 3.1 on a later version bump
    python-version: '3.8'
# Builds the docker compose service named after the matrix image. All test
# steps check steps.build.outcome, so this step's id must stay `build`.
- id: build
  name: Build
  env:
    COMPOSE_DOCKER_CLI_BUILD: 1
    DOCKER_BUILDKIT: 1
  run: |
    # NOTE(review): the arguments "<timeout>m 3 10" are presumably timeout,
    # number of attempts and delay - confirm in .github/timeout-and-retry.sh
    .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 \
      docker compose -f docker-compose.test.yml build ${{ matrix.image }}
# Elastic Spark TensorFlow Tests 1: up to three attempts. Each attempt needs a
# successful build and the matrix flag; attempt N+1 runs only when attempt N
# ended with outcome 'failure'. Attempts 1 and 2 are continue-on-error, so only
# a failing third attempt fails the job.
- id: Elastic_Spark_TensorFlow_Tests_1_run_1
  name: "Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_2
  name: "Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_3
  name: "Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# Elastic Spark TensorFlow Tests 2: up to three attempts. Each attempt needs a
# successful build and the matrix flag; attempt N+1 runs only when attempt N
# ended with outcome 'failure'. Attempts 1 and 2 are continue-on-error, so only
# a failing third attempt fails the job.
- id: Elastic_Spark_TensorFlow_Tests_2_run_1
  name: "Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_2
  name: "Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_3
  name: "Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# Elastic Spark Torch Tests: up to three attempts. Each attempt needs a
# successful build and the matrix flag; attempt N+1 runs only when attempt N
# ended with outcome 'failure'. Attempts 1 and 2 are continue-on-error, so only
# a failing third attempt fails the job.
- id: Elastic_Spark_Torch_Tests_run_1
  name: "Elastic Spark Torch Tests [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_2
  name: "Elastic Spark Torch Tests [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_3
  name: "Elastic Spark Torch Tests [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# Elastic Tests 1: up to three attempts. Each attempt needs a successful build
# and the matrix flag; attempt N+1 runs only when attempt N ended with outcome
# 'failure'. Attempts 1 and 2 are continue-on-error, so only a failing third
# attempt fails the job.
- id: Elastic_Tests_1_run_1
  name: "Elastic Tests 1 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_2
  name: "Elastic Tests 1 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_3
  name: "Elastic Tests 1 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# Elastic Tests 2: up to three attempts. Each attempt needs a successful build
# and the matrix flag; attempt N+1 runs only when attempt N ended with outcome
# 'failure'. Attempts 1 and 2 are continue-on-error, so only a failing third
# attempt fails the job.
- id: Elastic_Tests_2_run_1
  name: "Elastic Tests 2 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_2
  name: "Elastic Tests 2 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_3
  name: "Elastic Tests 2 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# Gloo Cluster PyTests: up to three attempts. Each attempt needs a successful
# build and the matrix flag; attempt N+1 runs only when attempt N ended with
# outcome 'failure'. Attempts 1 and 2 are continue-on-error, so only a failing
# third attempt fails the job. The container starts ssh before running pytest.
- id: Gloo_Cluster_PyTests_run_1
  name: "Gloo Cluster PyTests [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_2
  name: "Gloo Cluster PyTests [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_3
  name: "Gloo Cluster PyTests [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    out_dir="artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3"
    mkdir -p "$out_dir"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/$out_dir:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# Gloo Keras MNIST, first try: runs when the build step succeeded and the
# matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo Keras MNIST [attempt 1 of 3]"
  id: Gloo_Keras_MNIST_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo Keras MNIST, first retry: runs only when attempt 1 failed (and the run
# was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo Keras MNIST [attempt 2 of 3]"
  id: Gloo_Keras_MNIST_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo Keras MNIST, last retry: runs only when attempt 2 failed (and the run
# was not cancelled). continue-on-error is false, so a failure here fails the job.
- name: "Gloo Keras MNIST [attempt 3 of 3]"
  id: Gloo_Keras_MNIST_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo MXNet2 MNIST api, first try: runs when the build step succeeded and the
# matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet2 MNIST api [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST api, first retry: runs only when attempt 1 failed (and the
# run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo MXNet2 MNIST api [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST api, last retry: runs only when attempt 2 failed (and the
# run was not cancelled). continue-on-error is false, so a failure here fails the job.
- name: "Gloo MXNet2 MNIST api [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST horovodrun, first try: runs when the build step succeeded
# and the matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet2 MNIST horovodrun, first retry: runs only when attempt 1 failed
# (and the run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet2 MNIST horovodrun, last retry: runs only when attempt 2 failed
# (and the run was not cancelled). continue-on-error is false, so a failure
# here fails the job.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, first try: runs when the build step succeeded
# and the matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, first retry: runs only when attempt 1 failed
# (and the run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo MXNet MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, last retry: runs only when attempt 2 failed
# (and the run was not cancelled). continue-on-error is false, so a failure
# here fails the job.
- name: "Gloo MXNet MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo Parallel PyTests, first try: runs when the build step succeeded and the
# matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
# Inside the container, each test_*.py in /horovod/test/parallel is passed
# one at a time (xargs -n 1) to horovodrun with /pytest.sh.
- name: "Gloo Parallel PyTests [attempt 1 of 3]"
  id: Gloo_Parallel_PyTests_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo Parallel PyTests, first retry: runs only when attempt 1 failed (and the
# run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo Parallel PyTests [attempt 2 of 3]"
  id: Gloo_Parallel_PyTests_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo Parallel PyTests, last retry: runs only when attempt 2 failed (and the
# run was not cancelled). continue-on-error is false, so a failure here fails the job.
- name: "Gloo Parallel PyTests [attempt 3 of 3]"
  id: Gloo_Parallel_PyTests_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo PyTorch MNIST api, first try: runs when the build step succeeded and
# the matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo PyTorch MNIST api [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST api, first retry: runs only when attempt 1 failed (and
# the run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo PyTorch MNIST api [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST api, last retry: runs only when attempt 2 failed (and the
# run was not cancelled). continue-on-error is false, so a failure here fails the job.
- name: "Gloo PyTorch MNIST api [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST horovodrun, first try: runs when the build step succeeded
# and the matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo PyTorch MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo PyTorch MNIST horovodrun, first retry: runs only when attempt 1 failed
# (and the run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo PyTorch MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo PyTorch MNIST horovodrun, last retry: runs only when attempt 2 failed
# (and the run was not cancelled). continue-on-error is false, so a failure
# here fails the job.
- name: "Gloo PyTorch MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo Single PyTests, first try: runs when the build step succeeded and the
# matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
# Inside the container, each test_*.py in /horovod/test/single is passed one
# at a time (xargs -n 1) to /pytest_standalone.sh.
- name: "Gloo Single PyTests [attempt 1 of 3]"
  id: Gloo_Single_PyTests_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo Single PyTests, first retry: runs only when attempt 1 failed (and the
# run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo Single PyTests [attempt 2 of 3]"
  id: Gloo_Single_PyTests_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo Single PyTests, last retry: runs only when attempt 2 failed (and the
# run was not cancelled). continue-on-error is false, so a failure here fails the job.
- name: "Gloo Single PyTests [attempt 3 of 3]"
  id: Gloo_Single_PyTests_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, first try: runs when the build step
# succeeded and the matrix enables this test, unless the run was cancelled.
# A failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, first retry: runs only when attempt 1
# failed (and the run was not cancelled). A failure is tolerated so that
# attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, last retry: runs only when attempt 2
# failed (and the run was not cancelled). continue-on-error is false, so a
# failure here fails the job.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, first try: runs when the build
# step succeeded and the matrix enables this test, unless the run was
# cancelled. A failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, first retry: runs only when
# attempt 1 failed (and the run was not cancelled). A failure is tolerated so
# that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, last retry: runs only when
# attempt 2 failed (and the run was not cancelled). continue-on-error is
# false, so a failure here fails the job.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, first try: runs when the build step
# succeeded and the matrix enables this test, unless the run was cancelled.
# A failure is tolerated (continue-on-error) so that attempt 2 can retry.
# Inside the container the compute worker is started in the background (&)
# while the data-service example runs in the foreground; both are given
# /tmp/compute.json.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, first retry: runs only when attempt 1
# failed (and the run was not cancelled). A failure is tolerated so that
# attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, last retry: runs only when attempt 2
# failed (and the run was not cancelled). continue-on-error is false, so a
# failure here fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, first try: runs when the build step
# succeeded and the matrix enables this test, unless the run was cancelled.
# A failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, first retry: runs only when attempt 1
# failed (and the run was not cancelled). A failure is tolerated so that
# attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, last retry: runs only when attempt 2
# failed (and the run was not cancelled). continue-on-error is false, so a
# failure here fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, first try: runs when the build
# step succeeded and the matrix enables this test, unless the run was
# cancelled. A failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, first retry: runs only when
# attempt 1 failed (and the run was not cancelled). A failure is tolerated so
# that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, last retry: runs only when
# attempt 2 failed (and the run was not cancelled). continue-on-error is
# false, so a failure here fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, first try: runs when the build step succeeded
# and the matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, first retry: runs only when attempt 1 failed
# (and the run was not cancelled). A failure is tolerated so that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, last retry: runs only when attempt 2 failed
# (and the run was not cancelled). continue-on-error is false, so a failure
# here fails the job.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, first try: runs when the build step
# succeeded and the matrix enables this test, unless the run was cancelled.
# A failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, first retry: runs only when attempt 1
# failed (and the run was not cancelled). A failure is tolerated so that
# attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, last retry: runs only when attempt 2
# failed (and the run was not cancelled). continue-on-error is false, so a
# failure here fails the job.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow MNIST, first try: runs when the build step succeeded and the
# matrix enables this test, unless the run was cancelled. A failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow MNIST [attempt 1 of 3]"
  id: Gloo_TensorFlow_MNIST_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST }}
  run: |
    mkdir -p "artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1"
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
- name: "Gloo TensorFlow MNIST [attempt 2 of 3]" | |
id: Gloo_TensorFlow_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py | |
shell: bash | |
- name: "Gloo TensorFlow MNIST [attempt 3 of 3]" | |
id: Gloo_TensorFlow_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py | |
shell: bash | |
# MPI Cluster PyTests: starts sshd in the container, then runs
# test_static_run.py from /horovod/test/integration with pytest, writing the
# JUnit report to /artifacts/junit.mpi.static.xml. Retried up to three times.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job. Each attempt mounts its
# own artifacts/<image>/<id> directory at /artifacts.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Cluster PyTests [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI Cluster PyTests [ONECCL MPI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_mpi' into /mpirun_command,
# starts sshd and runs test_static_run.py with pytest. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI Cluster PyTests [ONECCL OFI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_ofi' into /mpirun_command,
# starts sshd and runs test_static_run.py with pytest. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI MXNet2 MNIST api: runs mxnet2_mnist.py directly with
# `--num-proc 2 --hosts localhost:2 --communication mpi`, retried up to three
# times.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job. Each attempt mounts its
# own artifacts/<image>/<id> directory at /artifacts.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI MXNet2 MNIST api [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI MXNet2 MNIST api [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI MXNet2 MNIST api [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# MPI MXNet2 MNIST horovodrun: runs mxnet2_mnist.py under the launcher command
# read from /mpirun_command in the container, with OMP_NUM_THREADS=1. Retried up
# to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
- name: "MPI MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
- name: "MPI MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun: runs mxnet_mnist.py under the launcher command
# read from /mpirun_command in the container, with OMP_NUM_THREADS=1. Retried up
# to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI MXNet MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun [ONECCL MPI]: inside the container, runs the
# contents of /oneccl_env as a command, writes '/mpirun_command_mpi' into
# /mpirun_command, then runs mxnet_mnist.py under the command read back from
# that file with OMP_NUM_THREADS=1. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun [ONECCL OFI]: inside the container, runs the
# contents of /oneccl_env as a command, writes '/mpirun_command_ofi' into
# /mpirun_command, then runs mxnet_mnist.py under the command read back from
# that file with OMP_NUM_THREADS=1. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI Parallel PyTests: in /horovod/test/parallel, feeds each test_*.py file
# (one per xargs invocation) to `/bin/bash /pytest.sh mpi`, launched through the
# command read from /mpirun_command. 15-minute timeout, retried up to three
# times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Parallel PyTests [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI Parallel PyTests [ONECCL MPI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_mpi' into /mpirun_command,
# then in /horovod/test/parallel feeds each test_*.py file to
# `/bin/bash /pytest.sh mpi` through the command read back from that file.
# 15-minute timeout, retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI Parallel PyTests [ONECCL OFI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_ofi' into /mpirun_command,
# then in /horovod/test/parallel feeds each test_*.py file to
# `/bin/bash /pytest.sh mpi` through the command read back from that file.
# 15-minute timeout, retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI PyTorch MNIST api: runs pytorch_mnist.py directly with
# `--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2
# --communication mpi`, retried up to three times.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job. Each attempt mounts its
# own artifacts/<image>/<id> directory at /artifacts.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI PyTorch MNIST api [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# MPI PyTorch MNIST api [ONECCL MPI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_mpi' into /mpirun_command,
# then runs pytorch_mnist.py with `--num-proc 2 --hosts localhost:2
# --communication mpi`. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# MPI PyTorch MNIST api [ONECCL OFI]: inside the container, runs the contents of
# /oneccl_env as a command, writes '/mpirun_command_ofi' into /mpirun_command,
# then runs pytorch_mnist.py with `--num-proc 2 --hosts localhost:2
# --communication mpi`. Retried up to three times.
# The `\$` keeps `$(...)` from being expanded by the runner's shell, so the
# substitution happens in the container's `bash -c`.
# Attempts 1 and 2 tolerate failure (continue-on-error) so the next attempt can
# run; attempt 3 does not, so its failure fails the job.
# `!cancelled()` lets these steps run after earlier test failures in the job,
# yet skips them once the workflow run has been cancelled.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
  continue-on-error: true
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
  continue-on-error: false
  if: ${{ !cancelled() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' }}
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Test "MPI PyTorch MNIST horovodrun": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_PyTorch_MNIST_horovodrun.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash, where the contents of
# /mpirun_command become the launcher prefix placed in front of `python pytorch_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI PyTorch MNIST horovodrun [ONECCL MPI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitutions to the container's bash: the contents of
# /oneccl_env are executed as a command (presumably oneCCL environment setup; file not visible
# here), /mpirun_command is overwritten with the path '/mpirun_command_mpi', and that file is
# then read back so /mpirun_command_mpi is what launches `python pytorch_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI PyTorch MNIST horovodrun [ONECCL OFI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitutions to the container's bash: the contents of
# /oneccl_env are executed as a command (presumably oneCCL environment setup; file not visible
# here), /mpirun_command is overwritten with the path '/mpirun_command_ofi', and that file is
# then read back so /mpirun_command_ofi is what launches `python pytorch_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI Single PyTests": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# Inside the container the command changes to /horovod/test/single and, via `xargs -n 1`, invokes
# /pytest_standalone.sh once per test_*.py file with `mpi` as the first argument.
# The container command is limited to 15 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI Single PyTests [attempt 1 of 3]" | |
id: MPI_Single_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [attempt 2 of 3]" | |
id: MPI_Single_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI Single PyTests [attempt 3 of 3]" | |
id: MPI_Single_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI Single PyTests [ONECCL MPI]": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests_ONECCL_MPI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash: the contents of /oneccl_env
# are executed there as a command (presumably oneCCL environment setup; file not visible here)
# and the path '/mpirun_command_mpi' is written into /mpirun_command. The command then changes to
# /horovod/test/single and, via `xargs -n 1`, invokes /pytest_standalone.sh once per test_*.py
# file with `mpi` as the first argument. The whole command is limited to 15 minutes.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI Single PyTests [ONECCL OFI]": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests_ONECCL_OFI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash: the contents of /oneccl_env
# are executed there as a command (presumably oneCCL environment setup; file not visible here)
# and the path '/mpirun_command_ofi' is written into /mpirun_command. The command then changes to
# /horovod/test/single and, via `xargs -n 1`, invokes /pytest_standalone.sh once per test_*.py
# file with `mpi` as the first argument. The whole command is limited to 15 minutes.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_api.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# The container runs tensorflow2_keras_mnist.py directly with the positional arguments
# `2 localhost:2 mpi`, limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash: the contents of /oneccl_env
# are executed there as a command (presumably oneCCL environment setup; file not visible here)
# and the path '/mpirun_command_mpi' is written into /mpirun_command. tensorflow2_keras_mnist.py
# is then run directly with the positional arguments `2 localhost:2 mpi`; how /mpirun_command is
# consumed afterwards is not visible in this step. The whole command is limited to 10 minutes.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash: the contents of /oneccl_env
# are executed there as a command (presumably oneCCL environment setup; file not visible here)
# and the path '/mpirun_command_ofi' is written into /mpirun_command. tensorflow2_keras_mnist.py
# is then run directly with the positional arguments `2 localhost:2 mpi`; how /mpirun_command is
# consumed afterwards is not visible in this step. The whole command is limited to 10 minutes.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash, where the contents of
# /mpirun_command become the launcher prefix placed in front of `python tensorflow2_keras_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitutions to the container's bash: the contents of
# /oneccl_env are executed as a command (presumably oneCCL environment setup; file not visible
# here), /mpirun_command is overwritten with the path '/mpirun_command_mpi', and that file is
# then read back so /mpirun_command_mpi is what launches `python tensorflow2_keras_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitutions to the container's bash: the contents of
# /oneccl_env are executed as a command (presumably oneCCL environment setup; file not visible
# here), /mpirun_command is overwritten with the path '/mpirun_command_ofi', and that file is
# then read back so /mpirun_command_ofi is what launches `python tensorflow2_keras_mnist.py`.
# The container command is limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api": one test expressed as a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_TensorFlow_2_0_MNIST_api.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# The container runs tensorflow2_mnist.py directly with the positional arguments
# `2 localhost:2 mpi`, limited to 10 minutes by /usr/bin/timeout.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api [ONECCL MPI]": a chain of up to three attempts.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI.
# Each attempt creates its own artifacts/<image>/<step id> directory and mounts it at /artifacts.
# In the command, `\$` defers the substitution to the container's bash: the contents of /oneccl_env
# are executed there as a command (presumably oneCCL environment setup; file not visible here)
# and the path '/mpirun_command_mpi' is written into /mpirun_command. tensorflow2_mnist.py is
# then run directly with the positional arguments `2 localhost:2 mpi`; how /mpirun_command is
# consumed afterwards is not visible in this step. The whole command is limited to 10 minutes.
# Attempt 1: the trailing `&& true` is a no-op; later attempts check the previous attempt there.
# continue-on-error keeps the job going so that a retry can follow.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only when attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 (final): runs only when attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure here is no longer tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Retry chain "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /oneccl_env)` is escaped so the substitution happens in the container's bash, which runs the
# file's contents as a command; then the string '/mpirun_command_ofi' is written to /mpirun_command.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Retry chain "MPI TensorFlow 2.0 MNIST horovodrun": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.MPI_TensorFlow_2_0_MNIST_horovodrun` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /mpirun_command)` is escaped so it is expanded by the container's bash: the contents of
# /mpirun_command become the command prefix placed in front of `python ...`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Retry chain "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI` to be truthy; each attempt gets its own
# artifacts directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# Inside the container: the contents of /oneccl_env are run as a command, '/mpirun_command_mpi' is
# written to /mpirun_command, and `\$(cat /mpirun_command)` then expands to that path, which is
# executed with `python ...` as its arguments.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Retry chain "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI` to be truthy; each attempt gets its own
# artifacts directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# Inside the container: the contents of /oneccl_env are run as a command, '/mpirun_command_ofi' is
# written to /mpirun_command, and `\$(cat /mpirun_command)` then expands to that path, which is
# executed with `python ...` as its arguments.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Retry chain "Run PyTests test_interactiverun": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Run_PyTests_test_interactiverun` to be truthy; the command is wrapped in `timeout 10m`.
# pytest runs integration/test_interactiverun.py from /horovod/test and writes its JUnit report to
# /artifacts/junit.mpi.integration.xml, i.e. into this attempt's own artifacts directory on the
# runner through the --volume mount, so attempts do not overwrite each other's reports.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Run PyTests test_interactiverun [attempt 1 of 3]" | |
id: Run_PyTests_test_interactiverun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Run PyTests test_interactiverun [attempt 2 of 3]" | |
id: Run_PyTests_test_interactiverun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Run PyTests test_interactiverun [attempt 3 of 3]" | |
id: Run_PyTests_test_interactiverun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Retry chain "Single Keras MNIST": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_Keras_MNIST` to be truthy; each attempt gets its own artifacts directory
# (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# The command runs keras_mnist_advanced.py directly with `--epochs 3 --batch-size 64`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single Keras MNIST [attempt 1 of 3]" | |
id: Single_Keras_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single Keras MNIST [attempt 2 of 3]" | |
id: Single_Keras_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single Keras MNIST [attempt 3 of 3]" | |
id: Single_Keras_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Retry chain "Single MXNet2 MNIST": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_MXNet2_MNIST` to be truthy; each attempt gets its own artifacts directory
# (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# The command runs mxnet2_mnist.py directly with `--epochs 3`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single MXNet2 MNIST [attempt 1 of 3]" | |
id: Single_MXNet2_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single MXNet2 MNIST [attempt 2 of 3]" | |
id: Single_MXNet2_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single MXNet2 MNIST [attempt 3 of 3]" | |
id: Single_MXNet2_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Retry chain "Single MXNet MNIST": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_MXNet_MNIST` to be truthy; each attempt gets its own artifacts directory
# (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# The command runs mxnet_mnist.py directly with `--epochs 3`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single MXNet MNIST [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single MXNet MNIST [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single MXNet MNIST [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Retry chain "Single MXNet MNIST [ONECCL MPI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_MXNet_MNIST_ONECCL_MPI` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /oneccl_env)` is escaped so the substitution happens in the container's bash, which runs the
# file's contents as a command; then the string '/mpirun_command_mpi' is written to /mpirun_command
# before mxnet_mnist.py is started directly with `--epochs 3`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Retry chain "Single MXNet MNIST [ONECCL OFI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_MXNet_MNIST_ONECCL_OFI` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /oneccl_env)` is escaped so the substitution happens in the container's bash, which runs the
# file's contents as a command; then the string '/mpirun_command_ofi' is written to /mpirun_command
# before mxnet_mnist.py is started directly with `--epochs 3`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Retry chain "Single PyTorch MNIST": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_PyTorch_MNIST` to be truthy; each attempt gets its own artifacts directory
# (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# The command runs pytorch_mnist.py directly with `--epochs 3 --data-dir /data/pytorch_datasets`.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single PyTorch MNIST [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single PyTorch MNIST [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single PyTorch MNIST [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Retry chain "Single PyTorch MNIST [ONECCL MPI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_PyTorch_MNIST_ONECCL_MPI` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /oneccl_env)` is escaped so the substitution happens in the container's bash, which runs the
# file's contents as a command; then the string '/mpirun_command_mpi' is written to /mpirun_command
# before pytorch_mnist.py is started directly.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Retry chain "Single PyTorch MNIST [ONECCL OFI]": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Single_PyTorch_MNIST_ONECCL_OFI` to be truthy; each attempt gets its own artifacts
# directory (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# `\$(cat /oneccl_env)` is escaped so the substitution happens in the container's bash, which runs the
# file's contents as a command; then the string '/mpirun_command_ofi' is written to /mpirun_command
# before pytorch_mnist.py is started directly.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Retry chain "Spark Keras MNIST": the same container command is tried up to 3 times.
# Every attempt requires the `build` step (id defined outside this excerpt) to have succeeded and
# `matrix.Spark_Keras_MNIST` to be truthy; each attempt gets its own artifacts directory
# (mounted at /artifacts) and the command is wrapped in `timeout 10m`.
# Inside the container, /spark_env.sh is invoked with OMP_NUM_THREADS=1 in its environment and
# receives the `python ... keras_spark_mnist.py ...` command line as its arguments.
# Attempt 1: first try (`&& true` does not change the condition); a failure is tolerated.
- name: "Spark Keras MNIST [attempt 1 of 3]" | |
id: Spark_Keras_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 2: only when attempt 1's outcome is 'failure'; a failure is tolerated.
- name: "Spark Keras MNIST [attempt 2 of 3]" | |
id: Spark_Keras_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 3: only when attempt 2's outcome is 'failure'; a failure is not masked (continue-on-error: false).
- name: "Spark Keras MNIST [attempt 3 of 3]" | |
id: Spark_Keras_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Spark Keras Rossmann Estimator example, tried up to three times. Every attempt | |
# runs keras_spark_rossmann_estimator.py inside the matrix image under | |
# `timeout 10m`, with its own artifacts/<image>/<step id> directory mounted at | |
# /artifacts. | |
# Attempt 1: gated on a successful build step and the | |
# matrix.Spark_Keras_Rossmann_Estimator flag; continue-on-error lets the retry | |
# steps run after a failure. | |
- name: "Spark Keras Rossmann Estimator [attempt 1 of 3]" | |
id: Spark_Keras_Rossmann_Estimator_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark Keras Rossmann Estimator [attempt 2 of 3]" | |
id: Spark_Keras_Rossmann_Estimator_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark Keras Rossmann Estimator [attempt 3 of 3]" | |
id: Spark_Keras_Rossmann_Estimator_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Spark Keras Rossmann Run example, tried up to three times. Every attempt runs | |
# keras_spark_rossmann_run.py inside the matrix image under `timeout 10m`, with | |
# its own artifacts/<image>/<step id> directory mounted at /artifacts. | |
# Attempt 1: gated on a successful build step and the | |
# matrix.Spark_Keras_Rossmann_Run flag; continue-on-error lets the retry steps | |
# run after a failure. | |
- name: "Spark Keras Rossmann Run [attempt 1 of 3]" | |
id: Spark_Keras_Rossmann_Run_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark Keras Rossmann Run [attempt 2 of 3]" | |
id: Spark_Keras_Rossmann_Run_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark Keras Rossmann Run [attempt 3 of 3]" | |
id: Spark_Keras_Rossmann_Run_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1" | |
shell: bash | |
# Spark Lightning MNIST example, tried up to three times. Every attempt runs | |
# pytorch_lightning_spark_mnist.py inside the matrix image under `timeout 10m`, | |
# with its own artifacts/<image>/<step id> directory mounted at /artifacts. | |
# Attempt 1: gated on a successful build step and the | |
# matrix.Spark_Lightning_MNIST flag; continue-on-error lets the retry steps run | |
# after a failure. | |
- name: "Spark Lightning MNIST [attempt 1 of 3]" | |
id: Spark_Lightning_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark Lightning MNIST [attempt 2 of 3]" | |
id: Spark_Lightning_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark Lightning MNIST [attempt 3 of 3]" | |
id: Spark_Lightning_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Spark PyTests, tried up to three times. Every attempt changes into | |
# /horovod/test/integration and invokes `/pytest_standalone.sh spark` once per | |
# test_spark*.py file (xargs -n 1), all under `timeout 30m`, with its own | |
# artifacts/<image>/<step id> directory mounted at /artifacts. | |
# Attempt 1: gated on a successful build step and the matrix.Spark_PyTests flag; | |
# continue-on-error lets the retry steps run after a failure. | |
- name: "Spark PyTests [attempt 1 of 3]" | |
id: Spark_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark PyTests [attempt 2 of 3]" | |
id: Spark_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark PyTests [attempt 3 of 3]" | |
id: Spark_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)" | |
shell: bash | |
# Spark TensorFlow 2.0 MNIST Data Service example, tried up to three times. | |
# Inside the container (under `timeout 10m`) every attempt first launches | |
# compute_worker.py through spark-submit in the background (`&`) and then runs | |
# tensorflow2_mnist_data_service.py through spark-submit; both commands are given | |
# /tmp/compute.json. Each attempt mounts its own artifacts/<image>/<step id> | |
# directory at /artifacts. | |
# Attempt 1: gated on a successful build step and the | |
# matrix.Spark_TensorFlow_2_0_MNIST_Data_Service flag; continue-on-error lets the | |
# retry steps run after a failure. | |
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]" | |
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]" | |
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]" | |
id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Spark Torch MNIST example, tried up to three times. Every attempt runs | |
# pytorch_spark_mnist.py inside the matrix image under `timeout 10m`, with its | |
# own artifacts/<image>/<step id> directory mounted at /artifacts. | |
# Attempt 1: gated on a successful build step and the matrix.Spark_Torch_MNIST | |
# flag; continue-on-error lets the retry steps run after a failure. | |
- name: "Spark Torch MNIST [attempt 1 of 3]" | |
id: Spark_Torch_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Spark Torch MNIST [attempt 2 of 3]" | |
id: Spark_Torch_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Spark Torch MNIST [attempt 3 of 3]" | |
id: Spark_Torch_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Uploads every *.xml file found under artifacts/<image>/ as one artifact named | |
# after the matrix image. always() makes the step run even when earlier steps | |
# failed; the contains() check limits it to images whose name includes '-cpu-'. | |
- name: Upload Test Results | |
uses: actions/upload-artifact@v4 | |
if: always() && contains(matrix.image, '-cpu-') | |
with: | |
name: Unit Test Results - ${{ matrix.image }} | |
path: artifacts/${{ matrix.image }}/**/*.xml | |
# Job build-and-test-heads: builds and tests each image listed in the matrix | |
# below. It depends on the init-workflow and build-and-test jobs and runs only | |
# when the two init-workflow outputs checked in `if` are both 'true'. | |
build-and-test-heads: | |
name: "Build and Test heads (${{ matrix.image }})" | |
needs: [init-workflow, build-and-test] | |
if: > | |
needs.init-workflow.outputs.run-at-all == 'true' && | |
needs.init-workflow.outputs.run-builds-and-tests == 'true' | |
runs-on: ubuntu-latest | |
# At most two matrix entries run at the same time; fail-fast is disabled, so a | |
# failing entry does not cancel the other one. | |
strategy: | |
max-parallel: 2 | |
fail-fast: false | |
matrix: | |
include: | |
# CPU image: each `<Test_Name>: true` flag is read by the `if` condition of the | |
# test steps whose id starts with that name (e.g. matrix.Elastic_Tests_1). | |
# build_timeout is passed, in minutes, to the Build step. | |
- image: test-cpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0 | |
Elastic_Tests_1: true | |
Gloo_Cluster_PyTests: true | |
Gloo_MXNet2_MNIST_api: true | |
Gloo_MXNet2_MNIST_horovodrun: true | |
Gloo_Parallel_PyTests: true | |
Gloo_PyTorch_MNIST_api: true | |
Gloo_PyTorch_MNIST_horovodrun: true | |
Gloo_Single_PyTests: true | |
Gloo_TensorFlow_2_0_Keras_MNIST_api: true | |
Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true | |
Gloo_TensorFlow_2_0_MNIST_Elastic_api: true | |
Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true | |
Gloo_TensorFlow_2_0_MNIST_api: true | |
Gloo_TensorFlow_2_0_MNIST_horovodrun: true | |
MPI_Cluster_PyTests: true | |
MPI_MXNet2_MNIST_api: true | |
MPI_MXNet2_MNIST_horovodrun: true | |
MPI_Parallel_PyTests: true | |
MPI_PyTorch_MNIST_api: true | |
MPI_PyTorch_MNIST_horovodrun: true | |
MPI_Single_PyTests: true | |
MPI_TensorFlow_2_0_Keras_MNIST_api: true | |
MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true | |
MPI_TensorFlow_2_0_MNIST_api: true | |
MPI_TensorFlow_2_0_MNIST_horovodrun: true | |
Run_PyTests_test_interactiverun: true | |
Single_MXNet2_MNIST: true | |
Single_PyTorch_MNIST: true | |
Spark_Lightning_MNIST: true | |
Spark_PyTests: true | |
Spark_TensorFlow_2_0_MNIST_Data_Service: true | |
Spark_Torch_MNIST: true | |
build_timeout: 30 | |
# GPU image: only build_timeout is set and no test flag is defined, so none of | |
# the flag-gated test steps are enabled for this entry. | |
- image: test-gpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_4_0 | |
build_timeout: 40 | |
steps: | |
# Frees disk space on the runner before the image build, printing `df -h` | |
# before and after inside collapsible log groups. | |
- name: Clean up disk space | |
# Deleting these paths would free 38 GB of disk space: | |
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc | |
# but that sometimes takes 3-4 minutes, | |
# so only some sub-paths are deleted that are known to be quick (10s) and 20 GB. | |
# The `*` in the first path is escaped in the `for` list and $dir is left | |
# unquoted for du/rm, so the glob is expanded only when those commands run. | |
run: | |
echo ::group::Disk space before clean up | |
df -h | |
echo ::endgroup:: | |
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ | |
/usr/share/dotnet/shared \ | |
/usr/local/lib/android/sdk/ndk \ | |
/usr/local/lib/android/sdk/build-tools \ | |
/opt/ghc | |
do | |
echo ::group::Deleting "$dir" | |
sudo du -hsc $dir | tail -n1 || true | |
sudo rm -rf $dir | |
echo ::endgroup:: | |
done | |
echo ::group::Disk space after clean up | |
df -h | |
echo ::endgroup:: | |
# Checks out the repository, fetching git submodules recursively. | |
- name: Checkout | |
uses: actions/checkout@v4 | |
with: | |
submodules: recursive | |
# Installs the Python interpreter used on the runner for the following steps.
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    # Quoted so YAML loads the version as a string. An unquoted X.Y value is
    # parsed as a float, which would silently turn e.g. 3.10 into 3.1.
    python-version: '3.8'
# Builds the matrix image with docker compose, wrapped in | |
# .github/timeout-and-retry.sh. The wrapper receives `<build_timeout>m 3 10` | |
# ahead of the command - presumably timeout, number of tries and delay between | |
# tries; confirm against the script. Later test steps check | |
# steps.build.outcome == 'success' through the `build` id. | |
- name: Build | |
id: build | |
run: | |
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker compose -f docker-compose.test.yml build ${{ matrix.image }} | |
# Both variables are set to 1 for the build command. | |
env: | |
COMPOSE_DOCKER_CLI_BUILD: 1 | |
DOCKER_BUILDKIT: 1 | |
# Elastic Spark TensorFlow Tests 1, tried up to three times. Every attempt runs | |
# pytest (--forked, debug logging) on test_elastic_spark_tensorflow2.py in | |
# /horovod/test/integration under `timeout 30m` and writes the JUnit report to | |
# /artifacts/junit.gloo.elastic.spark.tf.xml, where /artifacts is the attempt's | |
# own artifacts/<image>/<step id> directory. | |
# NOTE: no entry of this job's matrix sets matrix.Elastic_Spark_TensorFlow_Tests_1, | |
# so as the matrix stands the condition is not met for either image. | |
# Attempt 1: gated on a successful build step and the matrix flag; | |
# continue-on-error lets the retry steps run after a failure. | |
- name: "Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_1_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_1_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_1_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py" | |
shell: bash | |
# Elastic Spark TensorFlow Tests 2, tried up to three times. Every attempt runs | |
# pytest (--forked, debug logging) on test_elastic_spark_tensorflow.py in | |
# /horovod/test/integration under `timeout 30m` and writes the JUnit report to | |
# /artifacts/junit.gloo.elastic.spark.tf.xml, where /artifacts is the attempt's | |
# own artifacts/<image>/<step id> directory. | |
# NOTE: no entry of this job's matrix sets matrix.Elastic_Spark_TensorFlow_Tests_2, | |
# so as the matrix stands the condition is not met for either image. | |
# Attempt 1: gated on a successful build step and the matrix flag; | |
# continue-on-error lets the retry steps run after a failure. | |
- name: "Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_2_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_2_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]" | |
id: Elastic_Spark_TensorFlow_Tests_2_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py" | |
shell: bash | |
# Elastic Spark Torch Tests, tried up to three times. Every attempt runs pytest | |
# (--forked, debug logging) on test_elastic_spark_torch.py in | |
# /horovod/test/integration under `timeout 30m` and writes the JUnit report to | |
# /artifacts/junit.gloo.elastic.spark.torch.xml, where /artifacts is the | |
# attempt's own artifacts/<image>/<step id> directory. | |
# NOTE: no entry of this job's matrix sets matrix.Elastic_Spark_Torch_Tests, so | |
# as the matrix stands the condition is not met for either image. | |
# Attempt 1: gated on a successful build step and the matrix flag; | |
# continue-on-error lets the retry steps run after a failure. | |
- name: "Elastic Spark Torch Tests [attempt 1 of 3]" | |
id: Elastic_Spark_Torch_Tests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Elastic Spark Torch Tests [attempt 2 of 3]" | |
id: Elastic_Spark_Torch_Tests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Elastic Spark Torch Tests [attempt 3 of 3]" | |
id: Elastic_Spark_Torch_Tests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py" | |
shell: bash | |
# Elastic Tests 1, tried up to three times. Every attempt runs pytest (--forked, | |
# debug logging) on test_elastic_torch.py and test_elastic_tensorflow2.py in | |
# /horovod/test/integration under `timeout 15m` and writes the JUnit report to | |
# /artifacts/junit.gloo.elastic.xml, where /artifacts is the attempt's own | |
# artifacts/<image>/<step id> directory. | |
# Attempt 1: gated on a successful build step and the matrix.Elastic_Tests_1 | |
# flag; continue-on-error lets the retry steps run after a failure. | |
- name: "Elastic Tests 1 [attempt 1 of 3]" | |
id: Elastic_Tests_1_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Elastic Tests 1 [attempt 2 of 3]" | |
id: Elastic_Tests_1_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Elastic Tests 1 [attempt 3 of 3]" | |
id: Elastic_Tests_1_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py" | |
shell: bash | |
# Elastic Tests 2, tried up to three times. Every attempt runs pytest (--forked, | |
# debug logging) on test_elastic_torch.py, test_elastic_tensorflow.py and | |
# test_elastic_tensorflow_keras.py in /horovod/test/integration under | |
# `timeout 15m` and writes the JUnit report to /artifacts/junit.gloo.elastic.xml, | |
# where /artifacts is the attempt's own artifacts/<image>/<step id> directory. | |
# NOTE: no entry of this job's matrix sets matrix.Elastic_Tests_2, so as the | |
# matrix stands the condition is not met for either image. | |
# Attempt 1: gated on a successful build step and the matrix flag; | |
# continue-on-error lets the retry steps run after a failure. | |
- name: "Elastic Tests 2 [attempt 1 of 3]" | |
id: Elastic_Tests_2_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" | |
shell: bash | |
# Attempt 2: runs only when the outcome of attempt 1 is 'failure'. | |
- name: "Elastic Tests 2 [attempt 2 of 3]" | |
id: Elastic_Tests_2_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" | |
shell: bash | |
# Attempt 3: runs only when the outcome of attempt 2 is 'failure'; | |
# continue-on-error is false, so a failure of this last attempt is not ignored. | |
- name: "Elastic Tests 2 [attempt 3 of 3]" | |
id: Elastic_Tests_2_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py" | |
shell: bash | |
- name: "Gloo Cluster PyTests [attempt 1 of 3]" | |
id: Gloo_Cluster_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" | |
shell: bash | |
# Gloo Cluster PyTests, attempt 2 of 3: starts the ssh service and runs pytest on
# test_static_run.py (10 minute timeout), writing into this attempt's own artifacts directory.
# Retry: runs only when steps.Gloo_Cluster_PyTests_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo Cluster PyTests [attempt 2 of 3]" | |
id: Gloo_Cluster_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" | |
shell: bash | |
# Gloo Cluster PyTests, attempt 3 of 3: starts the ssh service and runs pytest on
# test_static_run.py (10 minute timeout), writing into this attempt's own artifacts directory.
# Last retry: runs only when steps.Gloo_Cluster_PyTests_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo Cluster PyTests [attempt 3 of 3]" | |
id: Gloo_Cluster_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py" | |
shell: bash | |
# Gloo Keras MNIST, attempt 1 of 3: in the docker compose service named by matrix.image, launches
# keras_mnist_advanced.py via `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_Keras_MNIST; a failure is tolerated
# (continue-on-error) so that attempt 2 can retry.
- name: "Gloo Keras MNIST [attempt 1 of 3]" | |
id: Gloo_Keras_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py | |
shell: bash | |
# Gloo Keras MNIST, attempt 2 of 3: launches keras_mnist_advanced.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Retry: runs only when steps.Gloo_Keras_MNIST_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo Keras MNIST [attempt 2 of 3]" | |
id: Gloo_Keras_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py | |
shell: bash | |
# Gloo Keras MNIST, attempt 3 of 3: launches keras_mnist_advanced.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_Keras_MNIST_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo Keras MNIST [attempt 3 of 3]" | |
id: Gloo_Keras_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py | |
shell: bash | |
# Gloo MXNet2 MNIST api, attempt 1 of 3: in the docker compose service named by matrix.image, runs
# mxnet2_mnist.py directly with python (--num-proc 2 --hosts localhost:2 --communication gloo)
# under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_MXNet2_MNIST_api; a failure is tolerated
# (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet2 MNIST api [attempt 1 of 3]" | |
id: Gloo_MXNet2_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo MXNet2 MNIST api, attempt 2 of 3: runs mxnet2_mnist.py directly with python
# (--num-proc 2 --hosts localhost:2 --communication gloo) under a 10 minute timeout.
# Retry: runs only when steps.Gloo_MXNet2_MNIST_api_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo MXNet2 MNIST api [attempt 2 of 3]" | |
id: Gloo_MXNet2_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo MXNet2 MNIST api, attempt 3 of 3: runs mxnet2_mnist.py directly with python
# (--num-proc 2 --hosts localhost:2 --communication gloo) under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_MXNet2_MNIST_api_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo MXNet2 MNIST api [attempt 3 of 3]" | |
id: Gloo_MXNet2_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo MXNet2 MNIST horovodrun, attempt 1 of 3: in the docker compose service named by
# matrix.image, launches mxnet2_mnist.py via `horovodrun -np 2 -H localhost:2 --gloo`
# under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_MXNet2_MNIST_horovodrun; a failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]" | |
id: Gloo_MXNet2_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py | |
shell: bash | |
# Gloo MXNet2 MNIST horovodrun, attempt 2 of 3: launches mxnet2_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Retry: runs only when steps.Gloo_MXNet2_MNIST_horovodrun_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]" | |
id: Gloo_MXNet2_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py | |
shell: bash | |
# Gloo MXNet2 MNIST horovodrun, attempt 3 of 3: launches mxnet2_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_MXNet2_MNIST_horovodrun_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]" | |
id: Gloo_MXNet2_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py | |
shell: bash | |
# Gloo MXNet MNIST horovodrun, attempt 1 of 3: in the docker compose service named by
# matrix.image, launches mxnet_mnist.py via `horovodrun -np 2 -H localhost:2 --gloo`
# under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_MXNet_MNIST_horovodrun; a failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo MXNet MNIST horovodrun [attempt 1 of 3]" | |
id: Gloo_MXNet_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py | |
shell: bash | |
# Gloo MXNet MNIST horovodrun, attempt 2 of 3: launches mxnet_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Retry: runs only when steps.Gloo_MXNet_MNIST_horovodrun_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo MXNet MNIST horovodrun [attempt 2 of 3]" | |
id: Gloo_MXNet_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py | |
shell: bash | |
# Gloo MXNet MNIST horovodrun, attempt 3 of 3: launches mxnet_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_MXNet_MNIST_horovodrun_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo MXNet MNIST horovodrun [attempt 3 of 3]" | |
id: Gloo_MXNet_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py | |
shell: bash | |
# Gloo Parallel PyTests, attempt 1 of 3: in the docker compose service named by matrix.image,
# changes into /horovod/test/parallel and feeds each test_*.py file one at a time (xargs -n 1)
# to `horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo`; 15 minute timeout.
# Gated on a successful build step and matrix.Gloo_Parallel_PyTests; a failure is tolerated
# (continue-on-error) so that attempt 2 can retry.
- name: "Gloo Parallel PyTests [attempt 1 of 3]" | |
id: Gloo_Parallel_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" | |
shell: bash | |
# Gloo Parallel PyTests, attempt 2 of 3: feeds each test_*.py file in /horovod/test/parallel
# one at a time to `horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo`;
# 15 minute timeout.
# Retry: runs only when steps.Gloo_Parallel_PyTests_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo Parallel PyTests [attempt 2 of 3]" | |
id: Gloo_Parallel_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" | |
shell: bash | |
# Gloo Parallel PyTests, attempt 3 of 3: feeds each test_*.py file in /horovod/test/parallel
# one at a time to `horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo`;
# 15 minute timeout.
# Last retry: runs only when steps.Gloo_Parallel_PyTests_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo Parallel PyTests [attempt 3 of 3]" | |
id: Gloo_Parallel_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)" | |
shell: bash | |
# Gloo PyTorch MNIST api, attempt 1 of 3: in the docker compose service named by matrix.image,
# runs pytorch_mnist.py directly with python (--data-dir /data/pytorch_datasets --num-proc 2
# --hosts localhost:2 --communication gloo) under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_PyTorch_MNIST_api; a failure is tolerated
# (continue-on-error) so that attempt 2 can retry.
- name: "Gloo PyTorch MNIST api [attempt 1 of 3]" | |
id: Gloo_PyTorch_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo PyTorch MNIST api, attempt 2 of 3: runs pytorch_mnist.py directly with python
# (--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo)
# under a 10 minute timeout.
# Retry: runs only when steps.Gloo_PyTorch_MNIST_api_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo PyTorch MNIST api [attempt 2 of 3]" | |
id: Gloo_PyTorch_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo PyTorch MNIST api, attempt 3 of 3: runs pytorch_mnist.py directly with python
# (--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo)
# under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_PyTorch_MNIST_api_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo PyTorch MNIST api [attempt 3 of 3]" | |
id: Gloo_PyTorch_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo | |
shell: bash | |
# Gloo PyTorch MNIST horovodrun, attempt 1 of 3: in the docker compose service named by
# matrix.image, launches pytorch_mnist.py (--data-dir /data/pytorch_datasets) via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_PyTorch_MNIST_horovodrun; a failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo PyTorch MNIST horovodrun [attempt 1 of 3]" | |
id: Gloo_PyTorch_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets | |
shell: bash | |
# Gloo PyTorch MNIST horovodrun, attempt 2 of 3: launches pytorch_mnist.py
# (--data-dir /data/pytorch_datasets) via `horovodrun -np 2 -H localhost:2 --gloo`
# under a 10 minute timeout.
# Retry: runs only when steps.Gloo_PyTorch_MNIST_horovodrun_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo PyTorch MNIST horovodrun [attempt 2 of 3]" | |
id: Gloo_PyTorch_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets | |
shell: bash | |
# Gloo PyTorch MNIST horovodrun, attempt 3 of 3: launches pytorch_mnist.py
# (--data-dir /data/pytorch_datasets) via `horovodrun -np 2 -H localhost:2 --gloo`
# under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_PyTorch_MNIST_horovodrun_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo PyTorch MNIST horovodrun [attempt 3 of 3]" | |
id: Gloo_PyTorch_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets | |
shell: bash | |
# Gloo Single PyTests, attempt 1 of 3: in the docker compose service named by matrix.image,
# changes into /horovod/test/single and feeds each test_*.py file one at a time (xargs -n 1)
# to `/bin/bash /pytest_standalone.sh gloo`; 15 minute timeout.
# Gated on a successful build step and matrix.Gloo_Single_PyTests; a failure is tolerated
# (continue-on-error) so that attempt 2 can retry.
- name: "Gloo Single PyTests [attempt 1 of 3]" | |
id: Gloo_Single_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" | |
shell: bash | |
# Gloo Single PyTests, attempt 2 of 3: feeds each test_*.py file in /horovod/test/single
# one at a time to `/bin/bash /pytest_standalone.sh gloo`; 15 minute timeout.
# Retry: runs only when steps.Gloo_Single_PyTests_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo Single PyTests [attempt 2 of 3]" | |
id: Gloo_Single_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" | |
shell: bash | |
# Gloo Single PyTests, attempt 3 of 3: feeds each test_*.py file in /horovod/test/single
# one at a time to `/bin/bash /pytest_standalone.sh gloo`; 15 minute timeout.
# Last retry: runs only when steps.Gloo_Single_PyTests_run_2 ended with outcome 'failure';
# continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo Single PyTests [attempt 3 of 3]" | |
id: Gloo_Single_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)" | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 1 of 3: in the docker compose service named by
# matrix.image, runs tensorflow2_keras_mnist.py directly with python, passing the positional
# arguments `2 localhost:2 gloo`, under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api; a failure
# is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 2 of 3: runs tensorflow2_keras_mnist.py directly
# with python, passing `2 localhost:2 gloo`, under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1 ended with outcome
# 'failure'; a failure here is tolerated (continue-on-error) so that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 3 of 3: runs tensorflow2_keras_mnist.py directly
# with python, passing `2 localhost:2 gloo`, under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 1 of 3: in the docker compose service
# named by matrix.image, launches tensorflow2_keras_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun;
# a failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 2 of 3: launches tensorflow2_keras_mnist.py
# via `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 ended with
# outcome 'failure'; a failure here is tolerated (continue-on-error) so attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py | |
shell: bash | |
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 3 of 3: launches tensorflow2_keras_mnist.py
# via `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 ended with
# outcome 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 1 of 3: in the docker compose service named by
# matrix.image, starts `horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker
# /tmp/compute.json` in the background (&), then runs tensorflow2_mnist_data_service.py via
# `horovodrun -np 2 --gloo` with the same /tmp/compute.json; the whole bash -c command is
# limited to 10 minutes.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service;
# a failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 2 of 3: starts the
# horovod.tensorflow.data.compute_worker module via `horovodrun -np 2` in the background (&),
# then runs tensorflow2_mnist_data_service.py via `horovodrun -np 2 --gloo`; both are given
# /tmp/compute.json, and the whole bash -c command is limited to 10 minutes.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1 ended with outcome
# 'failure'; a failure here is tolerated (continue-on-error) so that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 3 of 3: starts the
# horovod.tensorflow.data.compute_worker module via `horovodrun -np 2` in the background (&),
# then runs tensorflow2_mnist_data_service.py via `horovodrun -np 2 --gloo`; both are given
# /tmp/compute.json, and the whole bash -c command is limited to 10 minutes.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2 ended with
# outcome 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json" | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 1 of 3: in the docker compose service named by
# matrix.image, runs tensorflow2_mnist_elastic.py directly with python, passing the positional
# arguments `2 2 2 localhost:2,127.0.0.1:2`, under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api;
# a failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2 | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 2 of 3: runs tensorflow2_mnist_elastic.py
# directly with python, passing `2 2 2 localhost:2,127.0.0.1:2`, under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1 ended with outcome
# 'failure'; a failure here is tolerated (continue-on-error) so that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2 | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 3 of 3: runs tensorflow2_mnist_elastic.py
# directly with python, passing `2 2 2 localhost:2,127.0.0.1:2`, under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2 ended with
# outcome 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2 | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 1 of 3: in the docker compose service
# named by matrix.image, launches tensorflow2_mnist_elastic.py via horovodrun with
# `-np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo`, under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun;
# a failure is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 2 of 3: launches
# tensorflow2_mnist_elastic.py via horovodrun with
# `-np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo`, under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1 ended with
# outcome 'failure'; a failure here is tolerated (continue-on-error) so attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 3 of 3: launches
# tensorflow2_mnist_elastic.py via horovodrun with
# `-np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo`, under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2 ended with
# outcome 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST api, attempt 1 of 3: in the docker compose service named by
# matrix.image, runs tensorflow2_mnist.py directly with python, passing the positional
# arguments `2 localhost:2 gloo`, under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_MNIST_api; a failure is
# tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST api, attempt 2 of 3: runs tensorflow2_mnist.py directly with
# python, passing `2 localhost:2 gloo`, under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_api_run_1 ended with outcome 'failure';
# a failure here is tolerated (continue-on-error) so that attempt 3 can retry once more.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST api, attempt 3 of 3: runs tensorflow2_mnist.py directly with
# python, passing `2 localhost:2 gloo`, under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_api_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 1 of 3: in the docker compose service named by
# matrix.image, launches tensorflow2_mnist.py via `horovodrun -np 2 -H localhost:2 --gloo`
# under a 10 minute timeout.
# Gated on a successful build step and matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun; a failure
# is tolerated (continue-on-error) so that attempt 2 can retry.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 2 of 3: launches tensorflow2_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1 ended with outcome
# 'failure'; a failure here is tolerated (continue-on-error) so that attempt 3 can retry.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py | |
shell: bash | |
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 3 of 3: launches tensorflow2_mnist.py via
# `horovodrun -np 2 -H localhost:2 --gloo` under a 10 minute timeout.
# Last retry: runs only when steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2 ended with outcome
# 'failure'; continue-on-error is false, so a failure of this attempt is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]" | |
id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py | |
shell: bash | |
# Attempt 1 of 3: TensorFlow MNIST example via `horovodrun --gloo`, 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "Gloo TensorFlow MNIST [attempt 1 of 3]"
  id: Gloo_TensorFlow_MNIST_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
# Attempt 2 of 3: TensorFlow MNIST example via `horovodrun --gloo`, 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "Gloo TensorFlow MNIST [attempt 2 of 3]"
  id: Gloo_TensorFlow_MNIST_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
# Attempt 3 of 3: TensorFlow MNIST example via `horovodrun --gloo`, 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "Gloo TensorFlow MNIST [attempt 3 of 3]"
  id: Gloo_TensorFlow_MNIST_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
# Attempt 1 of 3: start the ssh service, then pytest test_static_run.py, 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Cluster PyTests [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 2 of 3: start the ssh service, then pytest test_static_run.py, 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Cluster PyTests [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 3 of 3: start the ssh service, then pytest test_static_run.py, 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Cluster PyTests [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, start the ssh service, then pytest test_static_run.py (10m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# Attempt 1 of 3: mxnet2_mnist.py with `--communication mpi`, 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI MXNet2 MNIST api [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_api_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 2 of 3: mxnet2_mnist.py with `--communication mpi`, 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI MXNet2 MNIST api [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 3 of 3: mxnet2_mnist.py with `--communication mpi`, 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI MXNet2 MNIST api [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_api_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 1 of 3: mxnet2_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
# Attempt 2 of 3: mxnet2_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
# Attempt 3 of 3: mxnet2_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
# Attempt 1 of 3: mxnet_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI MXNet MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 2 of 3: mxnet_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI MXNet MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 3 of 3: mxnet_mnist.py under the command prefix read from
# /mpirun_command, with OMP_NUM_THREADS=1 and a 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI MXNet MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then run mxnet_mnist.py under that command prefix (10m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# Attempt 1 of 3: pass each test_*.py in /horovod/test/parallel, one at a time, to
# /pytest.sh under the command prefix read from /mpirun_command (15m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Parallel PyTests [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 2 of 3: pass each test_*.py in /horovod/test/parallel, one at a time, to
# /pytest.sh under the command prefix read from /mpirun_command (15m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Parallel PyTests [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 3 of 3: pass each test_*.py in /horovod/test/parallel, one at a time, to
# /pytest.sh under the command prefix read from /mpirun_command (15m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Parallel PyTests [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then pass each test_*.py in /horovod/test/parallel to
# /pytest.sh under that command prefix (15m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# Attempt 1 of 3: pytorch_mnist.py with `--communication mpi`, 10m timeout.
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI PyTorch MNIST api [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 2 of 3: pytorch_mnist.py with `--communication mpi`, 10m timeout.
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI PyTorch MNIST api [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 3 of 3: pytorch_mnist.py with `--communication mpi`, 10m timeout.
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI PyTorch MNIST api [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run pytorch_mnist.py with `--communication mpi` (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 2 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run pytorch_mnist.py with `--communication mpi` (10m timeout).
# Runs only when attempt 1's outcome was 'failure'; continue-on-error keeps a
# failure here from failing the job so a final retry can follow.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 3 of 3: execute the contents of /oneccl_env, write '/mpirun_command_mpi'
# into /mpirun_command, then run pytorch_mnist.py with `--communication mpi` (10m timeout).
# Runs only when attempt 2's outcome was 'failure'; continue-on-error is off,
# so a failure here fails the job.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 1 of 3: execute the contents of /oneccl_env, write '/mpirun_command_ofi'
# into /mpirun_command, then run pytorch_mnist.py with `--communication mpi` (10m timeout).
# continue-on-error keeps a failure from failing the job so a retry can follow;
# the trailing `&& true` in the condition is a no-op.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi" | |
shell: bash | |
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi" | |
shell: bash | |
# Test "MPI PyTorch MNIST horovodrun": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_PyTorch_MNIST_horovodrun;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): the contents of /mpirun_command are substituted in as
# the launcher prefix in front of `python .../pytorch_mnist.py --data-dir /data/pytorch_datasets`.
# The `\$` defers that substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI PyTorch MNIST horovodrun [ONECCL MPI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI; `always()` replaces the implicit success()
# check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, overwrite
# /mpirun_command with the string '/mpirun_command_mpi', then substitute the contents of
# /mpirun_command in as the launcher prefix in front of `python .../pytorch_mnist.py`.
# NOTE(review): /mpirun_command_mpi is presumably an executable launcher in the image; confirm.
# The `\$` defers each substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI PyTorch MNIST horovodrun [ONECCL OFI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI; `always()` replaces the implicit success()
# check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, overwrite
# /mpirun_command with the string '/mpirun_command_ofi', then substitute the contents of
# /mpirun_command in as the launcher prefix in front of `python .../pytorch_mnist.py`.
# NOTE(review): /mpirun_command_ofi is presumably an executable launcher in the image; confirm.
# The `\$` defers each substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "MPI Single PyTests": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (15 minute `timeout`): change into /horovod/test/single and, via
# `xargs -n 1`, invoke `/bin/bash /pytest_standalone.sh mpi <file>` once per test_*.py file.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI Single PyTests [attempt 1 of 3]" | |
id: MPI_Single_PyTests_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [attempt 2 of 3]" | |
id: MPI_Single_PyTests_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI Single PyTests [attempt 3 of 3]" | |
id: MPI_Single_PyTests_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI Single PyTests [ONECCL MPI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests_ONECCL_MPI;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (15 minute `timeout`): execute the output of `cat /oneccl_env`, write the
# string '/mpirun_command_mpi' into /mpirun_command, change into /horovod/test/single and, via
# `xargs -n 1`, invoke `/bin/bash /pytest_standalone.sh mpi <file>` once per test_*.py file.
# The `\$` defers the command substitution to the bash inside the container.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI Single PyTests [ONECCL OFI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_Single_PyTests_ONECCL_OFI;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (15 minute `timeout`): execute the output of `cat /oneccl_env`, write the
# string '/mpirun_command_ofi' into /mpirun_command, change into /horovod/test/single and, via
# `xargs -n 1`, invoke `/bin/bash /pytest_standalone.sh mpi <file>` once per test_*.py file.
# The `\$` defers the command substitution to the bash inside the container.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_TensorFlow_2_0_Keras_MNIST_api;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): run tensorflow2_keras_mnist.py with the positional
# arguments `2 localhost:2 mpi`.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI; `always()` replaces the implicit
# success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, write the
# string '/mpirun_command_mpi' into /mpirun_command, then run tensorflow2_keras_mnist.py with
# the positional arguments `2 localhost:2 mpi`. The `\$` defers the command substitution to
# the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI]": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI; `always()` replaces the implicit
# success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, write the
# string '/mpirun_command_ofi' into /mpirun_command, then run tensorflow2_keras_mnist.py with
# the positional arguments `2 localhost:2 mpi`. The `\$` defers the command substitution to
# the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun; `always()` replaces the implicit success()
# check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): the contents of /mpirun_command are substituted in as
# the launcher prefix in front of `python .../tensorflow2_keras_mnist.py` (no further arguments).
# The `\$` defers that substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI]": up to three attempts of the same
# container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI; `always()` replaces the implicit
# success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, overwrite
# /mpirun_command with the string '/mpirun_command_mpi', then substitute the contents of
# /mpirun_command in as the launcher prefix in front of `python .../tensorflow2_keras_mnist.py`.
# NOTE(review): /mpirun_command_mpi is presumably an executable launcher in the image; confirm.
# The `\$` defers each substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI]": up to three attempts of the same
# container command.
# Every attempt requires a successful `build` step and a truthy
# matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI; `always()` replaces the implicit
# success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): execute the output of `cat /oneccl_env`, overwrite
# /mpirun_command with the string '/mpirun_command_ofi', then substitute the contents of
# /mpirun_command in as the launcher prefix in front of `python .../tensorflow2_keras_mnist.py`.
# NOTE(review): /mpirun_command_ofi is presumably an executable launcher in the image; confirm.
# The `\$` defers each substitution to the bash inside the container instead of the runner's shell.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api": up to three attempts of the same container command.
# Every attempt requires a successful `build` step and a truthy matrix.MPI_TensorFlow_2_0_MNIST_api;
# `always()` replaces the implicit success() check, so earlier step failures do not skip them.
# Container command (10 minute `timeout`): run tensorflow2_mnist.py with the positional
# arguments `2 localhost:2 mpi`.
# Each attempt mounts its own artifacts/<image>/<step id> directory at /artifacts.
# Attempt 1: the trailing `&& true` sits where attempts 2 and 3 test the previous attempt's outcome.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome is 'failure'; its own failure is still tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome is 'failure'; continue-on-error is false,
# so a failure of this last attempt is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api [ONECCL MPI]": up to three attempts.
# Inside the container the command first evaluates $(cat /oneccl_env), then
# writes the string /mpirun_command_mpi into /mpirun_command, then runs
# tensorflow2_mnist.py. The backslash before $ keeps the runner's shell from
# expanding it, so the substitution happens in the container's bash.
# /mpirun_command is written but not read within this command line.
# NOTE(review): the contents of /oneccl_env and /mpirun_command_mpi come from
# the image and are not visible here; confirm there.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]": up to three attempts.
# Inside the container the command first evaluates $(cat /oneccl_env), then
# writes the string /mpirun_command_ofi into /mpirun_command, then runs
# tensorflow2_mnist.py. The backslash before $ defers the substitution to the
# container's bash. /mpirun_command is not read within this command line.
# NOTE(review): the contents of /oneccl_env and /mpirun_command_ofi come from
# the image and are not visible here; confirm there.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun": up to three attempts.
# Inside the container, the text stored in /mpirun_command is substituted in
# front of "python .../tensorflow2_mnist.py" and the result is executed under
# a 10 minute timeout. The backslash before $ defers the substitution to the
# container's bash instead of the runner's shell.
# NOTE(review): /mpirun_command is provided by the image; its contents are
# not visible here.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), overwrites
# /mpirun_command with the string /mpirun_command_mpi, and then uses the
# contents of /mpirun_command (now that string) as the prefix in front of
# "python .../tensorflow2_mnist.py". Each backslash before $ defers the
# substitution to the container's bash.
# NOTE(review): /oneccl_env and /mpirun_command_mpi come from the image and
# are not visible here; confirm their contents there.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), overwrites
# /mpirun_command with the string /mpirun_command_ofi, and then uses the
# contents of /mpirun_command (now that string) as the prefix in front of
# "python .../tensorflow2_mnist.py". Each backslash before $ defers the
# substitution to the container's bash.
# NOTE(review): /oneccl_env and /mpirun_command_ofi come from the image and
# are not visible here; confirm their contents there.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "Run PyTests test_interactiverun": up to three attempts.
# Each attempt runs pytest on integration/test_interactiverun.py from
# /horovod/test inside the compose service named by matrix.image, under a
# 10 minute timeout. The JUnit report is written to /artifacts, which is the
# attempt's own artifacts directory mounted from the runner, so reports of
# different attempts do not overwrite each other.
- name: "Run PyTests test_interactiverun [attempt 1 of 3]" | |
id: Run_PyTests_test_interactiverun_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
- name: "Run PyTests test_interactiverun [attempt 2 of 3]" | |
id: Run_PyTests_test_interactiverun_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed (outcome is the step result before
# continue-on-error is applied).
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
- name: "Run PyTests test_interactiverun [attempt 3 of 3]" | |
id: Run_PyTests_test_interactiverun_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Test "Single Keras MNIST": up to three attempts.
# Each attempt runs keras_mnist_advanced.py (3 epochs, batch size 64) directly
# with python inside the compose service named by matrix.image, under a
# 10 minute timeout, with the attempt's own artifacts directory mounted at
# /artifacts.
- name: "Single Keras MNIST [attempt 1 of 3]" | |
id: Single_Keras_MNIST_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
- name: "Single Keras MNIST [attempt 2 of 3]" | |
id: Single_Keras_MNIST_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
- name: "Single Keras MNIST [attempt 3 of 3]" | |
id: Single_Keras_MNIST_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Test "Single MXNet2 MNIST": up to three attempts.
# Each attempt runs mxnet2_mnist.py for 3 epochs directly with python inside
# the compose service named by matrix.image, under a 10 minute timeout, with
# the attempt's own artifacts directory mounted at /artifacts.
- name: "Single MXNet2 MNIST [attempt 1 of 3]" | |
id: Single_MXNet2_MNIST_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet2 MNIST [attempt 2 of 3]" | |
id: Single_MXNet2_MNIST_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet2 MNIST [attempt 3 of 3]" | |
id: Single_MXNet2_MNIST_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST": up to three attempts.
# Each attempt runs mxnet_mnist.py for 3 epochs directly with python inside
# the compose service named by matrix.image, under a 10 minute timeout, with
# the attempt's own artifacts directory mounted at /artifacts.
- name: "Single MXNet MNIST [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST [ONECCL MPI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), writes the
# string /mpirun_command_mpi into /mpirun_command, and then runs
# mxnet_mnist.py for 3 epochs directly with python. The backslash before $
# defers the substitution to the container's bash. /mpirun_command is written
# but not read within this command line.
# NOTE(review): /oneccl_env comes from the image and is not visible here;
# confirm what it executes.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST [ONECCL OFI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), writes the
# string /mpirun_command_ofi into /mpirun_command, and then runs
# mxnet_mnist.py for 3 epochs directly with python. The backslash before $
# defers the substitution to the container's bash. /mpirun_command is written
# but not read within this command line.
# NOTE(review): /oneccl_env comes from the image and is not visible here;
# confirm what it executes.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single PyTorch MNIST": up to three attempts.
# Each attempt runs pytorch_mnist.py for 3 epochs with --data-dir
# /data/pytorch_datasets directly with python inside the compose service named
# by matrix.image, under a 10 minute timeout, with the attempt's own artifacts
# directory mounted at /artifacts.
- name: "Single PyTorch MNIST [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Single PyTorch MNIST [ONECCL MPI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), writes the
# string /mpirun_command_mpi into /mpirun_command, and then runs
# pytorch_mnist.py (3 epochs, --data-dir /data/pytorch_datasets) directly with
# python. The backslash before $ defers the substitution to the container's
# bash. /mpirun_command is written but not read within this command line.
# NOTE(review): /oneccl_env comes from the image and is not visible here;
# confirm what it executes.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Single PyTorch MNIST [ONECCL OFI]": up to three attempts.
# Inside the container the command evaluates $(cat /oneccl_env), writes the
# string /mpirun_command_ofi into /mpirun_command, and then runs
# pytorch_mnist.py (3 epochs, --data-dir /data/pytorch_datasets) directly with
# python. The backslash before $ defers the substitution to the container's
# bash. /mpirun_command is written but not read within this command line.
# NOTE(review): /oneccl_env comes from the image and is not visible here;
# confirm what it executes.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Spark Keras MNIST": up to three attempts.
# Each attempt runs keras_spark_mnist.py through /spark_env.sh with
# OMP_NUM_THREADS=1 set for that command (2 processes, 3 epochs, work dir
# /work, data dir /data) inside the compose service named by matrix.image,
# under a 10 minute timeout, with the attempt's own artifacts directory
# mounted at /artifacts.
# NOTE(review): /spark_env.sh is provided by the image; what it sets up is
# not visible here.
- name: "Spark Keras MNIST [attempt 1 of 3]" | |
id: Spark_Keras_MNIST_run_1 | |
# A failure of this attempt is tolerated; the next attempt retries.
continue-on-error: true | |
# Requires a successful step "build" (defined outside this section) and the
# matrix flag for this test. The trailing "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
- name: "Spark Keras MNIST [attempt 2 of 3]" | |
id: Spark_Keras_MNIST_run_2 | |
continue-on-error: true | |
# Retry only when attempt 1 itself failed (outcome is the step result before
# continue-on-error is applied).
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
- name: "Spark Keras MNIST [attempt 3 of 3]" | |
id: Spark_Keras_MNIST_run_3 | |
# Final attempt: a failure is no longer tolerated and fails the job.
continue-on-error: false | |
# Runs only when attempt 2 failed as well.
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure' | |
run: | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3 | |
docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# "Spark Keras Rossmann Estimator" retry chain (up to 3 attempts, 10 minute timeout each).
# Attempt 1 runs when the build step succeeded and matrix.Spark_Keras_Rossmann_Estimator is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts inside the container.
- name: "Spark Keras Rossmann Estimator [attempt 1 of 3]"
  id: Spark_Keras_Rossmann_Estimator_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: "Spark Keras Rossmann Estimator [attempt 2 of 3]"
  id: Spark_Keras_Rossmann_Estimator_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: "Spark Keras Rossmann Estimator [attempt 3 of 3]"
  id: Spark_Keras_Rossmann_Estimator_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
# "Spark Keras Rossmann Run" retry chain (up to 3 attempts, 10 minute timeout each).
# Attempt 1 runs when the build step succeeded and matrix.Spark_Keras_Rossmann_Run is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts inside the container.
- name: "Spark Keras Rossmann Run [attempt 1 of 3]"
  id: Spark_Keras_Rossmann_Run_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: "Spark Keras Rossmann Run [attempt 2 of 3]"
  id: Spark_Keras_Rossmann_Run_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: "Spark Keras Rossmann Run [attempt 3 of 3]"
  id: Spark_Keras_Rossmann_Run_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
# "Spark Lightning MNIST" retry chain (up to 3 attempts, 10 minute timeout each).
# Attempt 1 runs when the build step succeeded and matrix.Spark_Lightning_MNIST is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts inside the container.
- name: "Spark Lightning MNIST [attempt 1 of 3]"
  id: Spark_Lightning_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: "Spark Lightning MNIST [attempt 2 of 3]"
  id: Spark_Lightning_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: "Spark Lightning MNIST [attempt 3 of 3]"
  id: Spark_Lightning_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# "Spark PyTests" retry chain (up to 3 attempts, 30 minute timeout each).
# Attempt 1 runs when the build step succeeded and matrix.Spark_PyTests is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Inside the container every test_spark*.py file under /horovod/test/integration is passed,
# one at a time (xargs -n 1), to /pytest_standalone.sh.
- name: "Spark PyTests [attempt 1 of 3]"
  id: Spark_PyTests_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
- name: "Spark PyTests [attempt 2 of 3]"
  id: Spark_PyTests_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
- name: "Spark PyTests [attempt 3 of 3]"
  id: Spark_PyTests_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
# "Spark TensorFlow 2.0 MNIST Data Service" retry chain (up to 3 attempts, 10 minute timeout each).
# Attempt 1 runs when the build step succeeded and the matrix flag is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Inside the container the compute_worker.py spark-submit is started in the background (&)
# and the training spark-submit runs in the foreground; both are given /tmp/compute.json.
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
- name: "Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
# "Spark Torch MNIST" retry chain (up to 3 attempts, 10 minute timeout each).
# Attempt 1 runs when the build step succeeded and matrix.Spark_Torch_MNIST is truthy;
# attempts 2 and 3 run only when the preceding attempt's outcome was 'failure'.
# Attempts 1 and 2 tolerate failure (continue-on-error: true); attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts inside the container.
- name: "Spark Torch MNIST [attempt 1 of 3]"
  id: Spark_Torch_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: "Spark Torch MNIST [attempt 2 of 3]"
  id: Spark_Torch_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: "Spark Torch MNIST [attempt 3 of 3]"
  id: Spark_Torch_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3
    docker compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Publishes the XML files found below artifacts/<image>/ as a workflow artifact.
# Runs regardless of earlier step results, but only for images whose name contains '-cpu-'.
- name: Upload Test Results
  if: always() && contains(matrix.image, '-cpu-')
  uses: actions/upload-artifact@v4
  with:
    name: Unit Test Results - ${{ matrix.image }}
    path: artifacts/${{ matrix.image }}/**/*.xml
# Builds the docker-compose test images for the minimum supported framework versions.
# Runs after build-and-test and only when init-workflow enabled builds and tests.
# NOTE(review): HEAD states this file is generated by .github/gen-workflow-ci.py;
# mirror any change made here in that generator.
build-mins:
  name: "Build mins (${{ matrix.image }})"
  needs: [init-workflow, build-and-test]
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
  runs-on: ubuntu-latest

  strategy:
    max-parallel: 2
    fail-fast: false
    matrix:
      include:
        - image: test-cpu-openmpi-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
          build_timeout: 30

        - image: test-gpu-openmpi-gloo-py3_8-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
          build_timeout: 40

  steps:
    - name: Clean up disk space
      # deleting these paths frees 38 GB disk space:
      #   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
      # but this sometimes takes 3-4 minutes
      # so we delete only some sub-paths which are known to be quick (10s) and 20 GB
      #
      # The first path carries an escaped '*' so the loop variable holds the pattern;
      # $dir is deliberately left unquoted below so that the shell expands it there.
      run: |
        echo ::group::Disk space before clean up
        df -h
        echo ::endgroup::

        for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
                   /usr/share/dotnet/shared \
                   /usr/local/lib/android/sdk/ndk \
                   /usr/local/lib/android/sdk/build-tools \
                   /opt/ghc
        do
          echo ::group::Deleting "$dir"
          sudo du -hsc $dir | tail -n1 || true
          sudo rm -rf $dir
          echo ::endgroup::
        done

        echo ::group::Disk space after clean up
        df -h
        echo ::endgroup::

    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted so that YAML keeps the version as a string instead of a float
        python-version: '3.8'

    - name: Build
      id: build
      # timeout-and-retry.sh receives "<build_timeout>m 3 10" followed by the build command
      run: |
        .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker compose -f docker-compose.test.yml build ${{ matrix.image }}
      env:
        COMPOSE_DOCKER_CLI_BUILD: 1
        DOCKER_BUILDKIT: 1

    # NOTE(review): no step of this job writes below artifacts/, so this upload presumably
    # finds no files; confirm whether it is still needed.
    - name: Upload Test Results
      uses: actions/upload-artifact@v4
      if: always() && contains(matrix.image, '-cpu-')
      with:
        name: Unit Test Results - ${{ matrix.image }}
        path: artifacts/${{ matrix.image }}/**/*.xml
# Builds Horovod natively on a macOS runner for each matrix entry and runs the tests in
# test/parallel with up to three attempts.
# NOTE(review): HEAD states this file is generated by .github/gen-workflow-ci.py;
# mirror any change made here in that generator.
build-and-test-macos:
  name: "Build and Test macOS (${{ matrix.image }}-macos)"
  needs: [init-workflow, build-and-test]
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
  # NOTE(review): confirm that the macos-11 label is still offered for GitHub-hosted runners;
  # a retired label would leave this job without a runner.
  runs-on: macos-11

  strategy:
    max-parallel: 3
    fail-fast: false
    matrix:
      # version values are quoted so they can never be read as YAML numbers
      include:
        - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0
          HOROVOD_WITH_MPI: 1
          HOROVOD_WITHOUT_GLOO: 1
          TENSORFLOW: '1.15.0'
          KERAS: '2.2.4'
          PYTORCH: '1.6.0'
          PYTORCH_LIGHTNING: '1.3.8'
          TORCHVISION: '0.7.0'
          MXNET: '1.5.1.post0'

        - image: test-cpu-gloo-py3_8-tf2_9_2-keras2_9_0-torch1_11_0-mxnet1_7_0_p2
          HOROVOD_WITHOUT_MPI: 1
          HOROVOD_WITH_GLOO: 1
          TENSORFLOW: '2.9.2'
          KERAS: '2.9.0'
          PYTORCH: '1.11.0'
          PYTORCH_LIGHTNING: '1.5.9'
          TORCHVISION: '0.12.0'
          MXNET: '1.7.0.post2'

        - image: test-openmpi-cpu-gloo-py3_8-tf2_10_0-keras2_10_0-torch1_12_1-mxnet1_9_1
          HOROVOD_WITH_MPI: 1
          HOROVOD_WITH_GLOO: 1
          TENSORFLOW: '2.10.0'
          KERAS: '2.10.0'
          PYTORCH: '1.12.1'
          PYTORCH_LIGHTNING: '1.5.9'
          TORCHVISION: '0.13.1'
          MXNET: '1.9.1'

  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive

    - name: Build
      id: build
      env:
        HOROVOD_WITH_MPI: ${{ matrix.HOROVOD_WITH_MPI }}
        HOROVOD_WITHOUT_MPI: ${{ matrix.HOROVOD_WITHOUT_MPI }}
        HOROVOD_WITH_GLOO: ${{ matrix.HOROVOD_WITH_GLOO }}
        HOROVOD_WITHOUT_GLOO: ${{ matrix.HOROVOD_WITHOUT_GLOO }}
        TENSORFLOW: ${{ matrix.TENSORFLOW }}
        KERAS: ${{ matrix.KERAS }}
        PYTORCH: ${{ matrix.PYTORCH }}
        PYTORCH_LIGHTNING: ${{ matrix.PYTORCH_LIGHTNING }}
        TORCHVISION: ${{ matrix.TORCHVISION }}
        MXNET: ${{ matrix.MXNET }}
      # The python patch in the pyenv install step is to work around an incompatibility introduced in new xcode version in macOS Big Sur. The patch is provided by python team.
      # The original discussion is here https://github.com/pyenv/pyenv/issues/1737
      # ".[test]" is quoted so that the shell can never treat the brackets as a glob pattern.
      run: |
        brew reinstall -f zlib bzip2
        brew install -f openmpi cmake libuv pyenv coreutils curl
        export PATH=$(pyenv root)/shims:$PATH
        pyenv uninstall -f 3.7.7
        CFLAGS="-I$(brew --prefix bzip2)/include -I$(brew --prefix zlib)/include" LDFLAGS="-L$(brew --prefix zlib)/lib -L$(brew --prefix bzip2)/lib" pyenv install --patch 3.7.7 < <(curl -sSL https://github.com/python/cpython/commit/8ea6353.patch)
        pyenv global 3.7.7
        python --version

        python -m pip install -U pip
        pip install tensorflow==${TENSORFLOW} keras==${KERAS}
        if [[ ${TENSORFLOW} == 1.* ]] || [[ ${TENSORFLOW} == 2.[012345].* ]]; then pip install "h5py<3" "protobuf~=3.20"; fi
        pip install torch==${PYTORCH} pytorch_lightning==${PYTORCH_LIGHTNING} torchvision==${TORCHVISION}
        pip install mxnet==${MXNET}
        HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir ".[test]"
        horovodrun --check-build

    # Test retry chain: attempt 1 runs when the build succeeded, attempts 2 and 3 only when the
    # preceding attempt's outcome was 'failure'. Attempts 1 and 2 tolerate failure.
    # Each attempt publishes its artifacts directory as step output "artifacts-path" and
    # generates pytest.sh; the escaped \$1, \$2 and \${...} are expanded when pytest.sh runs.
    # gtimeout is presumably provided by the coreutils package installed in the Build step.
    - name: Test [attempt 1 of 3]
      id: test-1
      continue-on-error: true
      if: always() && steps.build.outcome == 'success' && true
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version

        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-1"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> "$GITHUB_OUTPUT"
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh

        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    - name: Test [attempt 2 of 3]
      id: test-2
      continue-on-error: true
      if: always() && steps.build.outcome == 'success' && steps.test-1.outcome == 'failure'
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version

        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-2"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> "$GITHUB_OUTPUT"
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh

        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    - name: Test [attempt 3 of 3]
      id: test-3
      continue-on-error: false
      if: always() && steps.build.outcome == 'success' && steps.test-2.outcome == 'failure'
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version

        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-3"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> "$GITHUB_OUTPUT"
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh

        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    # Uploads the artifacts directories reported by the attempts above;
    # attempts that did not run contribute an empty path line.
    - name: Upload Test Results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: Unit Test Results - ${{ matrix.image }}-macos
        path: |
          ${{ steps.test-1.outputs.artifacts-path }}
          ${{ steps.test-2.outputs.artifacts-path }}
          ${{ steps.test-3.outputs.artifacts-path }}
# Triggers the Buildkite pipeline in "GPU NON HEADS" mode and exposes the build URL as
# job output "url". Only runs in the horovod/horovod repository and, for pull requests,
# only when the PR head repository is this repository.
# NOTE(review): the "Builtkite" spelling in the job name is kept verbatim; check names may be
# referenced elsewhere.
buildkite-trigger:
  name: "Build and Test GPU (trigger Builtkite)"
  runs-on: ubuntu-latest
  needs: [init-workflow, build-and-test]
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )

  outputs:
    url: ${{ steps.build.outputs.url }}

  steps:
    - id: build
      name: Trigger Buildkite Pipeline
      uses: buildkite/trigger-pipeline-action@v1.3.1
      env:
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        PIPELINE: "horovod/horovod"
        # no COMMIT given: it is taken from GITHUB_SHA
        BRANCH: "${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU NON HEADS)"
        # when MESSAGE is empty, Buildkite fills it from the commit message
        MESSAGE: "${{ needs.init-workflow.outputs.buildkite-message }}"
        BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU NON HEADS\"}"
# Downloads the artifacts of the Buildkite build started by buildkite-trigger, re-uploads the
# XML files as a workflow artifact and fails when the Buildkite build state is not 'passed'.
# NOTE(review): the "Builtkite" spelling in names and paths is kept verbatim; artifact and
# check names may be referenced elsewhere.
buildkite:
  name: "Build and Test GPU (download Builtkite)"
  runs-on: ubuntu-latest
  needs: [buildkite-trigger]

  steps:
    - id: download
      name: Download Buildkite Artifacts
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        ignore_build_states: blocked,canceled,skipped,not_run
        ignore_job_states: timed_out
        output_path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite

    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: Unit Test Results - GPU NON HEADS on Builtkite
        path: artifacts/Unit Test Results - GPU NON HEADS on Builtkite/**/*.xml

    # Evaluated even after earlier failures, but acts only when the download step itself
    # concluded successfully and reported a build state other than 'passed'.
    - name: Check Buildkite job state
      if: >
        always() &&
        steps.download.conclusion == 'success' &&
        steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-trigger.outputs.url }}"
        exit 1
# Triggers the Buildkite pipeline in "GPU HEADS" mode and exposes the build URL as
# job output "url". Only runs in the horovod/horovod repository and, for pull requests,
# only when the PR head repository is this repository.
# NOTE(review): the "Builtkite" spelling in the job name is kept verbatim; check names may be
# referenced elsewhere.
buildkite-heads-trigger:
  name: "Build and Test GPU heads (trigger Builtkite)"
  runs-on: ubuntu-latest
  needs: [init-workflow, build-and-test]
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )

  outputs:
    url: ${{ steps.build.outputs.url }}

  steps:
    - id: build
      name: Trigger Buildkite Pipeline
      uses: buildkite/trigger-pipeline-action@v1.3.1
      env:
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        PIPELINE: "horovod/horovod"
        # no COMMIT given: it is taken from GITHUB_SHA
        BRANCH: "${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU HEADS)"
        # when MESSAGE is empty, Buildkite fills it from the commit message
        MESSAGE: "${{ needs.init-workflow.outputs.buildkite-message }}"
        BUILD_ENV_VARS: "{\"PIPELINE_MODE\": \"GPU HEADS\"}"
# Downloads the artifacts of the Buildkite build started by buildkite-heads-trigger, re-uploads
# the XML files as a workflow artifact and fails when the Buildkite build state is not 'passed'.
# NOTE(review): the "Builtkite" spelling in names and paths is kept verbatim; artifact and
# check names may be referenced elsewhere.
buildkite-heads:
  name: "Build and Test GPU heads (download Builtkite)"
  runs-on: ubuntu-latest
  needs: [buildkite-heads-trigger]

  steps:
    - id: download
      name: Download Buildkite Artifacts
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-heads-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        ignore_build_states: blocked,canceled,skipped,not_run
        ignore_job_states: timed_out
        output_path: artifacts/Unit Test Results - GPU HEADS on Builtkite

    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: Unit Test Results - GPU HEADS on Builtkite
        path: artifacts/Unit Test Results - GPU HEADS on Builtkite/**/*.xml

    # Evaluated even after earlier failures, but acts only when the download step itself
    # concluded successfully and reported a build state other than 'passed'.
    - name: Check Buildkite job state
      if: >
        always() &&
        steps.download.conclusion == 'success' &&
        steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-heads-trigger.outputs.url }}"
        exit 1
# Computes two flags for the docker-build job and publishes them as job outputs:
# "run" (whether docker images are built at all) and "push" (whether they are pushed).
docker-config: | |
name: Configure docker build | |
needs: [init-workflow, build-and-test, buildkite] | |
# always() lets the expression below be evaluated even when a needed job was skipped or failed.
# As written, the expression requires run-at-all and run-builds-and-tests to be 'true' and
# build-and-test to have succeeded; only buildkite may be 'skipped' instead of 'success'
# (buildkite is skipped e.g. when the workflow runs for a fork PR).
# NOTE(review): an earlier version of this comment said docker images should still be built
# when build-and-test was skipped; the expression does not allow that - confirm the intent.
if: > | |
always() && | |
needs.init-workflow.outputs.run-at-all == 'true' && | |
needs.init-workflow.outputs.run-builds-and-tests == 'true' && | |
needs.build-and-test.result == 'success' && | |
( needs.buildkite.result == 'success' || needs.buildkite.result == 'skipped' ) | |
runs-on: ubuntu-latest | |
outputs: | |
run: ${{ steps.config.outputs.run }} | |
push: ${{ steps.config.outputs.push }} | |
steps: | |
# The two expressions are evaluated into the environment variables "run" and "push",
# which the script echoes and appends to $GITHUB_OUTPUT as step outputs.
- name: Config | |
id: config | |
env: | |
# run workflow for all events on Horovod repo and non-schedule events on forks | |
run: ${{ github.repository == 'horovod/horovod' || github.event_name != 'schedule' }} | |
# push images only from Horovod repo and for schedule and push events | |
# (contains() is given the string 'schedule,push', so the event name is matched against that text)
push: ${{ github.repository == 'horovod/horovod' && contains('schedule,push', github.event_name) }} | |
run: | | |
echo Repository: ${{ github.repository }} | |
echo Event: ${{ github.event_name }} | |
echo Run: $run | |
echo "run=$run" >> $GITHUB_OUTPUT | |
echo Push: $push | |
echo "push=$push" >> $GITHUB_OUTPUT | |
docker-build: | |
name: Build docker image ${{ matrix.docker-image }} (push=${{ needs.docker-config.outputs.push }}) | |
needs: docker-config | |
if: always() && needs.docker-config.outputs.run == 'true' | |
runs-on: ubuntu-latest | |
# we want an ongoing run of this workflow to be canceled by a later commit | |
# so that there is only one concurrent run of this workflow for each branch | |
concurrency: | |
# github.ref means something like refs/heads/master or refs/tags/v0.22.1 or the branch. | |
# This helps to not cancel concurrent runs on master and a tag that share the same commit | |
# head_ref refers to the pull request branch so we run only one workflow for the given pull request. | |
# On master, head_ref is empty, so we use the SHA of the commit, this means | |
# commits to master will not be cancelled, which is important to ensure | |
# that every commit to master is full tested and deployed. | |
group: docker-${{ matrix.docker-image }}-${{ github.ref }}-${{ github.head_ref || github.sha }} | |
cancel-in-progress: true | |
strategy: | |
fail-fast: false | |
matrix: | |
docker-image: | |
- horovod | |
- horovod-cpu | |
- horovod-nvtabular | |
- horovod-ray | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
with: | |
submodules: 'recursive' | |
- name: Docker meta | |
id: meta | |
uses: crazy-max/ghaction-docker-meta@v5 | |
with: | |
# list of Docker images to use as base name for tags | |
images: | | |
horovod/${{ matrix.docker-image }} | |
# generate Docker tags based on the following events/attributes | |
tags: | | |
type=schedule | |
type=ref,event=branch | |
type=ref,event=pr | |
type=semver,pattern={{version}} | |
type=semver,pattern={{major}}.{{minor}} | |
type=semver,pattern={{major}} | |
type=sha | |
- name: Set up Docker Buildx | |
uses: docker/setup-buildx-action@v3 | |
with: | |
driver: docker | |
- name: Login to DockerHub | |
if: needs.docker-config.outputs.push == 'true' | |
uses: docker/login-action@v3 | |
with: | |
username: ${{ secrets.DOCKERHUB_USERNAME }} | |
password: ${{ secrets.DOCKERHUB_TOKEN }} | |
- name: Clean up disk space | |
# deleting these paths frees 38 GB disk space: | |
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc | |
# but this sometimes takes 3-4 minutes | |
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB | |
run: | | |
echo ::group::Disk space before clean up | |
df -h | |
echo ::endgroup:: | |
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ | |
/usr/share/dotnet/shared \ | |
/usr/local/lib/android/sdk/ndk \ | |
/usr/local/lib/android/sdk/build-tools \ | |
/opt/ghc | |
do | |
echo ::group::Deleting "$dir" | |
sudo du -hsc $dir | tail -n1 || true | |
sudo rm -rf $dir | |
echo ::endgroup:: | |
done | |
echo ::group::Disk space after clean up | |
df -h | |
echo ::endgroup:: | |
- name: Build image | |
id: build | |
uses: docker/build-push-action@v5 | |
timeout-minutes: 60 | |
with: | |
context: . | |
file: ./docker/${{ matrix.docker-image }}/Dockerfile | |
pull: true | |
push: false | |
load: true | |
tags: horovod-test | |
outputs: type=docker | |
- name: List image | |
run: | | |
docker image ls horovod-test | |
- name: Prepare container for test | |
run: | | |
grep "RUN sed" Dockerfile.test.cpu | sed "s/^RUN //" | docker run -i --name horovod-test horovod-test:latest /bin/bash | |
- name: Test image (pytorch gloo) | |
if: always() && steps.build.outcome == 'success' | |
run: | | |
docker start -ai horovod-test <<<"python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo" | |
- name: Test image (tensorflow2 gloo)
  # Evaluated even when an earlier step failed, provided the build succeeded.
  if: ${{ always() && steps.build.outcome == 'success' }}
  run: |
    # TensorFlow 2 Keras MNIST example, invoked with the positional
    # arguments `2 localhost:2 gloo`.
    test_cmd="python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo"
    # The command is fed on stdin to the restarted horovod-test container.
    docker start -ai horovod-test <<<"$test_cmd"
- name: Test image (pytorch mpi)
  # Evaluated even when an earlier step failed, provided the build succeeded.
  # Not run for the `horovod-ray` image.
  if: ${{ always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray' }}
  run: |
    # PyTorch MNIST example: 2 processes on localhost, mpi communication.
    test_cmd="python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
    # The command is fed on stdin to the restarted horovod-test container.
    docker start -ai horovod-test <<<"$test_cmd"
- name: Test image (tensorflow2 mpi)
  # Evaluated even when an earlier step failed, provided the build succeeded.
  # Not run for the `horovod-ray` image.
  if: ${{ always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray' }}
  run: |
    # TensorFlow 2 Keras MNIST example, invoked with the positional
    # arguments `2 localhost:2 mpi`.
    test_cmd="python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
    # The command is fed on stdin to the restarted horovod-test container.
    docker start -ai horovod-test <<<"$test_cmd"
- name: Push image
  # Runs the build action a second time, only when the docker-config job
  # enables pushing. This time the image gets the tags and labels
  # produced by the `meta` step instead of the local test tag.
  if: ${{ needs.docker-config.outputs.push == 'true' }}
  timeout-minutes: 60
  uses: docker/build-push-action@v5
  with:
    file: ./docker/${{ matrix.docker-image }}/Dockerfile
    context: '.'
    labels: ${{ steps.meta.outputs.labels }}
    tags: ${{ steps.meta.outputs.tags }}
    push: ${{ needs.docker-config.outputs.push }}
- name: Show free space
  # Diagnostic step: runs regardless of the outcome of the previous steps.
  if: ${{ always() }}
  run: |
    # Run the command given after the title inside a collapsible log group.
    in_group() {
      echo "::group::$1"
      shift
      "$@"
      echo ::endgroup::
    }

    in_group "Disk Space" df -h
    in_group "Docker Space" docker system df
    in_group "Docker Images" docker images -a
    in_group "Docker Container" docker container list -a
sync-files:
  # Verifies that pairs of files that are meant to stay in sync differ only
  # by the committed `<right_file>.patch`. For each matrix entry the patch is
  # applied to `left_file`, and the result must equal `right_file`.
  name: "Sync Files (${{ matrix.name }})"
  needs: [init-workflow]
  runs-on: ubuntu-latest
  strategy:
    # Check every pair even if another pair is out of sync.
    fail-fast: false
    matrix:
      include:
        - name: Docs Summary
          left_file: README.rst
          right_file: docs/summary.rst
          # Optional command run before patching; here it removes every
          # `docs/` occurrence from README.rst.
          init: sed -i -e s/docs\///g README.rst
        - name: Examples Keras Spark3
          left_file: examples/spark/keras/keras_spark_rossmann_run.py
          right_file: examples/spark/keras/keras_spark3_rossmann.py
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Diffing ${{ matrix.left_file }} with ${{ matrix.right_file }}
      # Matrix values are handed over as environment variables rather than
      # being interpolated into the script text.
      env:
        LEFT: ${{ matrix.left_file }}
        RIGHT: ${{ matrix.right_file }}
        INIT: ${{ matrix.init }}
      run: |
        # INIT is deliberately left unquoted: word splitting turns it into a
        # command line, and an empty value (entries without `init`) expands
        # to no command at all. Do not replace this with `eval`, which
        # would additionally process the backslash in the sed expression.
        $INIT

        # Apply the committed patch to LEFT to get the expected RIGHT content.
        patch --quiet -p0 "$LEFT" "${RIGHT}.patch" -o "${LEFT}.expected"

        if ! diff -q "${LEFT}.expected" --label "$LEFT" "$RIGHT"
        then
          echo
          echo "::error::Files are out-of-sync: $LEFT vs. $RIGHT"
          echo "Unexpected differences are:"
          # diff exits non-zero when files differ; keep going to print the hint
          diff "${LEFT}.expected" --label "$LEFT" "$RIGHT" || true
          echo
          echo "Use the following as ${RIGHT}.patch to accept those changes:"
          diff "$LEFT" "$RIGHT" || true
          # fail the step explicitly
          exit 1
        fi