From 37c6d98d5ff7d4ea23ccda8ba9b4da0f74006d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 3 Apr 2024 21:25:35 +0200 Subject: [PATCH] WIP - try running distributed tests on a single device --- .azure/gpu-tests.yml | 42 --------------------------------- scripts/run_standalone_tests.sh | 4 +++- 2 files changed, 3 insertions(+), 43 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index df030eaefd..287276624f 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -85,48 +85,6 @@ jobs: - bash: bash .azure/sanity-check.sh displayName: 'Sanity check / details' - - bash: | - set -ex - coverage run --source thunder -m \ - pytest thunder/tests/ \ - -m "not standalone" \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --timeout=240 \ - --random-order-seed=42 \ - --durations=250 \ - --timeout=240 \ - --numprocesses=9 \ - --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') - timeoutInMinutes: "30" - displayName: 'Testing: regular' - - - bash: | - set -ex - # these test need to run in single thread as they occurs with CUDA OOM - coverage run --source thunder -m \ - pytest \ - thunder/tests/test_networks.py \ - -m "not standalone" \ - -v --durations=0 \ - --random-order-seed=42 \ - --numprocesses=3 - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') - timeoutInMinutes: "15" - displayName: 'Testing: networks' - #- bash: | # bash .azure/run_standalone_tests.sh \ # "thunder/tests" \ diff --git a/scripts/run_standalone_tests.sh b/scripts/run_standalone_tests.sh index f8bcc1846f..b69cd5d26c 100644 --- a/scripts/run_standalone_tests.sh +++ b/scripts/run_standalone_tests.sh @@ -37,8 +37,10 @@ tests=$(grep -oP '\S+::test_\S+' "$TEST_FILE") printf "collected tests:\n----------------\n$tests\n================\n" status=0 +devices=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f1) +echo "$CUDA_VISIBLE_DEVICES, $devices" for test in $tests; do - python -um pytest -sv "$test" --pythonwarnings ignore --junitxml="$test-results.xml" 2>&1 > "$test-output.txt" + CUDA_VISIBLE_DEVICES=$devices python -um pytest -sv "$test" --pythonwarnings ignore --junitxml="$test-results.xml" 2>&1 > "$test-output.txt" pytest_status=$? printf "$test status >>> $pytest_status\n"