diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 669bf6db6a..8b9217147a 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -16,16 +16,34 @@ jobs: strategy: matrix: # CUDA 12.1 - "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | regular": + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | main": docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev" CUDA_VERSION_MM: "121" + testing: "main" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | ops": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev" + CUDA_VERSION_MM: "121" + testing: "ops" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | grads": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev" + CUDA_VERSION_MM: "121" + testing: "grads" "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | distributed": docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev" CUDA_VERSION_MM: "121" testing: "distributed" - "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular": + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | main": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev" + CUDA_VERSION_MM: "121" + testing: "main" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | ops": docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev" CUDA_VERSION_MM: "121" + testing: "ops" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | grads": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev" + CUDA_VERSION_MM: "121" + testing: "grads" "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed": docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev" CUDA_VERSION_MM: "121" @@ -88,16 +106,17 @@ jobs: --durations=250 \ --timeout=240 \ --numprocesses=9 \ - --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py + --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py \ + --ignore=thunder/tests/test_ops.py --ignore=thunder/tests/test_grad.py # compile coverage results python -m coverage report python -m coverage xml # upload to codecov ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') + condition: eq(variables['testing'], 'main') timeoutInMinutes: "40" - displayName: "Testing: regular" + displayName: "Testing: main" - bash: | set -ex @@ -115,10 +134,52 @@ jobs: # upload to codecov ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') + condition: eq(variables['testing'], 'main') timeoutInMinutes: "15" displayName: "Testing: networks" + - bash: | + set -ex + export CUDA_LAUNCH_BLOCKING=1 + coverage run --source thunder -m \ + pytest thunder/tests/test_ops.py \ + -m "not standalone" \ + -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ + --random-order-seed=42 \ + --durations=250 \ + --timeout=240 \ + --numprocesses=9 + # compile coverage results + python -m coverage report + python -m coverage xml + # upload to codecov + ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure + condition: eq(variables['testing'], 'ops') + timeoutInMinutes: "40" + displayName: "Testing: ops" + + - bash: | + set -ex + export CUDA_LAUNCH_BLOCKING=1 + coverage run --source thunder -m \ + pytest thunder/tests/test_grad.py \ + -m "not standalone" \ + -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ + --random-order-seed=42 \ + --durations=250 \ + --timeout=240 \ + --numprocesses=9 + # compile coverage results + python -m coverage report + python -m coverage xml + # upload to codecov + ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure + condition: eq(variables['testing'], 'grads') + timeoutInMinutes: "40" + displayName: "Testing: grads" + - bash: | set -ex # run all found tests in given past as standalone diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml index 759a51c292..999ec4bf5a 100644 --- a/docs/.readthedocs.yaml +++ b/docs/.readthedocs.yaml @@ -9,6 +9,7 @@ version: 2 # reference: https://docs.readthedocs.io/en/stable/config-file/v2.html#sphinx sphinx: fail_on_warning: true + configuration: docs/conf.py build: os: "ubuntu-22.04"