Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into tom/drop-old-style-di…
Browse files Browse the repository at this point in the history
…stributed
  • Loading branch information
t-vi committed Jan 13, 2025
2 parents 0deac3c + f56f71a commit a477a7c
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 6 deletions.
73 changes: 67 additions & 6 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,34 @@ jobs:
strategy:
matrix:
# CUDA 12.1
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | regular":
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | main":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev"
CUDA_VERSION_MM: "121"
testing: "main"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | ops":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev"
CUDA_VERSION_MM: "121"
testing: "ops"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | grads":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev"
CUDA_VERSION_MM: "121"
testing: "grads"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.5.1 | distributed":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_2.5.1-dev"
CUDA_VERSION_MM: "121"
testing: "distributed"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular":
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | main":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev"
CUDA_VERSION_MM: "121"
testing: "main"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | ops":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev"
CUDA_VERSION_MM: "121"
testing: "ops"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | grads":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev"
CUDA_VERSION_MM: "121"
testing: "grads"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.2-py3.10-pt_main-dev"
CUDA_VERSION_MM: "121"
Expand Down Expand Up @@ -88,16 +106,17 @@ jobs:
--durations=250 \
--timeout=240 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py \
--ignore=thunder/tests/test_ops.py --ignore=thunder/tests/test_grad.py
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
condition: eq(variables['testing'], 'main')
timeoutInMinutes: "40"
displayName: "Testing: regular"
displayName: "Testing: main"
- bash: |
set -ex
Expand All @@ -115,10 +134,52 @@ jobs:
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
condition: eq(variables['testing'], 'main')
timeoutInMinutes: "15"
displayName: "Testing: networks"
# Pipeline step "Testing: ops": runs pytest on thunder/tests/test_ops.py under
# coverage, then reports and uploads the coverage results.
# Executes only when the matrix variable `testing` equals 'ops'.
- bash: |
set -ex
# Set CUDA_LAUNCH_BLOCKING=1 for the test run
# (presumably so CUDA errors surface at the offending call - confirm).
export CUDA_LAUNCH_BLOCKING=1
# Collect coverage for the `thunder` package while running the ops tests;
# tests marked `standalone` are deselected, the random-order seed is fixed
# at 42, and pytest is asked for 9 worker processes.
coverage run --source thunder -m \
pytest thunder/tests/test_ops.py \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9
# compile coverage results (text report, then XML report)
python -m coverage report
python -m coverage xml
# upload to codecov
# NOTE(review): the flag list uses `regular`, not an ops-specific flag -
# confirm this grouping is intended.
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'ops')
timeoutInMinutes: "40"
displayName: "Testing: ops"
# Pipeline step "Testing: grads": runs pytest on thunder/tests/test_grad.py
# under coverage, then reports and uploads the coverage results.
# Executes only when the matrix variable `testing` equals 'grads'.
- bash: |
set -ex
# Set CUDA_LAUNCH_BLOCKING=1 for the test run
# (presumably so CUDA errors surface at the offending call - confirm).
export CUDA_LAUNCH_BLOCKING=1
# Collect coverage for the `thunder` package while running the grad tests;
# tests marked `standalone` are deselected, the random-order seed is fixed
# at 42, and pytest is asked for 9 worker processes.
coverage run --source thunder -m \
pytest thunder/tests/test_grad.py \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9
# compile coverage results (text report, then XML report)
python -m coverage report
python -m coverage xml
# upload to codecov
# NOTE(review): the flag list uses `regular`, not a grads-specific flag -
# confirm this grouping is intended.
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'grads')
timeoutInMinutes: "40"
displayName: "Testing: grads"
- bash: |
set -ex
# run all found tests in given path as standalone
Expand Down
1 change: 1 addition & 0 deletions docs/.readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ version: 2
# reference: https://docs.readthedocs.io/en/stable/config-file/v2.html#sphinx
# Sphinx settings for the Read the Docs build.
sphinx:
# fail the documentation build when Sphinx emits a warning
fail_on_warning: true
# Sphinx configuration file to use
# (presumably resolved relative to the repository root - confirm)
configuration: docs/conf.py

build:
os: "ubuntu-22.04"
Expand Down

0 comments on commit a477a7c

Please sign in to comment.