Skip to content

Commit

Permalink
ci: testing with torch==2.3 /test and move 2.4 /nightly (#86)
Browse files Browse the repository at this point in the history
  • Loading branch information
Borda authored Mar 30, 2024
1 parent b24e5b2 commit 926121c
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
4 changes: 3 additions & 1 deletion .azure/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ jobs:
# CUDA 12.1
'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.2':
'cuda 12.1 | torch 2.3 /test | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.2.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"}
#'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
# how much time to give tasks that are set to run always (even if the job is cancelled) before stopping them
Expand Down
13 changes: 11 additions & 2 deletions .azure/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,20 @@ jobs:
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
# how long to run the job before automatically cancelling
timeoutInMinutes: "35"
# how much time to give tasks that are set to run always (even if the job is cancelled) before stopping them
cancelTimeoutInMinutes: "2"
pool: "lit-rtx-3090"
Expand Down Expand Up @@ -89,6 +94,7 @@ jobs:
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
# compile coverage results
Expand All @@ -98,6 +104,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
displayName: 'Testing: regular'
- bash: |
Expand All @@ -117,6 +124,7 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "15"
displayName: 'Testing: networks'
#- bash: |
Expand All @@ -138,6 +146,7 @@ jobs:
# ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
# --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'distributed')
timeoutInMinutes: "20"
displayName: 'Testing: distributed'
# todo (mruberry): decide whether this should be here or in another workflow
Expand Down
9 changes: 8 additions & 1 deletion dockers/ubuntu-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,14 @@ RUN \
cd .. && \
rm -rf Fuser ; \
elif [ "${TORCH_INSTALL}" == "test" ]; then \
echo "Not supported option" ; \
# building nvFuser from source \
git clone https://github.com/NVIDIA/Fuser.git && \
cd Fuser && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
python setup.py install --no-test --no-benchmark && \
cd .. && \
rm -rf Fuser ; \
else \
# installing pytorch from wheels \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
Expand Down

0 comments on commit 926121c

Please sign in to comment.