From 926121cfe48a2762ae8076e00ae1b1f50593bb45 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Sat, 30 Mar 2024 11:27:51 +0100
Subject: [PATCH] ci: testing with `torch==2.3` /test and move `2.4` /nightly (#86)

---
 .azure/docker-build.yml        |  4 +++-
 .azure/gpu-tests.yml           | 13 +++++++++++--
 dockers/ubuntu-cuda/Dockerfile |  9 ++++++++-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml
index a1803b596d..dc353f689f 100644
--- a/.azure/docker-build.yml
+++ b/.azure/docker-build.yml
@@ -42,7 +42,9 @@ jobs:
         # CUDA 12.1
         'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
           {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"}
-        'cuda 12.1 | torch 2.3 /nightly | cudnn FE v1.2':
+        'cuda 12.1 | torch 2.3 /test | cudnn FE v1.2':
+          {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.2.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.2.1"}
+        'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2':
           {CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"}
         #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
     # how much time to give 'run always even if cancelled tasks' before stopping them
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 2671f5309f..df030eaefd 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -23,6 +23,13 @@ jobs:
           docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
           CUDA_VERSION_MM: '121'
           testing: 'distributed'
+        'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
+          docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
+          CUDA_VERSION_MM: '121'
+        'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed':
+          docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
+          CUDA_VERSION_MM: '121'
+          testing: 'distributed'
         'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
           docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
           CUDA_VERSION_MM: '121'
@@ -30,8 +37,6 @@ jobs:
           docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
           CUDA_VERSION_MM: '121'
           testing: 'distributed'
-    # how long to run the job before automatically cancelling
-    timeoutInMinutes: "35"
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
     pool: "lit-rtx-3090"
@@ -89,6 +94,7 @@ jobs:
               --timeout=240 \
               --random-order-seed=42 \
               --durations=250 \
+              --timeout=240 \
               --numprocesses=9 \
               --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
           # compile coverage results
@@ -98,6 +104,7 @@
           ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
             --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
         condition: ne(variables['testing'], 'distributed')
+        timeoutInMinutes: "30"
         displayName: 'Testing: regular'
 
       - bash: |
@@ -117,6 +124,7 @@
           ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
             --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
         condition: ne(variables['testing'], 'distributed')
+        timeoutInMinutes: "15"
         displayName: 'Testing: networks'
 
       #- bash: |
@@ -138,6 +146,7 @@
           # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
           #   --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
         condition: eq(variables['testing'], 'distributed')
+        timeoutInMinutes: "20"
         displayName: 'Testing: distributed'
 
   # todo (mruberry): decide whether this should be here or in another workflow
diff --git a/dockers/ubuntu-cuda/Dockerfile b/dockers/ubuntu-cuda/Dockerfile
index 5fd03bc134..e37e29c216 100644
--- a/dockers/ubuntu-cuda/Dockerfile
+++ b/dockers/ubuntu-cuda/Dockerfile
@@ -138,7 +138,14 @@ RUN \
         cd .. && \
         rm -rf Fuser ; \
     elif [ "${TORCH_INSTALL}" == "test" ]; then \
-        echo "Not supported option" ; \
+        # building nvFuser from source
+        git clone https://github.com/NVIDIA/Fuser.git && \
+        cd Fuser && \
+        git submodule update --init --recursive && \
+        pip install -r requirements.txt && \
+        python setup.py install --no-test --no-benchmark && \
+        cd .. && \
+        rm -rf Fuser ; \
     else \
         # installing pytorch from wheels \
         CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
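
Note (not part of the patch): the new TORCH_INSTALL='test' path amounts to roughly the following steps inside the CUDA image. This is an illustrative sketch only; the PyTorch test-channel index URL is an assumption, since the wheel-install branch of the Dockerfile is not shown in this hunk.

    # install the torch 2.3.0 release candidate from the PyTorch test channel (assumed URL)
    pip install "torch==2.3.0" --index-url https://download.pytorch.org/whl/test/cu121
    # then build nvFuser from source against that torch, as the patched 'test' branch does
    git clone https://github.com/NVIDIA/Fuser.git && cd Fuser
    git submodule update --init --recursive
    pip install -r requirements.txt
    python setup.py install --no-test --no-benchmark
    cd .. && rm -rf Fuser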