From 5d18fce56a2e4c18ef04306c7c7b043cac882dac Mon Sep 17 00:00:00 2001 From: Vedaanta Agarwalla <142048820+vedaanta@users.noreply.github.com> Date: Wed, 19 Jun 2024 23:12:16 -0700 Subject: [PATCH] Bumps cudnn FE to v1.5 (#593) --- .azure/docker-build.yml | 8 ++++---- .azure/gpu-tests.yml | 8 ++++---- .azure/notebook-runs.yml | 4 ++-- dockers/ubuntu-cuda/Dockerfile | 2 +- thunder/executors/cudnnex.py | 7 +++++-- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml index a33f4a2f58..df8a8874df 100644 --- a/.azure/docker-build.yml +++ b/.azure/docker-build.yml @@ -40,10 +40,10 @@ jobs: #maxParallel: "3" matrix: # CUDA 12.1 - "cuda 12.1 | torch 2.3 | cudnn FE v1.4": - { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.3.0", TRITON_VERSION: "2.3.0", CUDNN_FRONTEND_VERSION: "1.4.0" } - "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.4": - { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.4.0" } + "cuda 12.1 | torch 2.3 | cudnn FE v1.5.1": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.3.0", TRITON_VERSION: "2.3.0", CUDNN_FRONTEND_VERSION: "1.5.1" } + "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.5.1": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.5.1" } #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 6ba3398ebd..0b8bfdb348 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -17,17 +17,17 @@ jobs: matrix: # CUDA 12.1 "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev" CUDA_VERSION_MM: "121" "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | 
distributed": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev" CUDA_VERSION_MM: "121" testing: "distributed" "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev" CUDA_VERSION_MM: "121" "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev" CUDA_VERSION_MM: "121" testing: "distributed" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.azure/notebook-runs.yml b/.azure/notebook-runs.yml index 5181fba015..bfc326fb4c 100644 --- a/.azure/notebook-runs.yml +++ b/.azure/notebook-runs.yml @@ -16,10 +16,10 @@ jobs: strategy: matrix: "ubuntu22.04 | cuda 12.1 | torch 2.3": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev" CUDA_VERSION_MM: "121" "ubuntu22.04 | cuda 12.1 | torch-nightly": - docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev" + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev" CUDA_VERSION_MM: "121" # how long to run the job before automatically cancelling timeoutInMinutes: "45" diff --git a/dockers/ubuntu-cuda/Dockerfile b/dockers/ubuntu-cuda/Dockerfile index 6da7b4418f..d6cd7c8e6f 100644 --- a/dockers/ubuntu-cuda/Dockerfile +++ b/dockers/ubuntu-cuda/Dockerfile @@ -20,7 +20,7 @@ ARG IMAGE_TYPE="devel" FROM nvidia/cuda:${CUDA_VERSION}-${IMAGE_TYPE}-ubuntu${UBUNTU_VERSION} ARG CUDNN_VERSION="9.1.0.70" -ARG CUDNN_FRONTEND_VERSION="1.4.0" +ARG CUDNN_FRONTEND_VERSION="1.5.1" ARG PYTHON_VERSION="3.10" ARG TORCH_VERSION="2.2.1" ARG TRITON_VERSION="2.2.0" diff --git 
a/thunder/executors/cudnnex.py b/thunder/executors/cudnnex.py index 38aa997169..e2a67e0b94 100644 --- a/thunder/executors/cudnnex.py +++ b/thunder/executors/cudnnex.py @@ -28,8 +28,11 @@ def cudnn_version() -> LooseVersion | None: def required_cudnn_version() -> LooseVersion: - # Using 1.3.0 majorly because it works better with other libraries (e.g. torch) that also build on top of cudnn backend - return LooseVersion("1.3.0") + # History of versions: + # Using 1.3.0+ because it works better with other libraries (e.g. torch) that also build on top of cudnn + # Using 1.5.0+ because it handles exceptions from unsupported graphs better + # Using 1.5.1 because of a compatibility fix + return LooseVersion("1.5.1") def cudnn_available() -> bool: