From 5d18fce56a2e4c18ef04306c7c7b043cac882dac Mon Sep 17 00:00:00 2001
From: Vedaanta Agarwalla <142048820+vedaanta@users.noreply.github.com>
Date: Wed, 19 Jun 2024 23:12:16 -0700
Subject: [PATCH] Bumps cudnn FE to v1.5.1 (#593)

---
 .azure/docker-build.yml        | 8 ++++----
 .azure/gpu-tests.yml           | 8 ++++----
 .azure/notebook-runs.yml       | 4 ++--
 dockers/ubuntu-cuda/Dockerfile | 2 +-
 thunder/executors/cudnnex.py   | 7 +++++--
 5 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml
index a33f4a2f58..df8a8874df 100644
--- a/.azure/docker-build.yml
+++ b/.azure/docker-build.yml
@@ -40,10 +40,10 @@ jobs:
       #maxParallel: "3"
       matrix:
         # CUDA 12.1
-        "cuda 12.1 | torch 2.3 | cudnn FE v1.4":
-          { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.3.0", TRITON_VERSION: "2.3.0", CUDNN_FRONTEND_VERSION: "1.4.0" }
-        "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.4":
-          { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.4.0" }
+        "cuda 12.1 | torch 2.3 | cudnn FE v1.5.1":
+          { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.3.0", TRITON_VERSION: "2.3.0", CUDNN_FRONTEND_VERSION: "1.5.1" }
+        "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.5.1":
+          { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.5.1" }
         #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 6ba3398ebd..0b8bfdb348 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -17,17 +17,17 @@ jobs:
       matrix:
         # CUDA 12.1
         "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev"
           CUDA_VERSION_MM: "121"
         "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev"
           CUDA_VERSION_MM: "121"
           testing: "distributed"
         "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev"
           CUDA_VERSION_MM: "121"
         "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev"
           CUDA_VERSION_MM: "121"
           testing: "distributed"
     # how much time to give 'run always even if cancelled tasks' before stopping them
diff --git a/.azure/notebook-runs.yml b/.azure/notebook-runs.yml
index 5181fba015..bfc326fb4c 100644
--- a/.azure/notebook-runs.yml
+++ b/.azure/notebook-runs.yml
@@ -16,10 +16,10 @@ jobs:
     strategy:
       matrix:
         "ubuntu22.04 | cuda 12.1 | torch 2.3":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_2.3.0-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_2.3.0-dev"
           CUDA_VERSION_MM: "121"
         "ubuntu22.04 | cuda 12.1 | torch-nightly":
-          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.4.0-py3.10-pt_main-dev"
+          docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.5.1-py3.10-pt_main-dev"
           CUDA_VERSION_MM: "121"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "45"
diff --git a/dockers/ubuntu-cuda/Dockerfile b/dockers/ubuntu-cuda/Dockerfile
index 6da7b4418f..d6cd7c8e6f 100644
--- a/dockers/ubuntu-cuda/Dockerfile
+++ b/dockers/ubuntu-cuda/Dockerfile
@@ -20,7 +20,7 @@ ARG IMAGE_TYPE="devel"
 FROM nvidia/cuda:${CUDA_VERSION}-${IMAGE_TYPE}-ubuntu${UBUNTU_VERSION}
 
 ARG CUDNN_VERSION="9.1.0.70"
-ARG CUDNN_FRONTEND_VERSION="1.4.0"
+ARG CUDNN_FRONTEND_VERSION="1.5.1"
 ARG PYTHON_VERSION="3.10"
 ARG TORCH_VERSION="2.2.1"
 ARG TRITON_VERSION="2.2.0"
diff --git a/thunder/executors/cudnnex.py b/thunder/executors/cudnnex.py
index 38aa997169..e2a67e0b94 100644
--- a/thunder/executors/cudnnex.py
+++ b/thunder/executors/cudnnex.py
@@ -28,8 +28,11 @@ def cudnn_version() -> LooseVersion | None:
 
 
 def required_cudnn_version() -> LooseVersion:
-    # Using 1.3.0 majorly because it works better with other libraries (e.g. torch) that also build on top of cudnn backend
-    return LooseVersion("1.3.0")
+    # History of versions:
+    # Using 1.3.0+ because it works better with other libraries (e.g. torch) that also build on top of cudnn
+    # Using 1.5.0+ because it handles exceptions for unsupported graphs better
+    # Using 1.5.1 because of a compatibility fix
+    return LooseVersion("1.5.1")
 
 
 def cudnn_available() -> bool: