From c1e6acaf32a31b1543aa8e6b6179708ddb310e28 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Fri, 6 Feb 2026 11:38:57 -0800 Subject: [PATCH 01/13] Enabling ci build for py-torch 2.9, 2.10 on rocm --- .../builtin/packages/hwloc/package.py | 2 +- .../PR152569-Update-spack-includes-2.5.patch | 5 +-- .../PR152569-Update-spack-includes-2.7.patch | 14 ++++++-- .../builtin/packages/py_torch/package.py | 36 ++++++++++++++----- stacks/ml-linux-x86_64-rocm/spack.yaml | 27 +++++++------- 5 files changed, 56 insertions(+), 28 deletions(-) diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 65ad3db3056..9ab963dfdff 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib") + depends_on("rocm-smi-lib@7.0:") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc. 
diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch index 2e7a80bcbe8..2c35aafac2f 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch @@ -25,10 +25,10 @@ index 9be7f37..39d0f24 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a2..e0de4b1 100644 +index 1c0d3a2..83f9f9d 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake -@@ -167,6 +167,10 @@ if(HIP_FOUND) +@@ -167,6 +167,11 @@ if(HIP_FOUND) find_package_and_print_version(hipsolver REQUIRED) find_package_and_print_version(hiprtc REQUIRED) @@ -36,6 +36,7 @@ index 1c0d3a2..e0de4b1 100644 + list(APPEND ROCM_INCLUDE ${rocprim_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${hipcub_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${rocRAND_INCLUDE_DIR}) ++ list(APPEND ROCM_INCLUDE $ENV{AOTRITON_INSTALLED_PREFIX}/include) find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib) # TODO: miopen_LIBRARIES should return fullpath to the library file, diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch index 4392e00d76a..173aabc12aa 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch @@ -1,5 +1,5 @@ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index d2d23b7..620a89f 100644 +index d2d23b7ab65..620a89f65cb 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1379,13 +1379,6 @@ if(USE_ROCM) @@ -26,7 +26,7 @@ index d2d23b7..620a89f 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake 
-index 58c74dd..d3e1ad4 100644 +index 58c74ddda35..54f96871372 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -26,12 +26,6 @@ else() @@ -78,7 +78,15 @@ index 58c74dd..d3e1ad4 100644 find_package_and_print_version(amd_comgr REQUIRED) find_package_and_print_version(rocrand REQUIRED) find_package_and_print_version(hiprand REQUIRED) -@@ -171,7 +168,11 @@ if(HIP_FOUND) +@@ -157,6 +154,7 @@ if(HIP_FOUND) + find_package_and_print_version(hipcub REQUIRED) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) ++ list(APPEND ROCM_INCLUDE_DIRS $ENV{AOTRITON_INSTALLED_PREFIX}/include) + # workaround cmake 4 build issue + if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message(WARNING "Work around hiprtc cmake failure for cmake >= 4") +@@ -171,7 +169,11 @@ if(HIP_FOUND) if(UNIX) find_package_and_print_version(rccl) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 844a73f0e9e..c2f224b1b59 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -123,6 +123,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): conflicts("+gloo+rocm") conflicts("+rocm", when="@2.3", msg="Rocm doesn't support py-torch 2.3 release") conflicts("+rocm", when="@2.4", msg="Rocm doesn't support py-torch 2.4 release") + conflicts("+rocm", when="@2.8", msg="Rocm doesn't support py-torch 2.8 release") conflicts("+tensorpipe", when="+rocm ^hip@:5.1", msg="TensorPipe not supported until ROCm 5.2") conflicts("+breakpad", when="target=ppc64:") conflicts("+breakpad", when="target=ppc64le:") @@ -305,7 +306,8 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("valgrind", when="+valgrind") with when("+rocm"): depends_on("hsa-rocr-dev") - depends_on("hip") + depends_on("hip@7.0:", when="@2.9:") + 
depends_on("hip@:6.4", when="@:2.7") depends_on("rccl", when="+nccl") depends_on("rocprim") depends_on("hipcub") @@ -320,11 +322,20 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") + for target in ROCmPackage.amdgpu_targets: + depends_on(f"composable-kernel amdgpu_target={target}", when=f"amdgpu_target={target}") + # This constraint applies to ANY hipblaslt in the dependency tree + # including the one used by miopen-hip + depends_on(f"hipblaslt amdgpu_target={target}", when=f"amdgpu_target={target}") + # Ensure hipblaslt version for 2.9+ + depends_on( + f"hipblaslt@7.0: amdgpu_target={target}", when=f"@2.9: amdgpu_target={target}" + ) depends_on("rocminfo") - depends_on("aotriton@0.8.1b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7:") - depends_on("composable-kernel@:6.3.2", when="@2.5") - depends_on("composable-kernel@6.3.2:", when="@2.6:") + depends_on("hipsparselt@7.0:", when="@2.9:") + depends_on("aotriton@0.8b", when="@2.5:2.6") + depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") depends_on("ucx", when="+ucc") @@ -568,6 +579,14 @@ def patch(self): "torch_global_deps PROPERTIES LINKER_LANGUAGE CXX", "caffe2/CMakeLists.txt", ) + if self.spec.satisfies("@2.5:+rocm"): + filter_file( + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)", + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)\n" + "set(ROCTRACER_INCLUDE_DIR $ENV{ROCTRACER_INCLUDE_DIR})", + "cmake/public/LoadHIP.cmake", + string=True, + ) if self.spec.satisfies("@2.1:2.7+rocm"): filter_file( "${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h", @@ -757,9 +776,10 @@ def enable_or_disable(variant, keyword="USE", var=None): env.set("BLAS", "FLAME") env.set("WITH_BLAS", "FLAME") elif self.spec["blas"].name == "intel-oneapi-mkl": - env.set("BLAS", "MKL") - env.set("WITH_BLAS", "mkl") - 
env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) + if "+mkldnn" in self.spec: + env.set("BLAS", "MKL") + env.set("WITH_BLAS", "mkl") + env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) elif self.spec["blas"].name == "openblas": env.set("BLAS", "OpenBLAS") env.set("WITH_BLAS", "open") diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index b0533168015..7d387f53431 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -48,23 +48,22 @@ spack: # - py-keras backend=torch # PyTorch - # Does not yet support Spack-installed ROCm - # - py-botorch - # - py-gpytorch - # - py-kornia - # - py-lightning - # - py-pytorch-lightning - # - py-segmentation-models-pytorch - # - py-timm - # - py-torch - # - py-torch-geometric + - py-botorch + - py-gpytorch + - py-kornia + - py-lightning + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-geometric # - py-torch-nvidia-apex # - py-torchaudio - # - py-torchdata - # - py-torchgeo - # - py-torchmetrics + - py-torchdata + - py-torchgeo + - py-torchmetrics # - py-torchvision - # - py-vector-quantize-pytorch + - py-vector-quantize-pytorch # scikit-learn - py-scikit-learn From 22b06cb507640b203c8cc886ca394d05a31ad8b9 Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Mon, 9 Feb 2026 01:04:42 -0800 Subject: [PATCH 02/13] Removing mkldnn check and manual variant --- .../builtin/packages/py_torch/package.py | 20 +++++++------------ stacks/ml-linux-x86_64-rocm/spack.yaml | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index c2f224b1b59..d360242270a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -322,15 +322,10 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): 
depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") - for target in ROCmPackage.amdgpu_targets: - depends_on(f"composable-kernel amdgpu_target={target}", when=f"amdgpu_target={target}") - # This constraint applies to ANY hipblaslt in the dependency tree - # including the one used by miopen-hip - depends_on(f"hipblaslt amdgpu_target={target}", when=f"amdgpu_target={target}") - # Ensure hipblaslt version for 2.9+ - depends_on( - f"hipblaslt@7.0: amdgpu_target={target}", when=f"@2.9: amdgpu_target={target}" - ) + depends_on("composable-kernel") + depends_on("hipblaslt") + # Ensure hipblaslt version for 2.9+ + depends_on("hipblaslt@7.0:", when="@2.9:") depends_on("rocminfo") depends_on("hipsparselt@7.0:", when="@2.9:") depends_on("aotriton@0.8b", when="@2.5:2.6") @@ -776,10 +771,9 @@ def enable_or_disable(variant, keyword="USE", var=None): env.set("BLAS", "FLAME") env.set("WITH_BLAS", "FLAME") elif self.spec["blas"].name == "intel-oneapi-mkl": - if "+mkldnn" in self.spec: - env.set("BLAS", "MKL") - env.set("WITH_BLAS", "mkl") - env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) + env.set("BLAS", "MKL") + env.set("WITH_BLAS", "mkl") + env.set("INTEL_MKL_DIR", self.spec["mkl"].prefix.mkl.latest) elif self.spec["blas"].name == "openblas": env.set("BLAS", "OpenBLAS") env.set("WITH_BLAS", "open") diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index be94ec835c6..700c3bfa66a 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -43,7 +43,7 @@ spack: # Keras - py-keras backend=tensorflow # - py-keras backend=jax - # - py-keras backend=torch + - py-keras backend=torch # PyTorch - py-botorch From c397270c45ba1d45398ee87abb605a54cd01d20b Mon Sep 17 00:00:00 2001 From: Renjith Ravindran Date: Tue, 10 Feb 2026 22:54:17 -0800 Subject: [PATCH 03/13] version check correction for aotriton --- repos/spack_repo/builtin/packages/aotriton/package.py | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/repos/spack_repo/builtin/packages/aotriton/package.py b/repos/spack_repo/builtin/packages/aotriton/package.py index 673c678a94b..c23aaf7c94a 100644 --- a/repos/spack_repo/builtin/packages/aotriton/package.py +++ b/repos/spack_repo/builtin/packages/aotriton/package.py @@ -84,7 +84,7 @@ def patch(self): string=True, ) - if self.spec.satisfies("@:0.9"): + if self.spec.satisfies("@:0.9b"): filter_file( r"LLVM_INCLUDE_DIRS", f"{self.spec['aotriton-llvm'].prefix}/include", @@ -103,7 +103,7 @@ def patch(self): "third_party/triton/python/setup.py", string=True, ) - if self.spec.satisfies("@0.10:"): + if self.spec.satisfies("@0.10b:"): filter_file( r"LLVM_INCLUDE_DIRS", f"{self.spec['aotriton-llvm'].prefix}/include", From 58629f9140f04ba4c23f6ff16a5448cf28c37db5 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Wed, 11 Feb 2026 22:47:28 -0500 Subject: [PATCH 04/13] Increase timeout for ck --- .ci/gitlab/configs/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/gitlab/configs/ci.yaml b/.ci/gitlab/configs/ci.yaml index 9dc89ba702b..6e6e4c3c3b5 100644 --- a/.ci/gitlab/configs/ci.yaml +++ b/.ci/gitlab/configs/ci.yaml @@ -23,7 +23,7 @@ ci: script:: - - if [ -n "$SPACK_EXTRA_MIRROR" ]; then spack mirror add local "${SPACK_EXTRA_MIRROR}/${SPACK_CI_STACK_NAME}"; fi - spack config blame mirrors - - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 300 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 1200 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) after_script: - - cat /proc/loadavg || true - cat /proc/meminfo | grep 'MemTotal\|MemFree' || true From a5e4a0ab6e21797be9566a18058078318235c4c2 Mon Sep 17 00:00:00 2001 From:
renjithravindrankannath Date: Thu, 12 Feb 2026 15:02:46 -0500 Subject: [PATCH 05/13] 24h for long ROCm/ML rebuilds; GitLab project/runner max must allow this --- .ci/gitlab/configs/linux/ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/gitlab/configs/linux/ci.yaml b/.ci/gitlab/configs/linux/ci.yaml index 92a1cd14d9c..7edfea15803 100644 --- a/.ci/gitlab/configs/linux/ci.yaml +++ b/.ci/gitlab/configs/linux/ci.yaml @@ -13,6 +13,8 @@ ci: - wrf build-job: tags: [ "spack", "huge" ] + # 24h for long ROCm/ML rebuilds; GitLab project/runner max must allow this + timeout: 1440 minutes variables: CI_JOB_SIZE: huge SPACK_BUILD_JOBS: "12" From 12b27100c1ef65dc1d4df987fd140d6e6b03cffc Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Thu, 12 Feb 2026 15:07:00 -0500 Subject: [PATCH 06/13] Reverting commit 58629f9140f04ba4c23f6ff16a5448cf28c37db5 timeout for ck --- .ci/gitlab/configs/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/gitlab/configs/ci.yaml b/.ci/gitlab/configs/ci.yaml index 6e6e4c3c3b5..9dc89ba702b 100644 --- a/.ci/gitlab/configs/ci.yaml +++ b/.ci/gitlab/configs/ci.yaml @@ -23,7 +23,7 @@ ci: script:: - - if [ -n "$SPACK_EXTRA_MIRROR" ]; then spack mirror add local "${SPACK_EXTRA_MIRROR}/${SPACK_CI_STACK_NAME}"; fi - spack config blame mirrors - - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 1200 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) + - - spack --color=always --backtrace ci rebuild -j ${SPACK_BUILD_JOBS} --tests --timeout 300 > >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_out.txt) 2> >(tee ${SPACK_ARTIFACTS_ROOT}/user_data/pipeline_err.txt >&2) after_script: - - cat /proc/loadavg || true - cat /proc/meminfo | grep 'MemTotal\|MemFree' || true From 441d218eef484184150d77a0991bad683187c148 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Tue, 17 Feb 2026 12:42:58 -0500
Subject: [PATCH 07/13] py-torchvision requires rocm math lib paths indirectly when py-torch is built with rocm --- .../builtin/packages/py_torchvision/package.py | 12 ++++++++++++ stacks/ml-linux-x86_64-rocm/spack.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 4afaa9895c7..073a4580c34 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -197,6 +197,18 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: query = self.spec[dep.name] include.extend(query.headers.directories) library.extend(query.libs.directories) + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, + # hipblaslt and hipsolver headers; when building with ROCm we need these headers + # in the include path (py-torch depends on these headers, but it is not a direct + # link dep of torchvision). + if "^py-torch+rocm" in self.spec: + include.extend(self.spec["rocthrust"].headers.directories) + include.extend(self.spec["rocprim"].headers.directories) + include.extend(self.spec["hipsparse"].headers.directories) + include.extend(self.spec["hipblas"].headers.directories) + include.extend(self.spec["hipblas-common"].headers.directories) + include.extend(self.spec["hipblaslt"].headers.directories) + include.extend(self.spec["hipsolver"].headers.directories) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. 
Build uses a mix of Spack's compiler wrapper diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index 700c3bfa66a..de5cbe445e6 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -60,7 +60,7 @@ spack: - py-torchdata - py-torchgeo - py-torchmetrics - # - py-torchvision + - py-torchvision - py-vector-quantize-pytorch # scikit-learn From 02e36f6cc75f4b09b8e20f00fed6ab24b5017fac Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Wed, 18 Feb 2026 12:10:23 -0500 Subject: [PATCH 08/13] Only add paths for packages that are in the spec to avoid KeyError --- .../packages/py_torchvision/package.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 073a4580c34..f253fdde728 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -197,18 +197,24 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: query = self.spec[dep.name] include.extend(query.headers.directories) library.extend(query.libs.directories) + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, - # hipblaslt and hipsolver headers; when building with ROCm we need these headers - # in the include path (py-torch depends on these headers, but it is not a direct - # link dep of torchvision). + # hipblaslt and hipsolver headers; when building with ROCm we need these in the + # include path (py-torch depends on them, but they are not direct link deps of + # torchvision). Only add paths for packages that are in the spec to avoid KeyError. 
if "^py-torch+rocm" in self.spec: - include.extend(self.spec["rocthrust"].headers.directories) - include.extend(self.spec["rocprim"].headers.directories) - include.extend(self.spec["hipsparse"].headers.directories) - include.extend(self.spec["hipblas"].headers.directories) - include.extend(self.spec["hipblas-common"].headers.directories) - include.extend(self.spec["hipblaslt"].headers.directories) - include.extend(self.spec["hipsolver"].headers.directories) + rocm_include_pkgs = [ + "rocthrust", + "rocprim", + "hipsparse", + "hipblas", + "hipblas-common", + "hipblaslt", + "hipsolver", + ] + for pkg in rocm_include_pkgs: + if pkg in self.spec: + include.extend(self.spec[pkg].headers.directories) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper From bc1a5f081a8c4d887a042c9de9f45b4f9a814907 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Thu, 19 Feb 2026 02:17:09 -0500 Subject: [PATCH 09/13] libtorch_hip.so needs aotriton and hip libs at runtime --- .../spack_repo/builtin/packages/py_torchvision/package.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index f253fdde728..533b6a6cbcd 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -216,6 +216,14 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: if pkg in self.spec: include.extend(self.spec[pkg].headers.directories) + # At build time, torchvision's setup imports torch; libtorch_hip.so then + # needs aotriton and hip libs at runtime. Add their lib dirs so the loader + # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). 
+ for pkg in ["aotriton", "hip"]: + if pkg in self.spec: + for lib_dir in self.spec[pkg].libs.directories: + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. From 16ac8a8e3c77eb53c0db9506c02a79b06555f1a4 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Fri, 20 Feb 2026 13:20:06 -0500 Subject: [PATCH 10/13] Correcting the library path with prefix --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 533b6a6cbcd..9370f87e0a0 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -221,7 +221,7 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). 
for pkg in ["aotriton", "hip"]: if pkg in self.spec: - for lib_dir in self.spec[pkg].libs.directories: + for lib_dir in self.spec[pkg].prefix.lib: env.prepend_path("LD_LIBRARY_PATH", lib_dir) # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but From ee6e80e90e2cecd379abd9470283368d6e8d1463 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Sat, 21 Feb 2026 11:18:35 -0500 Subject: [PATCH 11/13] Add prefix lib dirs when they exist so the loader can find .so files --- .../packages/py_torchvision/package.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 9370f87e0a0..011ab95c08a 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import os from spack_repo.builtin.build_systems.python import PythonPackage @@ -220,11 +221,21 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: # needs aotriton and hip libs at runtime. Add their lib dirs so the loader # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). for pkg in ["aotriton", "hip"]: - if pkg in self.spec: - for lib_dir in self.spec[pkg].prefix.lib: + if pkg not in self.spec: + continue + try: + for lib_dir in self.spec[pkg].libs.directories: env.prepend_path("LD_LIBRARY_PATH", lib_dir) - - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but + except NoLibrariesError: + # Package may not declare 'libraries' (e.g. aotriton), so Spack + # cannot recursively locate libs. Add prefix lib dirs when they + # exist so the loader can find .so files (lib, lib64, or both). 
+ for sub in ("lib", "lib64"): + lib_dir = os.path.join(self.spec[pkg].prefix, sub) + if os.path.isdir(lib_dir): + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. # See https://github.com/pytorch/vision/issues/2591 From a3fb110f9daa581e4764e102819f4514d80c1084 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Sat, 21 Feb 2026 16:42:22 -0500 Subject: [PATCH 12/13] style error fix --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 011ab95c08a..8fe26cf7343 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -235,7 +235,7 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: if os.path.isdir(lib_dir): env.prepend_path("LD_LIBRARY_PATH", lib_dir) - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. 
# See https://github.com/pytorch/vision/issues/2591 From 6087d68827de791a11de53d94821889e0c00b971 Mon Sep 17 00:00:00 2001 From: renjithravindrankannath Date: Mon, 23 Feb 2026 11:35:30 -0500 Subject: [PATCH 13/13] import NoLibrariesError --- repos/spack_repo/builtin/packages/py_torchvision/package.py | 1 + 1 file changed, 1 insertion(+) diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 8fe26cf7343..dff76f276c9 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -6,6 +6,7 @@ from spack_repo.builtin.build_systems.python import PythonPackage +from spack.error import NoLibrariesError from spack.package import *