diff --git a/.ci/gitlab/configs/ci.yaml b/.ci/gitlab/configs/ci.yaml index 163dd59a2dc..06f3e511d40 100644 --- a/.ci/gitlab/configs/ci.yaml +++ b/.ci/gitlab/configs/ci.yaml @@ -4,6 +4,8 @@ ci: broken-tests-packages: - superlu-dist # srun -n 4 hangs - papyrus + - composable-kernel + - py-llvmlite pipeline-gen: - build-job: diff --git a/.ci/gitlab/configs/linux/ci.yaml b/.ci/gitlab/configs/linux/ci.yaml index 9add0f89153..2a6935cec04 100644 --- a/.ci/gitlab/configs/linux/ci.yaml +++ b/.ci/gitlab/configs/linux/ci.yaml @@ -13,6 +13,7 @@ ci: - wrf build-job: tags: [ "spack", "huge" ] + timeout: 1200 minutes variables: CI_JOB_SIZE: huge SPACK_BUILD_JOBS: "12" diff --git a/repos/spack_repo/builtin/packages/aotriton/package.py b/repos/spack_repo/builtin/packages/aotriton/package.py index 673c678a94b..17a57f6549d 100644 --- a/repos/spack_repo/builtin/packages/aotriton/package.py +++ b/repos/spack_repo/builtin/packages/aotriton/package.py @@ -60,6 +60,7 @@ class Aotriton(CMakePackage): depends_on("pkgconfig", type="build") # build llvm version with mlir with the commit that matches inside the llvm-hash.txt + depends_on("aotriton-llvm@0.10", when="@0.10b") depends_on("aotriton-llvm@0.9", when="@0.9b") depends_on("aotriton-llvm@0.8", when="@0.8b") diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 65ad3db3056..9ab963dfdff 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib") + depends_on("rocm-smi-lib@7.0:") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc. diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch index 2e7a80bcbe8..2c35aafac2f 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch @@ -25,10 +25,10 @@ index 9be7f37..39d0f24 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a2..e0de4b1 100644 +index 1c0d3a2..83f9f9d 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake -@@ -167,6 +167,10 @@ if(HIP_FOUND) +@@ -167,6 +167,11 @@ if(HIP_FOUND) find_package_and_print_version(hipsolver REQUIRED) find_package_and_print_version(hiprtc REQUIRED) @@ -36,6 +36,7 @@ index 1c0d3a2..e0de4b1 100644 + list(APPEND ROCM_INCLUDE ${rocprim_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${hipcub_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${rocRAND_INCLUDE_DIR}) ++ list(APPEND ROCM_INCLUDE $ENV{AOTRITON_INSTALLED_PREFIX}/include) find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib) # TODO: miopen_LIBRARIES should return fullpath to the library file, diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch index 4392e00d76a..173aabc12aa 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch @@ -1,5 +1,5 @@ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index d2d23b7..620a89f 100644 +index d2d23b7ab65..620a89f65cb 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1379,13 +1379,6 @@ if(USE_ROCM) @@ -26,7 +26,7 @@ index d2d23b7..620a89f 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 58c74dd..d3e1ad4 100644 +index 58c74ddda35..54f96871372 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -26,12 +26,6 @@ else() @@ -78,7 +78,15 @@ index 58c74dd..d3e1ad4 100644 find_package_and_print_version(amd_comgr REQUIRED) find_package_and_print_version(rocrand REQUIRED) find_package_and_print_version(hiprand REQUIRED) -@@ -171,7 +168,11 @@ if(HIP_FOUND) +@@ -157,6 +154,7 @@ if(HIP_FOUND) + find_package_and_print_version(hipcub REQUIRED) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) ++ list(APPEND ROCM_INCLUDE_DIRS $ENV{AOTRITON_INSTALLED_PREFIX}/include) + # workaround cmake 4 build issue + if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message(WARNING "Work around hiprtc cmake failure for cmake >= 4") +@@ -171,7 +169,11 @@ if(HIP_FOUND) if(UNIX) find_package_and_print_version(rccl) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 844a73f0e9e..510408b1987 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -123,6 +123,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): conflicts("+gloo+rocm") conflicts("+rocm", when="@2.3", msg="Rocm doesn't support py-torch 2.3 release") conflicts("+rocm", when="@2.4", msg="Rocm doesn't support py-torch 2.4 release") + conflicts("+rocm", when="@2.8", msg="Rocm doesn't support py-torch 2.8 release") conflicts("+tensorpipe", when="+rocm ^hip@:5.1", msg="TensorPipe not supported until ROCm 5.2") conflicts("+breakpad", when="target=ppc64:") conflicts("+breakpad", when="target=ppc64le:") @@ -305,7 +306,8 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("valgrind", when="+valgrind") with when("+rocm"): depends_on("hsa-rocr-dev") - depends_on("hip") + depends_on("hip@7.0:", when="@2.9:") + depends_on("hip@:6.4", when="@:2.7") depends_on("rccl", when="+nccl") depends_on("rocprim") depends_on("hipcub") @@ -320,11 +322,20 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") + for target in ROCmPackage.amdgpu_targets: + depends_on(f"composable-kernel amdgpu_target={target}", when=f"amdgpu_target={target}") + # This constraint applies to ANY hipblaslt in the dependency tree + # including the one used by miopen-hip + depends_on(f"hipblaslt amdgpu_target={target}", when=f"amdgpu_target={target}") + # Ensure hipblaslt version for 2.9+ + depends_on( + f"hipblaslt@7.0: amdgpu_target={target}", when=f"@2.9: amdgpu_target={target}" + ) depends_on("rocminfo") - depends_on("aotriton@0.8.1b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7:") - depends_on("composable-kernel@:6.3.2", when="@2.5") - depends_on("composable-kernel@6.3.2:", when="@2.6:") + depends_on("hipsparselt@7.0:", when="@2.9:") + depends_on("aotriton@0.8b", when="@2.5:2.6") + depends_on("aotriton@0.9.2b", when="@2.7") + depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") depends_on("ucx", when="+ucc") @@ -568,6 +579,14 @@ def patch(self): "torch_global_deps PROPERTIES LINKER_LANGUAGE CXX", "caffe2/CMakeLists.txt", ) + if self.spec.satisfies("@2.5:+rocm"): + filter_file( + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)", + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)\n" + "set(ROCTRACER_INCLUDE_DIR $ENV{ROCTRACER_INCLUDE_DIR})", + "cmake/public/LoadHIP.cmake", + string=True, + ) if self.spec.satisfies("@2.1:2.7+rocm"): filter_file( "${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h", diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 4afaa9895c7..011ab95c08a 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import os from spack_repo.builtin.build_systems.python import PythonPackage @@ -198,7 +199,43 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: include.extend(query.headers.directories) library.extend(query.libs.directories) - # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, + # hipblaslt and hipsolver headers; when building with ROCm we need these in the + # include path (py-torch depends on them, but they are not direct link deps of + # torchvision). Only add paths for packages that are in the spec to avoid KeyError. + if "^py-torch+rocm" in self.spec: + rocm_include_pkgs = [ + "rocthrust", + "rocprim", + "hipsparse", + "hipblas", + "hipblas-common", + "hipblaslt", + "hipsolver", + ] + for pkg in rocm_include_pkgs: + if pkg in self.spec: + include.extend(self.spec[pkg].headers.directories) + + # At build time, torchvision's setup imports torch; libtorch_hip.so then + # needs aotriton and hip libs at runtime. Add their lib dirs so the loader + # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). + for pkg in ["aotriton", "hip"]: + if pkg not in self.spec: + continue + try: + for lib_dir in self.spec[pkg].libs.directories: + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + except NoLibrariesError: + # Package may not declare 'libraries' (e.g. aotriton), so Spack + # cannot recursively locate libs. Add prefix lib dirs when they + # exist so the loader can find .so files (lib, lib64, or both). + for sub in ("lib", "lib64"): + lib_dir = os.path.join(self.spec[pkg].prefix, sub) + if os.path.isdir(lib_dir): + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. # See https://github.com/pytorch/vision/issues/2591 diff --git a/stacks/e4s-rocm-external/spack.yaml b/stacks/e4s-rocm-external/spack.yaml index c9005354df9..172bd8f2c66 100644 --- a/stacks/e4s-rocm-external/spack.yaml +++ b/stacks/e4s-rocm-external/spack.yaml @@ -259,6 +259,8 @@ spack: image: ghcr.io/spack/e4s-rocm-base-x86_64:v6.4.3-1760790880 broken-tests-packages: - paraview + - composable-kernel + - py-llvmlite cdash: build-group: E4S ROCm External diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index ffdcad7de2a..418cd06d47b 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -46,23 +46,22 @@ spack: # - py-keras backend=torch # PyTorch - # Does not yet support Spack-installed ROCm - # - py-botorch - # - py-gpytorch - # - py-kornia - # - py-lightning - # - py-pytorch-lightning - # - py-segmentation-models-pytorch - # - py-timm - # - py-torch - # - py-torch-geometric + - py-botorch + - py-gpytorch + - py-kornia + - py-lightning + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-geometric # - py-torch-nvidia-apex # - py-torchaudio - # - py-torchdata - # - py-torchgeo - # - py-torchmetrics + - py-torchdata + - py-torchgeo + - py-torchmetrics # - py-torchvision - # - py-vector-quantize-pytorch + - py-vector-quantize-pytorch # scikit-learn - py-scikit-learn @@ -82,11 +81,13 @@ spack: # - py-xgboost ci: + broken-tests-packages: + - composable-kernel + - py-llvmlite pipeline-gen: - build-job: image: name: ghcr.io/spack/ubuntu-24.04:v2025-09-15 entrypoint: [''] - cdash: build-group: Machine Learning