diff --git a/.ci/gitlab/configs/linux/ci.yaml b/.ci/gitlab/configs/linux/ci.yaml index 9add0f89153..67d2b921d5c 100644 --- a/.ci/gitlab/configs/linux/ci.yaml +++ b/.ci/gitlab/configs/linux/ci.yaml @@ -13,6 +13,8 @@ ci: - wrf build-job: tags: [ "spack", "huge" ] + # 24h for long ROCm/ML rebuilds; GitLab project/runner max must allow this + timeout: 1440 minutes variables: CI_JOB_SIZE: huge SPACK_BUILD_JOBS: "12" diff --git a/repos/spack_repo/builtin/packages/hwloc/package.py b/repos/spack_repo/builtin/packages/hwloc/package.py index 65ad3db3056..9ab963dfdff 100644 --- a/repos/spack_repo/builtin/packages/hwloc/package.py +++ b/repos/spack_repo/builtin/packages/hwloc/package.py @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage): depends_on("mpi", when="+netloc") with when("+rocm"): - depends_on("rocm-smi-lib") + depends_on("rocm-smi-lib@7.0:") depends_on("rocm-opencl", when="+opencl") # Avoid a circular dependency since the openmp # variant of llvm-amdgpu depends on hwloc. diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch index 2e7a80bcbe8..2c35aafac2f 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.5.patch @@ -25,10 +25,10 @@ index 9be7f37..39d0f24 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a2..e0de4b1 100644 +index 1c0d3a2..83f9f9d 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake -@@ -167,6 +167,10 @@ if(HIP_FOUND) +@@ -167,6 +167,11 @@ if(HIP_FOUND) find_package_and_print_version(hipsolver REQUIRED) find_package_and_print_version(hiprtc REQUIRED) @@ -36,6 +36,7 @@ index 1c0d3a2..e0de4b1 100644 + list(APPEND ROCM_INCLUDE ${rocprim_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${hipcub_INCLUDE_DIR}) + list(APPEND ROCM_INCLUDE ${rocRAND_INCLUDE_DIR}) ++ list(APPEND ROCM_INCLUDE $ENV{AOTRITON_INSTALLED_PREFIX}/include) find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib) # TODO: miopen_LIBRARIES should return fullpath to the library file, diff --git a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch index 4392e00d76a..173aabc12aa 100644 --- a/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch +++ b/repos/spack_repo/builtin/packages/py_torch/PR152569-Update-spack-includes-2.7.patch @@ -1,5 +1,5 @@ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index d2d23b7..620a89f 100644 +index d2d23b7ab65..620a89f65cb 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1379,13 +1379,6 @@ if(USE_ROCM) @@ -26,7 +26,7 @@ index d2d23b7..620a89f 100644 endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 58c74dd..d3e1ad4 100644 +index 58c74ddda35..54f96871372 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -26,12 +26,6 @@ else() @@ -78,7 +78,15 @@ index 58c74dd..d3e1ad4 100644 find_package_and_print_version(amd_comgr REQUIRED) find_package_and_print_version(rocrand REQUIRED) find_package_and_print_version(hiprand REQUIRED) -@@ -171,7 +168,11 @@ if(HIP_FOUND) +@@ -157,6 +154,7 @@ if(HIP_FOUND) + find_package_and_print_version(hipcub REQUIRED) + find_package_and_print_version(rocthrust REQUIRED) + find_package_and_print_version(hipsolver REQUIRED) ++ list(APPEND ROCM_INCLUDE_DIRS $ENV{AOTRITON_INSTALLED_PREFIX}/include) + # workaround cmake 4 build issue + if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0") + message(WARNING "Work around hiprtc cmake failure for cmake >= 4") +@@ -171,7 +169,11 @@ if(HIP_FOUND) if(UNIX) find_package_and_print_version(rccl) find_package_and_print_version(hsa-runtime64 REQUIRED) diff --git a/repos/spack_repo/builtin/packages/py_torch/package.py b/repos/spack_repo/builtin/packages/py_torch/package.py index 844a73f0e9e..7211109c95a 100644 --- a/repos/spack_repo/builtin/packages/py_torch/package.py +++ b/repos/spack_repo/builtin/packages/py_torch/package.py @@ -123,6 +123,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): conflicts("+gloo+rocm") conflicts("+rocm", when="@2.3", msg="Rocm doesn't support py-torch 2.3 release") conflicts("+rocm", when="@2.4", msg="Rocm doesn't support py-torch 2.4 release") + conflicts("+rocm", when="@2.8", msg="Rocm doesn't support py-torch 2.8 release") conflicts("+tensorpipe", when="+rocm ^hip@:5.1", msg="TensorPipe not supported until ROCm 5.2") conflicts("+breakpad", when="target=ppc64:") conflicts("+breakpad", when="target=ppc64le:") @@ -305,7 +306,8 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("valgrind", when="+valgrind") with when("+rocm"): depends_on("hsa-rocr-dev") - depends_on("hip") + depends_on("hip@7.0:", when="@2.9:") + depends_on("hip@:6.4", when="@:2.7") depends_on("rccl", when="+nccl") depends_on("rocprim") depends_on("hipcub") @@ -320,11 +322,15 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage): depends_on("rocfft") depends_on("rocblas") depends_on("miopen-hip") + depends_on("composable-kernel") + depends_on("hipblaslt") + # Ensure hipblaslt version for 2.9+ + depends_on("hipblaslt@7.0:", when="@2.9:") depends_on("rocminfo") - depends_on("aotriton@0.8.1b", when="@2.5:2.6") - depends_on("aotriton@0.9.1b", when="@2.7:") - depends_on("composable-kernel@:6.3.2", when="@2.5") - depends_on("composable-kernel@6.3.2:", when="@2.6:") + depends_on("hipsparselt@7.0:", when="@2.9:") + depends_on("aotriton@0.8b", when="@2.5:2.6") + depends_on("aotriton@0.9.1b", when="@2.7") + depends_on("aotriton@0.10b", when="@2.8:") depends_on("mpi", when="+mpi") depends_on("ucc", when="+ucc") depends_on("ucx", when="+ucc") @@ -568,6 +574,14 @@ def patch(self): "torch_global_deps PROPERTIES LINKER_LANGUAGE CXX", "caffe2/CMakeLists.txt", ) + if self.spec.satisfies("@2.5:+rocm"): + filter_file( + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)", + "find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)\n" + "set(ROCTRACER_INCLUDE_DIR $ENV{ROCTRACER_INCLUDE_DIR})", + "cmake/public/LoadHIP.cmake", + string=True, + ) if self.spec.satisfies("@2.1:2.7+rocm"): filter_file( "${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h", diff --git a/repos/spack_repo/builtin/packages/py_torchvision/package.py b/repos/spack_repo/builtin/packages/py_torchvision/package.py index 4afaa9895c7..dff76f276c9 100644 --- a/repos/spack_repo/builtin/packages/py_torchvision/package.py +++ b/repos/spack_repo/builtin/packages/py_torchvision/package.py @@ -2,9 +2,11 @@ # # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import os from spack_repo.builtin.build_systems.python import PythonPackage +from spack.error import NoLibrariesError from spack.package import * @@ -198,6 +200,42 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None: include.extend(query.headers.directories) library.extend(query.libs.directories) + # PyTorch headers include rocthrust, rocprim, hipsparse, hipblas, hipblas-common, + # hipblaslt and hipsolver headers; when building with ROCm we need these in the + # include path (py-torch depends on them, but they are not direct link deps of + # torchvision). Only add paths for packages that are in the spec to avoid KeyError. + if "^py-torch+rocm" in self.spec: + rocm_include_pkgs = [ + "rocthrust", + "rocprim", + "hipsparse", + "hipblas", + "hipblas-common", + "hipblaslt", + "hipsolver", + ] + for pkg in rocm_include_pkgs: + if pkg in self.spec: + include.extend(self.spec[pkg].headers.directories) + + # At build time, torchvision's setup imports torch; libtorch_hip.so then + # needs aotriton and hip libs at runtime. Add their lib dirs so the loader + # can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused). + for pkg in ["aotriton", "hip"]: + if pkg not in self.spec: + continue + try: + for lib_dir in self.spec[pkg].libs.directories: + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + except NoLibrariesError: + # Package may not declare 'libraries' (e.g. aotriton), so Spack + # cannot recursively locate libs. Add prefix lib dirs when they + # exist so the loader can find .so files (lib, lib64, or both). + for sub in ("lib", "lib64"): + lib_dir = os.path.join(self.spec[pkg].prefix, sub) + if os.path.isdir(lib_dir): + env.prepend_path("LD_LIBRARY_PATH", lib_dir) + # CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but # these do not work for older releases. Build uses a mix of Spack's compiler wrapper # and the actual compiler, so this is needed to get parts of the build working. diff --git a/stacks/ml-linux-x86_64-rocm/spack.yaml b/stacks/ml-linux-x86_64-rocm/spack.yaml index ffdcad7de2a..0a705ef6204 100644 --- a/stacks/ml-linux-x86_64-rocm/spack.yaml +++ b/stacks/ml-linux-x86_64-rocm/spack.yaml @@ -43,26 +43,25 @@ spack: # Keras - py-keras backend=tensorflow # - py-keras backend=jax - # - py-keras backend=torch + - py-keras backend=torch # PyTorch - # Does not yet support Spack-installed ROCm - # - py-botorch - # - py-gpytorch - # - py-kornia - # - py-lightning - # - py-pytorch-lightning - # - py-segmentation-models-pytorch - # - py-timm - # - py-torch - # - py-torch-geometric + - py-botorch + - py-gpytorch + - py-kornia + - py-lightning + - py-pytorch-lightning + - py-segmentation-models-pytorch + - py-timm + - py-torch + - py-torch-geometric # - py-torch-nvidia-apex # - py-torchaudio - # - py-torchdata - # - py-torchgeo - # - py-torchmetrics - # - py-torchvision - # - py-vector-quantize-pytorch + - py-torchdata + - py-torchgeo + - py-torchmetrics + - py-torchvision + - py-vector-quantize-pytorch # scikit-learn - py-scikit-learn