Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .ci/gitlab/configs/linux/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ ci:
- wrf
build-job:
tags: [ "spack", "huge" ]
# Allow up to 24h for long ROCm/ML rebuilds; the GitLab project and runner maximum timeouts must be at least this high
timeout: 1440 minutes
variables:
CI_JOB_SIZE: huge
SPACK_BUILD_JOBS: "12"
Expand Down
4 changes: 2 additions & 2 deletions repos/spack_repo/builtin/packages/aotriton/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def patch(self):
string=True,
)

if self.spec.satisfies("@:0.9"):
if self.spec.satisfies("@:0.9b"):
filter_file(
r"LLVM_INCLUDE_DIRS",
f"{self.spec['aotriton-llvm'].prefix}/include",
Expand All @@ -103,7 +103,7 @@ def patch(self):
"third_party/triton/python/setup.py",
string=True,
)
if self.spec.satisfies("@0.10:"):
if self.spec.satisfies("@0.10b:"):
filter_file(
r"LLVM_INCLUDE_DIRS",
f"{self.spec['aotriton-llvm'].prefix}/include",
Expand Down
2 changes: 1 addition & 1 deletion repos/spack_repo/builtin/packages/hwloc/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class Hwloc(AutotoolsPackage, CudaPackage, ROCmPackage):
depends_on("mpi", when="+netloc")

with when("+rocm"):
depends_on("rocm-smi-lib")
depends_on("rocm-smi-lib@7.0:")
depends_on("rocm-opencl", when="+opencl")
# Avoid a circular dependency since the openmp
# variant of llvm-amdgpu depends on hwloc.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,18 @@ index 9be7f37..39d0f24 100644
endif()

diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 1c0d3a2..e0de4b1 100644
index 1c0d3a2..83f9f9d 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -167,6 +167,10 @@ if(HIP_FOUND)
@@ -167,6 +167,11 @@ if(HIP_FOUND)
find_package_and_print_version(hipsolver REQUIRED)
find_package_and_print_version(hiprtc REQUIRED)

+ list(APPEND ROCM_INCLUDE ${rocthrust_INCLUDE_DIR})
+ list(APPEND ROCM_INCLUDE ${rocprim_INCLUDE_DIR})
+ list(APPEND ROCM_INCLUDE ${hipcub_INCLUDE_DIR})
+ list(APPEND ROCM_INCLUDE ${rocRAND_INCLUDE_DIR})
+ list(APPEND ROCM_INCLUDE $ENV{AOTRITON_INSTALLED_PREFIX}/include)

find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib)
# TODO: miopen_LIBRARIES should return fullpath to the library file,
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index d2d23b7..620a89f 100644
index d2d23b7ab65..620a89f65cb 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1379,13 +1379,6 @@ if(USE_ROCM)
Expand All @@ -26,7 +26,7 @@ index d2d23b7..620a89f 100644
endif()

diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 58c74dd..d3e1ad4 100644
index 58c74ddda35..54f96871372 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -26,12 +26,6 @@ else()
Expand Down Expand Up @@ -78,7 +78,15 @@ index 58c74dd..d3e1ad4 100644
find_package_and_print_version(amd_comgr REQUIRED)
find_package_and_print_version(rocrand REQUIRED)
find_package_and_print_version(hiprand REQUIRED)
@@ -171,7 +168,11 @@ if(HIP_FOUND)
@@ -157,6 +154,7 @@ if(HIP_FOUND)
find_package_and_print_version(hipcub REQUIRED)
find_package_and_print_version(rocthrust REQUIRED)
find_package_and_print_version(hipsolver REQUIRED)
+ list(APPEND ROCM_INCLUDE_DIRS $ENV{AOTRITON_INSTALLED_PREFIX}/include)
# workaround cmake 4 build issue
if(CMAKE_VERSION VERSION_GREATER_EQUAL "4.0.0")
message(WARNING "Work around hiprtc cmake failure for cmake >= 4")
@@ -171,7 +169,11 @@ if(HIP_FOUND)
if(UNIX)
find_package_and_print_version(rccl)
find_package_and_print_version(hsa-runtime64 REQUIRED)
Expand Down
24 changes: 19 additions & 5 deletions repos/spack_repo/builtin/packages/py_torch/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage):
conflicts("+gloo+rocm")
conflicts("+rocm", when="@2.3", msg="Rocm doesn't support py-torch 2.3 release")
conflicts("+rocm", when="@2.4", msg="Rocm doesn't support py-torch 2.4 release")
conflicts("+rocm", when="@2.8", msg="Rocm doesn't support py-torch 2.8 release")
conflicts("+tensorpipe", when="+rocm ^hip@:5.1", msg="TensorPipe not supported until ROCm 5.2")
conflicts("+breakpad", when="target=ppc64:")
conflicts("+breakpad", when="target=ppc64le:")
Expand Down Expand Up @@ -305,7 +306,8 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage):
depends_on("valgrind", when="+valgrind")
with when("+rocm"):
depends_on("hsa-rocr-dev")
depends_on("hip")
depends_on("hip@7.0:", when="@2.9:")
depends_on("hip@:6.4", when="@:2.7")
depends_on("rccl", when="+nccl")
depends_on("rocprim")
depends_on("hipcub")
Expand All @@ -320,11 +322,15 @@ class PyTorch(PythonPackage, CudaPackage, ROCmPackage):
depends_on("rocfft")
depends_on("rocblas")
depends_on("miopen-hip")
depends_on("composable-kernel")
depends_on("hipblaslt")
# py-torch 2.9 and newer require hipblaslt 7.0 or newer
depends_on("hipblaslt@7.0:", when="@2.9:")
depends_on("rocminfo")
depends_on("aotriton@0.8.1b", when="@2.5:2.6")
depends_on("aotriton@0.9.1b", when="@2.7:")
depends_on("composable-kernel@:6.3.2", when="@2.5")
depends_on("composable-kernel@6.3.2:", when="@2.6:")
depends_on("hipsparselt@7.0:", when="@2.9:")
depends_on("aotriton@0.8b", when="@2.5:2.6")
depends_on("aotriton@0.9.2b", when="@2.7")
depends_on("aotriton@0.10b", when="@2.8:")
depends_on("mpi", when="+mpi")
depends_on("ucc", when="+ucc")
depends_on("ucx", when="+ucc")
Expand Down Expand Up @@ -568,6 +574,14 @@ def patch(self):
"torch_global_deps PROPERTIES LINKER_LANGUAGE CXX",
"caffe2/CMakeLists.txt",
)
if self.spec.satisfies("@2.5:+rocm"):
filter_file(
"find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)",
"find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)\n"
"set(ROCTRACER_INCLUDE_DIR $ENV{ROCTRACER_INCLUDE_DIR})",
"cmake/public/LoadHIP.cmake",
string=True,
)
if self.spec.satisfies("@2.1:2.7+rocm"):
filter_file(
"${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h",
Expand Down
38 changes: 38 additions & 0 deletions repos/spack_repo/builtin/packages/py_torchvision/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import os

from spack_repo.builtin.build_systems.python import PythonPackage

from spack.error import NoLibrariesError
from spack.package import *


Expand Down Expand Up @@ -198,6 +200,42 @@ def setup_build_environment(self, env: EnvironmentModifications) -> None:
include.extend(query.headers.directories)
library.extend(query.libs.directories)

# PyTorch's headers pull in headers from rocthrust, rocprim, hipsparse, hipblas,
# hipblas-common, hipblaslt and hipsolver; when building with ROCm we need these in the
# include path (py-torch depends on them, but they are not direct link deps of
# torchvision). Only add paths for packages that are in the spec, to avoid a KeyError.
if "^py-torch+rocm" in self.spec:
rocm_include_pkgs = [
"rocthrust",
"rocprim",
"hipsparse",
"hipblas",
"hipblas-common",
"hipblaslt",
"hipsolver",
]
for pkg in rocm_include_pkgs:
if pkg in self.spec:
include.extend(self.spec[pkg].headers.directories)

# At build time, torchvision's setup imports torch; libtorch_hip.so then
# needs aotriton and hip libs at runtime. Add their lib dirs so the loader
# can resolve undefined symbols (e.g. aotriton::v2::flash::attn_bwd_fused).
for pkg in ["aotriton", "hip"]:
if pkg not in self.spec:
continue
try:
for lib_dir in self.spec[pkg].libs.directories:
env.prepend_path("LD_LIBRARY_PATH", lib_dir)
except NoLibrariesError:
# Package may not declare 'libraries' (e.g. aotriton), so Spack
# cannot recursively locate libs. Add prefix lib dirs when they
# exist so the loader can find .so files (lib, lib64, or both).
for sub in ("lib", "lib64"):
lib_dir = os.path.join(self.spec[pkg].prefix, sub)
if os.path.isdir(lib_dir):
env.prepend_path("LD_LIBRARY_PATH", lib_dir)

# CONTRIBUTING.md says to use TORCHVISION_INCLUDE and TORCHVISION_LIBRARY, but
# these do not work for older releases. Build uses a mix of Spack's compiler wrapper
# and the actual compiler, so this is needed to get parts of the build working.
Expand Down
31 changes: 15 additions & 16 deletions stacks/ml-linux-x86_64-rocm/spack.yaml
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alalazo this will make your #3175 a bit simpler

Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,25 @@ spack:
# Keras
- py-keras backend=tensorflow
# - py-keras backend=jax
# - py-keras backend=torch
- py-keras backend=torch

# PyTorch
# Does not yet support Spack-installed ROCm
# - py-botorch
# - py-gpytorch
# - py-kornia
# - py-lightning
# - py-pytorch-lightning
# - py-segmentation-models-pytorch
# - py-timm
# - py-torch
# - py-torch-geometric
- py-botorch
- py-gpytorch
- py-kornia
- py-lightning
- py-pytorch-lightning
- py-segmentation-models-pytorch
- py-timm
- py-torch
- py-torch-geometric
# - py-torch-nvidia-apex
# - py-torchaudio
# - py-torchdata
# - py-torchgeo
# - py-torchmetrics
# - py-torchvision
# - py-vector-quantize-pytorch
- py-torchdata
- py-torchgeo
- py-torchmetrics
- py-torchvision
- py-vector-quantize-pytorch

# scikit-learn
- py-scikit-learn
Expand Down
Loading