Skip to content
This repository has been archived by the owner on Jul 26, 2024. It is now read-only.

HIP version of kronmult #13

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 109 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,92 @@
cmake_minimum_required (VERSION 3.17 FATAL_ERROR)
cmake_minimum_required (VERSION 3.21 FATAL_ERROR)

#-------------------------------------------------------------------------------
# Setup languages to use. Only enable CUDA if GPU's are in use.
# Setup languages to use. Only enable HIP if GPU's are in use.
#-------------------------------------------------------------------------------
project (kronmult LANGUAGES CXX)

option (USE_GPU "Use CUDA for gpu support" OFF)
option (USE_GPU "Use HIP for gpu support" OFF)
set(GPU_ARCH "70" CACHE STRING "GPU architecture code for AMD/NVIDIA")
if (USE_GPU)
enable_language (CUDA)
set(KRONMULT_PLATFORM_AMD 0)
set(KRONMULT_PLATFORM_NVCC 0)

# search for HIP and libraries
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()

# set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack)
if (NOT DEFINED HIP_CLANG_PATH)
if(NOT DEFINED ENV{HIP_CLANG_PATH})
set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries")
else()
set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries")
endif()
endif()

# note: could probably grab this path directly using hipconfig?
if (NOT DEFINED HIP_CLANG_INCLUDE_PATH)
if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH})
# probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers
set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory")
else()
set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory")
endif()
endif()

# try to find hipconfig executable which can help detect platforms and include dirs
find_program(HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
if(HIPCONFIG_PATH)
execute_process(COMMAND ${HIPCONFIG_PATH} --platform OUTPUT_VARIABLE KRONMULT_HIP_PLATFORM)
elseif(DEFINED ENV{HIP_PLATFORM})
set(KRONMULT_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
else()
message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set")
endif()

message(STATUS "HIP platform has been detected as ${KRONMULT_HIP_PLATFORM}")
# hip >= 4.2 is now using "amd" to identify platform
if(KRONMULT_HIP_PLATFORM STREQUAL "hcc" OR KRONMULT_HIP_PLATFORM STREQUAL "amd")
set(KRONMULT_PLATFORM_AMD 1)
# hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia"
elseif(KRONMULT_HIP_PLATFORM STREQUAL "nvcc" OR KRONMULT_HIP_PLATFORM STREQUAL "nvidia")
set(KRONMULT_PLATFORM_NVCC 1)
endif()

set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP")

# look for HIP cmake configs in different locations
list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
set(CMAKE_HIP_ARCHITECTURES OFF)
find_package(HIP REQUIRED 4.0)
if(KRONMULT_PLATFORM_AMD)
enable_language (HIP)
set_target_properties(hip-lang::device PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${HIP_PATH}/include)
endif()

message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}")
message(STATUS "HIP COMPILER: ${HIP_COMPILER}")
message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}")
message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}")
message(STATUS "HIP Libraries: ${HIP_LIBRARIES}")

if(KRONMULT_PLATFORM_NVCC)
find_package(CUDA 9.0 REQUIRED)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a step back from enable_language (CUDA). Can we stay with first-class language support?

include_directories(${CUDA_INCLUDE_DIRS})
enable_language(CUDA)
set (CMAKE_CUDA_STANDARD 14)
set (CMAKE_CUDA_STANDARD_REQUIRED ON)
add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__)
else()
add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__)
endif()

endif ()

#-------------------------------------------------------------------------------
Expand All @@ -25,7 +104,7 @@
# Sanitizer options
#-------------------------------------------------------------------------------
set (CMAKE_CXX_FLAGS_SANITIZED -fno-omit-frame-pointer)
set (CMAKE_CUDA_FLAGS_SANITIZED -fno-omit-frame-pointer)
set (CMAKE_HIP_FLAGS_SANITIZED -fno-omit-frame-pointer)
set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "")
set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "")

Expand All @@ -36,7 +115,7 @@

if (${SANITIZE_${upper_name}})
set (CMAKE_CXX_FLAGS_SANITIZED "${CMAKE_CXX_FLAGS_SANITIZED} -fsanitize=${name}")
set (CMAKE_CUDA_FLAGS_SANITIZED "${CMAKE_CUDA_FLAGS_SANITIZED} -fsanitize=${name}")
set (CMAKE_HIP_FLAGS_SANITIZED "${CMAKE_HIP_FLAGS_SANITIZED} -fsanitize=${name}")
set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "${CMAKE_EXE_LINKER_FLAGS_SANITIZED} -fsanitize=${name}")
set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "${CMAKE_SHARED_LINKER_FLAGS_SANITIZED} -fsanitize=${name}")
endif ()
Expand All @@ -49,19 +128,32 @@
register_sanitizer_option (undefined ON)

#-------------------------------------------------------------------------------
# Define a macro function to set a targets source files to the CUDA language.
# Define a macro function to set a targets source files to the HIP language.
#
# Can't use $<$<COMPILE_LANGUAGE:LANG>:foo> generator expressions for
# target_compile_features
#
# Need to handle this with an explicit if statement.
#-------------------------------------------------------------------------------
macro (target_set_cuda target)
macro (target_set_hip target)
if (USE_GPU)
target_compile_features (${target} PUBLIC cuda_std_14)
get_target_property (SOURCES ${target} SOURCES)
set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA)
set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 60)
set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP)

if(KRONMULT_PLATFORM_NVCC)
target_compile_features (${target} PUBLIC cuda_std_14)
set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA)
set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES ${GPU_ARCH})
else()
set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES gfx${GPU_ARCH})
set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP)
#target_link_libraries(${target} PUBLIC hip-lang::device)
endif()
set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)

# HIP_INCLUDE_DIRS is not getting set for nvidia platforms
target_include_directories(${target} SYSTEM PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include)
endif ()
endmacro ()

Expand All @@ -76,7 +168,7 @@
add_library (kron SHARED)

# Note can't use generator expressions on the source files since it interfers
# with setting the source property to target CUDA.
# with setting the source property to target HIP.
target_sources (kron

PRIVATE
Expand Down Expand Up @@ -109,17 +201,18 @@
)

target_compile_features (kron PUBLIC cxx_std_17)
target_set_cuda (kron)
target_set_hip (kron)

target_compile_options (kron
PUBLIC
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic>
$<$<COMPILE_LANGUAGE:CUDA>:--compiler-options -fPIC --keep-device-functions>
$<$<BOOL:${KRONMULT_PLATFORM_NVCC}>:--compiler-options -fPIC --keep-device-functions>
$<$<BOOL:${KRONMULT_PLATFORM_AMD}>:--offload-arch=gfx${GPU_ARCH}>
)

target_compile_definitions (kron
PUBLIC
$<$<COMPILE_LANGUAGE:CUDA>:USE_GPU>
$<$<BOOL:${USE_GPU}>:USE_GPU>
)

if (OpenMP_CXX_FOUND)
Expand All @@ -140,7 +233,7 @@
PRIVATE
${source}
)
target_set_cuda (${target})
target_set_hip (${target})
target_link_libraries (${target} PUBLIC kron)

add_test (NAME ${target}
Expand Down
2 changes: 1 addition & 1 deletion kgemm_nn_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ void kgemm_nn_batched( int const mm, int const nn, int const kk,
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kgemm_nn_batched<double><<< batchCount, nthreads>>>( mm,nn,kk,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched<double>), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
alpha,
Aarray_, ldAarray_,
Barray_, ldBarray_,
Expand Down
2 changes: 1 addition & 1 deletion kgemm_nt_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void kgemm_nt_batched( int const mm, int const nn, int const kk,
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kgemm_nt_batched<double><<< batchCount, nthreads >>>( mm,nn,kk,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, mm,nn,kk,
alpha,
Aarray_, ldAarray_,
Barray_, ldBarray_,
Expand Down
3 changes: 1 addition & 2 deletions kroncommon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@


#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#define GLOBAL_FUNCTION __global__
#define SYNCTHREADS __syncthreads()
#define SHARED_MEMORY __shared__
Expand Down
2 changes: 1 addition & 1 deletion kronmult1_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult1_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult1_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult1_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult1_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult1_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult1_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult1_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult1_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult1_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult1_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
2 changes: 1 addition & 1 deletion kronmult2_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult2_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult2_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult2_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult2_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult2_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult2_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult2_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult2_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult2_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult2_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
2 changes: 1 addition & 1 deletion kronmult3_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult3_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult3_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult3_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult3_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult3_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult3_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult3_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult3_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult3_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult3_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
2 changes: 1 addition & 1 deletion kronmult4_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult4_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult4_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult4_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult4_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult4_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult4_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult4_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult4_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult4_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult4_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
2 changes: 1 addition & 1 deletion kronmult5_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult5_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult5_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult5_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult5_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult5_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult5_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult5_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult5_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult5_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult5_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
2 changes: 1 addition & 1 deletion kronmult6_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult6_batched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult6_batched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult6_batched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult6_pbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ void kronmult6_pbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult6_pbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, Xarray_, Yarray_, Warray_, batchCount);
#else
kronmult6_pbatched<double>( n,
Expand Down
2 changes: 1 addition & 1 deletion kronmult6_xbatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ void kronmult6_xbatched(
int constexpr nwarps = 8;
int constexpr nthreads = nwarps * warpsize;

kronmult6_xbatched<double><<< batchCount, nthreads >>>( n,
hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<double>), dim3(batchCount), dim3(nthreads ), 0, 0, n,
Aarray_, lda,
Xarray_, Yarray_, Warray_, batchCount);
#else
Expand Down
Loading