diff --git a/CMakeLists.txt b/CMakeLists.txt
index e67f30c..4fee9bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,13 +1,90 @@
-cmake_minimum_required (VERSION 3.17 FATAL_ERROR)
+cmake_minimum_required (VERSION 3.21 FATAL_ERROR)
 
 #-------------------------------------------------------------------------------
-# Setup languages to use. Only enable CUDA if GPU's are in use.
+# Setup languages to use. Only enable HIP if GPUs are in use.
 #-------------------------------------------------------------------------------
 project (kronmult LANGUAGES CXX)
 
-option (USE_GPU "Use CUDA for gpu support" OFF)
+option (USE_GPU "Use HIP for gpu support" OFF)
+set(GPU_ARCH "70" CACHE STRING "GPU architecture code for AMD/NVIDIA")
 
 if (USE_GPU)
-  enable_language (CUDA)
+  set(KRONMULT_PLATFORM_AMD 0)
+  set(KRONMULT_PLATFORM_NVCC 0)
+
+  # search for HIP and its libraries
+  if(NOT DEFINED HIP_PATH)
+    if(NOT DEFINED ENV{HIP_PATH})
+      set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
+    else()
+      set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
+    endif()
+  endif()
+
+  # set HIP_CLANG_PATH for installs in non-standard locations (such as rocm with spack)
+  if (NOT DEFINED HIP_CLANG_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_PATH})
+      set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries")
+    else()
+      set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries")
+    endif()
+  endif()
+
+  # note: could probably grab this path directly using hipconfig?
+  if (NOT DEFINED HIP_CLANG_INCLUDE_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH})
+      # probably need a better way to get the compiler version; this will produce non-existent paths for non-clang compilers
+      set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory")
+    else()
+      set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory")
+    endif()
+  endif()
+
+  # try to find the hipconfig executable, which can help detect the platform and include dirs
+  find_program(HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
+  if(HIPCONFIG_PATH)
+    execute_process(COMMAND ${HIPCONFIG_PATH} --platform OUTPUT_VARIABLE KRONMULT_HIP_PLATFORM)
+  elseif(DEFINED ENV{HIP_PLATFORM})
+    set(KRONMULT_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
+  else()
+    message(FATAL_ERROR "Could not determine the HIP platform; make sure HIP_PLATFORM is set")
+  endif()
+
+  message(STATUS "HIP platform has been detected as ${KRONMULT_HIP_PLATFORM}")
+  # hip >= 4.2 uses "amd" to identify the AMD platform; hip <= 4.1 used "hcc"
+  if(KRONMULT_HIP_PLATFORM STREQUAL "hcc" OR KRONMULT_HIP_PLATFORM STREQUAL "amd")
+    set(KRONMULT_PLATFORM_AMD 1)
+  # hip <= 4.1 uses "nvcc" to identify NVIDIA platforms; hip >= 4.2 uses "nvidia"
+  elseif(KRONMULT_HIP_PLATFORM STREQUAL "nvcc" OR KRONMULT_HIP_PLATFORM STREQUAL "nvidia")
+    set(KRONMULT_PLATFORM_NVCC 1)
+  endif()
+
+  set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP")
+
+  # look for the HIP cmake configs in their various locations
+  list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
+  list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
+  set(CMAKE_HIP_ARCHITECTURES OFF)
+  find_package(HIP 4.0 REQUIRED)
+  if(KRONMULT_PLATFORM_AMD)
+    enable_language (HIP)
+    set_target_properties(hip-lang::device PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${HIP_PATH}/include)
+  endif()
+
+  message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}")
PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + + if(KRONMULT_PLATFORM_NVCC) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + else() + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + endif() + endif () #------------------------------------------------------------------------------- @@ -25,7 +102,7 @@ # Sanitizer options #------------------------------------------------------------------------------- set (CMAKE_CXX_FLAGS_SANITIZED -fno-omit-frame-pointer) - set (CMAKE_CUDA_FLAGS_SANITIZED -fno-omit-frame-pointer) + set (CMAKE_HIP_FLAGS_SANITIZED -fno-omit-frame-pointer) set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "") @@ -36,7 +113,7 @@ if (${SANITIZE_${upper_name}}) set (CMAKE_CXX_FLAGS_SANITIZED "${CMAKE_CXX_FLAGS_SANITIZED} -fsanitize=${name}") - set (CMAKE_CUDA_FLAGS_SANITIZED "${CMAKE_CUDA_FLAGS_SANITIZED} -fsanitize=${name}") + set (CMAKE_HIP_FLAGS_SANITIZED "${CMAKE_HIP_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "${CMAKE_EXE_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "${CMAKE_SHARED_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") endif () @@ -49,19 +126,32 @@ register_sanitizer_option (undefined ON) #------------------------------------------------------------------------------- -# Define a macro function to set a targets source files to the CUDA language. +# Define a macro function to set a targets source files to the HIP language. # # Can't use $<$:foo> generator expressions for # target_compile_features # # Need to handle this with an explicit if statement. #------------------------------------------------------------------------------- - macro (target_set_cuda target) + macro (target_set_hip target) if (USE_GPU) - target_compile_features (${target} PUBLIC cuda_std_14) get_target_property (SOURCES ${target} SOURCES) - set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) - set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 60) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) + + if(KRONMULT_PLATFORM_NVCC) + target_compile_features (${target} PUBLIC cuda_std_14) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) + set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES ${GPU_ARCH}) + else() + set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES gfx${GPU_ARCH}) + set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + #target_link_libraries(${target} PUBLIC hip-lang::device) + endif() + set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # HIP_INCLUDE_DIRS is not getting set for nvidia platforms + target_include_directories(${target} SYSTEM PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () endmacro () @@ -76,7 +166,7 @@ add_library (kron SHARED) # Note can't use generator expressions on the source files since it interfers -# with setting the source property to target CUDA. +# with setting the source property to target HIP. 
 target_sources (kron
                 PRIVATE
@@ -109,17 +199,18 @@
 )
 
 target_compile_features (kron PUBLIC cxx_std_17)
-target_set_cuda (kron)
+target_set_hip (kron)
 
 target_compile_options (kron
   PUBLIC
   $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic>
-  $<$<COMPILE_LANGUAGE:CUDA>:--compiler-options -fPIC --keep-device-functions>
+  $<$<COMPILE_LANGUAGE:CUDA>:--compiler-options -fPIC --keep-device-functions>
+  $<$<COMPILE_LANGUAGE:HIP>:--offload-arch=gfx${GPU_ARCH}>
 )
 
 target_compile_definitions (kron
   PUBLIC
-  $<$<BOOL:${USE_GPU}>:USE_GPU>
+  $<$<BOOL:${USE_GPU}>:USE_GPU>
 )
 
 if (OpenMP_CXX_FOUND)
@@ -140,7 +231,7 @@
                   PRIVATE
                   ${source}
   )
-  target_set_cuda (${target})
+  target_set_hip (${target})
   target_link_libraries (${target} PUBLIC kron)
 
   add_test (NAME ${target}
diff --git a/kgemm_nn_batched.cpp b/kgemm_nn_batched.cpp
index 3ba54fd..96f2a08 100644
--- a/kgemm_nn_batched.cpp
+++ b/kgemm_nn_batched.cpp
@@ -20,7 +20,7 @@ void kgemm_nn_batched( int const mm, int const nn, int const kk,
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kgemm_nn_batched<<< batchCount, nthreads>>>( mm,nn,kk,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                            alpha,
                            Aarray_, ldAarray_,
                            Barray_, ldBarray_,
diff --git a/kgemm_nt_batched.cpp b/kgemm_nt_batched.cpp
index 62c443e..059360b 100644
--- a/kgemm_nt_batched.cpp
+++ b/kgemm_nt_batched.cpp
@@ -19,7 +19,7 @@ void kgemm_nt_batched( int const mm, int const nn, int const kk,
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                            alpha,
                            Aarray_, ldAarray_,
                            Barray_, ldBarray_,
diff --git a/kroncommon.hpp b/kroncommon.hpp
index 750bc57..a5c9df5 100644
--- a/kroncommon.hpp
+++ b/kroncommon.hpp
@@ -4,8 +4,7 @@
 
 #ifdef USE_GPU
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #define GLOBAL_FUNCTION __global__
 #define SYNCTHREADS __syncthreads()
 #define SHARED_MEMORY __shared__
diff --git a/kronmult1_batched.cpp b/kronmult1_batched.cpp
index edfb3e0..917172d 100644
--- a/kronmult1_batched.cpp
+++ b/kronmult1_batched.cpp
@@ -13,7 +13,7 @@ void kronmult1_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult1_batched( n,
diff --git a/kronmult1_pbatched.cpp b/kronmult1_pbatched.cpp
index afaa266..413191e 100644
--- a/kronmult1_pbatched.cpp
+++ b/kronmult1_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult1_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult1_pbatched( n,
diff --git a/kronmult1_xbatched.cpp b/kronmult1_xbatched.cpp
index 61d5314..14a698b 100644
--- a/kronmult1_xbatched.cpp
+++ b/kronmult1_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult1_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
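Every kernel-launch hunk in this patch follows the same mechanical rewrite: CUDA's kernel<<< grid, block >>>( args ) becomes hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(grid), dim3(block), shmemBytes, stream, args), where the two extra arguments (always 0, 0 here) are the dynamic shared-memory size and the stream. A minimal self-contained sketch of the mapping; the scale kernel is a hypothetical stand-in, not part of this patch:

    #include <hip/hip_runtime.h>

    // Toy kernel standing in for the batched kronmult kernels.
    __global__ void scale( double *x, double alpha, int n )
    {
        int const i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) x[i] *= alpha;
    }

    void launch_scale( double *x, double alpha, int n )
    {
        int const nthreads = 256;
        int const nblocks = (n + nthreads - 1) / nthreads;
        // CUDA: scale<<< nblocks, nthreads >>>( x, alpha, n );
        // HIP:  grid and block sizes become explicit dim3 arguments, followed
        //       by the dynamic shared-memory size (0) and the stream (0 = default).
        hipLaunchKernelGGL(HIP_KERNEL_NAME(scale), dim3(nblocks), dim3(nthreads),
                           0, 0, x, alpha, n);
    }

HIP_KERNEL_NAME is a no-op on the AMD platform; it exists so templated kernel names survive the nvcc code path unchanged.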
diff --git a/kronmult2_batched.cpp b/kronmult2_batched.cpp
index 54a4b24..72ca53c 100644
--- a/kronmult2_batched.cpp
+++ b/kronmult2_batched.cpp
@@ -13,7 +13,7 @@ void kronmult2_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult2_batched( n,
diff --git a/kronmult2_pbatched.cpp b/kronmult2_pbatched.cpp
index bcfe204..ace712e 100644
--- a/kronmult2_pbatched.cpp
+++ b/kronmult2_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult2_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult2_pbatched( n,
diff --git a/kronmult2_xbatched.cpp b/kronmult2_xbatched.cpp
index bee62d4..039f244 100644
--- a/kronmult2_xbatched.cpp
+++ b/kronmult2_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult2_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult3_batched.cpp b/kronmult3_batched.cpp
index 4a7e719..a69e2fb 100644
--- a/kronmult3_batched.cpp
+++ b/kronmult3_batched.cpp
@@ -13,7 +13,7 @@ void kronmult3_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult3_batched( n,
diff --git a/kronmult3_pbatched.cpp b/kronmult3_pbatched.cpp
index 822f822..706fbe3 100644
--- a/kronmult3_pbatched.cpp
+++ b/kronmult3_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult3_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult3_pbatched( n,
diff --git a/kronmult3_xbatched.cpp b/kronmult3_xbatched.cpp
index 4b609d5..6f8c785 100644
--- a/kronmult3_xbatched.cpp
+++ b/kronmult3_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult3_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult4_batched.cpp b/kronmult4_batched.cpp
index e9327ae..059b1a3 100644
--- a/kronmult4_batched.cpp
+++ b/kronmult4_batched.cpp
@@ -13,7 +13,7 @@ void kronmult4_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult4_batched( n,
diff --git a/kronmult4_pbatched.cpp b/kronmult4_pbatched.cpp
index fae2eea..0ce2b19 100644
--- a/kronmult4_pbatched.cpp
+++ b/kronmult4_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult4_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult4_pbatched( n,
diff --git a/kronmult4_xbatched.cpp b/kronmult4_xbatched.cpp
index 4148c1a..6c891c5 100644
--- a/kronmult4_xbatched.cpp
+++ b/kronmult4_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult4_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult5_batched.cpp b/kronmult5_batched.cpp
index 417833d..1889aa1 100644
--- a/kronmult5_batched.cpp
+++ b/kronmult5_batched.cpp
@@ -13,7 +13,7 @@ void kronmult5_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult5_batched( n,
diff --git a/kronmult5_pbatched.cpp b/kronmult5_pbatched.cpp
index 5398ff6..ceeff45 100644
--- a/kronmult5_pbatched.cpp
+++ b/kronmult5_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult5_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult5_pbatched( n,
diff --git a/kronmult5_xbatched.cpp b/kronmult5_xbatched.cpp
index f3e14da..23495b4 100644
--- a/kronmult5_xbatched.cpp
+++ b/kronmult5_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult5_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult6_batched.cpp b/kronmult6_batched.cpp
index 7b3a624..f4f0454 100644
--- a/kronmult6_batched.cpp
+++ b/kronmult6_batched.cpp
@@ -13,7 +13,7 @@ void kronmult6_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult6_batched( n,
diff --git a/kronmult6_pbatched.cpp b/kronmult6_pbatched.cpp
index 30e29fd..7ceed69 100644
--- a/kronmult6_pbatched.cpp
+++ b/kronmult6_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult6_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult6_pbatched( n,
diff --git a/kronmult6_xbatched.cpp b/kronmult6_xbatched.cpp
index a5ab2f9..66d8825 100644
--- a/kronmult6_xbatched.cpp
+++ b/kronmult6_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult6_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
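hipLaunchKernelGGL returns void, so the test drivers below detect failures the same way the CUDA versions did: they bracket each launch with hipDeviceSynchronize() and compare the returned status against hipSuccess. A bad launch configuration can additionally be picked up through hipGetLastError(). A minimal sketch of that checking pattern; sync_and_check is a hypothetical helper, not part of this patch:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Report both launch errors (hipGetLastError) and errors raised while
    // the kernel was executing (hipDeviceSynchronize).
    inline bool sync_and_check()
    {
        hipError_t const launch = hipGetLastError();
        hipError_t const sync = hipDeviceSynchronize();
        if ((launch != hipSuccess) || (sync != hipSuccess)) {
            std::fprintf( stderr, "HIP error: %s\n",
                hipGetErrorString( (launch != hipSuccess) ? launch : sync ) );
            return false;
        }
        return true;
    }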
diff --git a/test_kgemm_nn_batched.cpp b/test_kgemm_nn_batched.cpp
index e83a728..0cfd5be 100644
--- a/test_kgemm_nn_batched.cpp
+++ b/test_kgemm_nn_batched.cpp
@@ -5,7 +5,7 @@
 #include
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -18,11 +18,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -32,11 +32,11 @@ static inline
 void gpu2host( void * dest, void * src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -47,8 +47,8 @@ static inline
 void *myalloc( size_t const nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -59,8 +59,8 @@ void *myalloc( size_t const nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -328,11 +328,11 @@ T test_kgemm_nn_batched( int const mm,
 
      int const nwarps = min( min(32,mm), min(nn,kk));
      int const nthreads = nwarps * warpsize;
 
-     cudaError_t istat_sync_start = cudaDeviceSynchronize();
-     expect( istat_sync_start == cudaSuccess );
+     hipError_t istat_sync_start = hipDeviceSynchronize();
+     expect( istat_sync_start == hipSuccess );
 
-     kgemm_nn_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                 alpha,
                 ddAarray_, dldAarray_,
                 ddBarray_, dldBarray_,
@@ -340,8 +340,8 @@ T test_kgemm_nn_batched( int const mm,
                 ddCarray_, dldCarray_,
                 batchCount);
 
-     cudaError_t istat_sync_end = cudaDeviceSynchronize();
-     expect( istat_sync_end == cudaSuccess );
+     hipError_t istat_sync_end = hipDeviceSynchronize();
+     expect( istat_sync_end == hipSuccess );
     }
 #else
     {
diff --git a/test_kgemm_nt_batched.cpp b/test_kgemm_nt_batched.cpp
index 58c68e5..a126b8d 100644
--- a/test_kgemm_nt_batched.cpp
+++ b/test_kgemm_nt_batched.cpp
@@ -6,7 +6,7 @@
 #include
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -19,11 +19,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -33,11 +33,11 @@ static inline
 void gpu2host( void * dest, void * src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -48,8 +48,8 @@ static inline
 void *myalloc( size_t const nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -60,8 +60,8 @@ void *myalloc( size_t const nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -327,11 +327,11 @@ T test_kgemm_nt_batched( int const mm,
 
      int const nwarps = min( min(32,mm), min(nn,kk));
      int const nthreads = nwarps * warpsize;
 
-     cudaError_t istat_sync_start = cudaDeviceSynchronize();
-     expect( istat_sync_start == cudaSuccess );
+     hipError_t istat_sync_start = hipDeviceSynchronize();
+     expect( istat_sync_start == hipSuccess );
 
-     kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                 alpha,
                 ddAarray_, dldAarray_,
                 ddBarray_, dldBarray_,
@@ -339,8 +339,8 @@ T test_kgemm_nt_batched( int const mm,
                 ddCarray_, dldCarray_,
                 batchCount);
 
-     cudaError_t istat_sync_end = cudaDeviceSynchronize();
-     expect( istat_sync_end == cudaSuccess );
+     hipError_t istat_sync_end = hipDeviceSynchronize();
+     expect( istat_sync_end == hipSuccess );
     }
 #else
     {
diff --git a/test_kronmult6_batched.cpp b/test_kronmult6_batched.cpp
index 6a100b4..609ce79 100644
--- a/test_kronmult6_batched.cpp
+++ b/test_kronmult6_batched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -195,42 +195,42 @@ T test_kronmult_batched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_batched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_batched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_batched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_batched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_batched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_batched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
@@ -244,8 +244,8 @@ T test_kronmult_batched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
diff --git a/test_kronmult6_pbatched.cpp b/test_kronmult6_pbatched.cpp
index f8bfc1d..62d41c4 100644
--- a/test_kronmult6_pbatched.cpp
+++ b/test_kronmult6_pbatched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -269,42 +269,42 @@ T test_kronmult_pbatched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_pbatched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_pbatched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_pbatched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_pbatched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_pbatched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_pbatched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
@@ -318,8 +318,8 @@ T test_kronmult_pbatched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
diff --git a/test_kronmult6_xbatched.cpp b/test_kronmult6_xbatched.cpp
index 90a938a..7534b59 100644
--- a/test_kronmult6_xbatched.cpp
+++ b/test_kronmult6_xbatched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -302,42 +302,42 @@ T test_kronmult_xbatched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_xbatched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_xbatched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_xbatched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_xbatched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_xbatched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_xbatched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
@@ -351,8 +351,8 @@ T test_kronmult_xbatched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
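Throughout these tests, expect() is a project-local check whose definition lies outside this diff. A plausible minimal form with assert-like semantics; this sketch is hypothetical and the repository's actual macro may differ:

    #include <cstdio>
    #include <cstdlib>

    // Abort with location information when a checked condition is false,
    // e.g. expect( istat == hipSuccess );
    #define expect( cond )                                              \
        do {                                                            \
            if (!(cond)) {                                              \
                std::fprintf( stderr, "%s:%d: expect(%s) failed\n",     \
                              __FILE__, __LINE__, #cond );              \
                std::abort();                                           \
            }                                                           \
        } while (0)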