From edf5be33c8de8a08a7f3747e7cd1218e72a6fe91 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 9 Aug 2024 16:08:08 -0400 Subject: [PATCH] Use the RoCM/HIP device to accelerate certain DPLASMA kernels (#57) * configure: Add the --with-hip option Signed-off-by: Aurelien Bouteiller * hip: Configury Signed-off-by: Aurelien Bouteiller * hip: kernel typedefs Signed-off-by: Aurelien Bouteiller * hip: update for hip-enabled parsec Signed-off-by: Aurelien Bouteiller * hip: detect hipblas and rocsolvers Signed-off-by: Aurelien Bouteiller Conflicts: src/CMakeLists.txt * hip: precision generator rules Signed-off-by: Aurelien Bouteiller * hip: cleanup unused dyld hipblas functions Signed-off-by: Aurelien Bouteiller * hip: Update lapack stagein Signed-off-by: Aurelien Bouteiller * Update for feature/common_gpu parsec branch changes Signed-off-by: Aurelien Bouteiller * Some conflicting updates between hip and common_gpu need more resolution Signed-off-by: Aurelien Bouteiller * hip: stream info registration * hip: potrf on AMD Signed-off-by: Aurelien Bouteiller * hip:po: Some errors introduced when merging Signed-off-by: Aurelien Bouteiller * Add HIP to the lookahead gpu gemm Signed-off-by: Aurelien Bouteiller * Add HIP to zgemm_summa Signed-off-by: Aurelien Bouteiller * hip: rework of PO and workspaces Signed-off-by: Aurelien Bouteiller * hip: remove unecessary hiblas init calls Signed-off-by: Aurelien Bouteiller * hip:po:errors in ldam asserts Signed-off-by: Aurelien Bouteiller * hip:po: some of the changes had broken cusolver Signed-off-by: Aurelien Bouteiller * fix printlogcuda/hip Signed-off-by: Aurelien Bouteiller * Auto-generate hip stage-in/out functions Use proper error checks instead of asserts * hip:zgemm_gpu: don't use hipComplex * Return the proper PARSEC_HOOK_RETURN_ERROR in GPU error cases Signed-off-by: Aurelien Bouteiller * Update for the new device mask for incarnations Signed-off-by: Aurelien Bouteiller * So far only NN gemm can run with HIP Signed-off-by: Aurelien Bouteiller * Use the correct DPLASMA_HAVE_HIP Signed-off-by: Aurelien Bouteiller * Remove weight properties from HIP bodies Signed-off-by: Aurelien Bouteiller * Reorder and uniformize cuda and hip bodies Signed-off-by: Aurelien Bouteiller * A PARSEC_HAVE_HIP was still present Signed-off-by: Aurelien Bouteiller * Rework zpotrf_U Signed-off-by: Aurelien Bouteiller * hip: add NT/TN/TT cases to gemm_summa Signed-off-by: Aurelien Bouteiller * Update parsec to a version that works with GPUs Signed-off-by: Aurelien Bouteiller * zpotrf_wrapper: uid and handles don't exist when not using a GPU device Signed-off-by: Aurelien Bouteiller * Update dtd for hip/cuda specializations for the dtd workspaces Signed-off-by: Aurelien Bouteiller * Make all gemm_summa the same between hip/cuda Signed-off-by: Aurelien Bouteiller * Use the same controls as parsec for GPU_WITH_CUDA/HIP * hip: merge error: the device count must be updated in both hip and cuda builds * hip: printlog hipblascomplex not compatible with creal * hip: final cleanup --------- Signed-off-by: Aurelien Bouteiller --- CMakeLists.txt | 26 ++- configure | 20 +- share/help-dplasma.txt | 11 +- src/CMakeLists.txt | 25 ++- src/cuda/README.md | 3 + src/cuda/lapack_cuda_stage_in.c | 165 ++++++++++++++++ src/dplasmaaux.c | 3 +- src/dplasmaaux.h | 6 +- src/dplasmaaux_cuda.c | 21 +- src/dplasmaaux_cuda.h | 25 ++- src/dplasmaaux_hip.c | 78 ++++++++ src/dplasmaaux_hip.h | 86 ++++++++ src/dplasmajdf.h | 6 +- src/dplasmajdf_lapack_dtt.h | 184 ++++-------------- src/dtd_wrappers/dplasma_z_dtd.h | 4 +- src/dtd_wrappers/zgemm.c | 9 +- src/dtd_wrappers/zherk.c | 9 +- src/dtd_wrappers/zpotrf.c | 19 +- src/dtd_wrappers/ztrsm.c | 9 +- src/include/dplasma/config.h.in | 1 + ..._cublas_utils.h => potrf_gpu_workspaces.h} | 12 +- src/zgemm_NN.jdf | 11 +- src/zgemm_NN_gpu.jdf | 79 +++++++- src/zgemm_NN_summa.jdf | 76 +++++++- src/zgemm_NT.jdf | 15 +- src/zgemm_NT_summa.jdf | 76 +++++++- src/zgemm_TN.jdf | 15 +- src/zgemm_TN_summa.jdf | 76 +++++++- src/zgemm_TT.jdf | 15 +- src/zgemm_TT_summa.jdf | 76 +++++++- src/zgemm_wrapper.c | 73 +++++-- src/zgeqrf.jdf | 4 +- src/zgetrf_nopiv.jdf | 14 +- src/zpoinv_L.jdf | 9 +- src/zpoinv_U.jdf | 9 +- src/zpotrf_L.jdf | 150 +++++++++++--- src/zpotrf_U.jdf | 145 ++++++++++++-- src/zpotrf_wrapper.c | 80 ++++++-- src/ztrsm_LLN.jdf | 6 +- src/ztrsm_LLT.jdf | 6 +- src/ztrsm_LUN.jdf | 6 +- src/ztrsm_LUT.jdf | 6 +- src/ztrsm_RLN.jdf | 6 +- src/ztrsm_RLT.jdf | 6 +- src/ztrsm_RUN.jdf | 6 +- src/ztrsm_RUT.jdf | 6 +- tests/Testings.cmake | 41 ++-- tests/common.c | 57 ++++-- tests/testing_zgemm_dtd.c | 2 +- tests/testing_zpotrf_dtd.c | 8 +- tools/PrecisionGenerator/subs.py | 10 +- 51 files changed, 1373 insertions(+), 438 deletions(-) create mode 100644 src/cuda/README.md create mode 100644 src/cuda/lapack_cuda_stage_in.c create mode 100644 src/dplasmaaux_hip.c create mode 100644 src/dplasmaaux_hip.h rename src/{potrf_cublas_utils.h => potrf_gpu_workspaces.h} (66%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 606c3089..56ac451b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,10 @@ set(DPLASMA_VERSION "${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR}") ############################################################################ # CMake Policies Tuning +if(POLICY CMP0144) + # CMP0144: find_package uses upper-case _ROOT variables in addition to _ROOT + cmake_policy(SET CMP0144 NEW) +endif(POLICY CMP0144) set(CMAKE_NO_SYSTEM_FROM_IMPORTED True) ############################################################################ @@ -231,12 +235,30 @@ endif(NOT TARGET PaRSEC::parsec AND NOT TARGET PaRSEC::parsec_ptgpp) ############################################################################ # Resume configuring dplasma -option(DPLASMA_HAVE_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA}) -if(DPLASMA_HAVE_CUDA) +option(DPLASMA_GPU_WITH_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA}) +if(DPLASMA_GPU_WITH_CUDA) + if(NOT PARSEC_HAVE_CUDA) + message(FATAL_ERROR "CUDA support for DPLASMA requested, but detected PaRSEC does not support it") + endif() message(STATUS "CUDA support for DPLASMA enabled") if(NOT TARGET CUDA::cusolver) find_package(CUDAToolkit REQUIRED) endif(NOT TARGET CUDA::cusolver) + set(DPLASMA_HAVE_CUDA ${PARSEC_HAVE_CUDA} CACHE BOOL "True if DPLASMA provide support for CUDA") +endif() +option(DPLASMA_GPU_WITH_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP}) +if(DPLASMA_GPU_WITH_HIP) + if(NOT PARSEC_HAVE_HIP) + message(FATAL_ERROR "HIP support for DPLASMA requested, but detected PaRSEC does not support it") + endif() + message(STATUS "HIP support for DPLASMA enabled") + # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents + set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH}) + list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm) + find_package(hipblas REQUIRED) + find_package(rocsolver REQUIRED) + set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save}) + set(DPLASMA_HAVE_HIP ${PARSEC_HAVE_HIP} CACHE BOOL "True if DPLASMA provide support for HIP") endif() ############################################################################ diff --git a/configure b/configure index dfeae934..c31a719c 100755 --- a/configure +++ b/configure @@ -132,6 +132,9 @@ cat <&2 "Python is required. Please provide a path to the python executable."; exit 3;; diff --git a/share/help-dplasma.txt b/share/help-dplasma.txt index 841d89aa..18ed9187 100644 --- a/share/help-dplasma.txt +++ b/share/help-dplasma.txt @@ -1,8 +1,9 @@ -[cu*_alloc_failed] -There was not enough memory available on a CUDA device +[gpu_alloc_failed] +There was not enough memory available on a GPU device while trying to allocate a %s handle to manage tasks on -this device, or another CUDA device on the node. The +this device, or another GPU device on the node. The PaRSEC runtime system may be configured to reserve too -much memory on CUDA devices. Try reducing the amount of +much memory on GPU devices. Try reducing the amount of reserved memory by setting the PaRSEC MCA parameter -'device_cuda_memory_use' to a lower value. +'device_cuda_memory_use' (or similar for the type of +device) to a lower value. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 42796dde..164f8cd0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -28,8 +28,27 @@ if( NOT DPLASMA_HAVE_COMPLEX_H ) list(APPEND EXTRA_SOURCES complex.c) endif() if( DPLASMA_HAVE_CUDA ) - list(APPEND EXTRA_SOURCES dplasmaaux_cuda.c) + list(APPEND EXTRA_SOURCES dplasmaaux_cuda.c cuda/lapack_cuda_stage_in.c) endif() +if( DPLASMA_HAVE_HIP ) + list(APPEND EXTRA_SOURCES dplasmaaux_hip.c) + FILE(GLOB cuda_sources cuda/[^\\.]*.[ch]) + find_package(Perl REQUIRED) + find_program(HIPIFY_PERL_COMMAND NAMES hipify-perl HINTS ${HIP_BIN_INSTALL_DIR} REQUIRED) + foreach(cuda_file ${cuda_sources}) + file(RELATIVE_PATH cuda_filename ${CMAKE_CURRENT_SOURCE_DIR}/cuda ${cuda_file}) + string(REPLACE cuda hip hip_file ${cuda_filename}) + string(PREPEND hip_file "${CMAKE_CURRENT_BINARY_DIR}/hip/") + add_custom_command(OUTPUT ${hip_file} + DEPENDS ${cuda_file} # do not use MAIN_DEPENDENCY, that overides the default .c.o rule + COMMAND ${CMAKE_COMMAND} -E copy "${cuda_file}" "${hip_file}.prehip" + COMMAND ${PERL_EXECUTABLE} ${HIPIFY_PERL_COMMAND} --inplace --print-stats "${hip_file}" + COMMAND ${PERL_EXECUTABLE} -i -pe "s{(cuda)}{ substr uc hip | (uc \$1 ^ \$1), 0, 3 }egi" "${hip_file}" VERBATIM) # Convert all remaining cuda/CUDA + if(${hip_file} MATCHES [^\\.]*.c) # do not add .h to sources + list(APPEND EXTRA_SOURCES ${hip_file}) + endif() + endforeach() +endif( DPLASMA_HAVE_HIP ) ### Generate .c files from .jdf for all required precisions set(JDF @@ -236,7 +255,9 @@ target_link_libraries(dplasma PaRSEC::parsec LAPACKE::LAPACKE $<$:CUDA::cublas> - $<$:CUDA::cusolver>) + $<$:CUDA::cusolver> + $<$:roc::hipblas> + $<$:roc::rocsolver>) set_target_properties(dplasma PROPERTIES VERSION ${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR} SOVERSION ${DPLASMA_VERSION_MAJOR}) diff --git a/src/cuda/README.md b/src/cuda/README.md new file mode 100644 index 00000000..1b5a22c5 --- /dev/null +++ b/src/cuda/README.md @@ -0,0 +1,3 @@ +This directory contains files that are automatically converted from CUDA to HIP using Hipify. +If your file is not automatically convertible, put it somewhere else. + diff --git a/src/cuda/lapack_cuda_stage_in.c b/src/cuda/lapack_cuda_stage_in.c new file mode 100644 index 00000000..7261342a --- /dev/null +++ b/src/cuda/lapack_cuda_stage_in.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2020-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * $COPYRIGHT + * + */ + +#include "dplasma.h" +#include "dplasmajdf_lapack_dtt.h" + +#if defined(DPLASMA_HAVE_CUDA) +#include +#include + +/* Use cudaMemcpy2DAsync or loop with cudaMemcpyAsync for data transfers to device */ +#define USE_COPY_2D + +int +dplasma_cuda_lapack_stage_in(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) +{ + cudaError_t ret; + parsec_data_copy_t * copy_in; + parsec_data_copy_t * copy_out; + parsec_device_gpu_module_t *in_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + dplasma_data_collection_t * ddc; + parsec_task_t *task = gtask->ec; + int elem_sz; + int i; + for(i = 0; i < task->task_class->nb_flows; i++){ + if(flow_mask & (1U << i)){ + copy_in = task->data[i].data_in; + copy_out = task->data[i].data_out; + ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; + assert(ddc != NULL); + elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); + in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); + if( (in_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, + copy_in->device_private, + gtask->flow_nb_elts[i], + (in_elem_dev->super.type != PARSEC_DEV_CUDA)? + cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + }else{ + +#ifdef USE_COPY_2D + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); + size_t dpitch = ddc->dc_original->mb * elem_sz; + size_t spitch = ldd * elem_sz; + size_t width = nrows * elem_sz; + size_t height = ncols; + /* copy width bytes heigth times, skipping pitch - width bytes every time */ + ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, + dpitch, /*dst pitch bytes*/ + copy_in->device_private, + spitch, /*src pitch bytes*/ + width, height, + cudaMemcpyHostToDevice, + cuda_stream->cuda_stream ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); + + +#else + + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); + + int j; + for(j=0; jdevice_private) + j * ldd * elem_sz; + char*dst = ((char*)copy_out->device_private) + j * ddc->dc_original->mb * elem_sz; + ret = cudaMemcpyAsync(dst, + src, + nrows * elem_sz, + cudaMemcpyHostToDevice, + cuda_stream->cuda_stream ); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + + } +#endif + + + } + } + } + return PARSEC_SUCCESS; +} + +int +dplasma_cuda_lapack_stage_out(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) +{ + cudaError_t ret; + parsec_data_copy_t * copy_in; + parsec_data_copy_t * copy_out; + parsec_device_gpu_module_t *out_elem_dev; + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; + parsec_task_t *task = gtask->ec; + dplasma_data_collection_t * ddc; + int elem_sz; + int i; + for(i = 0; i < task->task_class->nb_flows; i++){ + if(flow_mask & (1U << i)){ + copy_in = task->data[i].data_out; + copy_out = copy_in->original->device_copies[0]; + ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; + assert(ddc != NULL); + elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); + out_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_out->device_index); + + if( (out_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ + ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, + copy_in->device_private, + gtask->flow_nb_elts[i], + out_elem_dev->super.type != PARSEC_DEV_CUDA ? + cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + }else{ + +#ifdef USE_COPY_2D + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); + size_t dpitch = ldd * elem_sz; + size_t spitch = ddc->dc_original->mb * elem_sz; + size_t width = nrows * elem_sz; + size_t height = ncols; + /* copy width bytes heigth times, skipping pitch - width bytes every time */ + ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, + dpitch, /*dst pitch bytes*/ + copy_in->device_private, + spitch, /*src pitch bytes*/ + width, height, + cudaMemcpyDeviceToHost, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); +#else + int ldd, nrows, ncols; + ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); + int j; + for(j=0; jdevice_private) + j * ddc->dc_original->mb * elem_sz; + char*dst = ((char*)copy_out->device_private) + j * ldd * elem_sz; + ret = cudaMemcpyAsync(dst, + src, + nrows * elem_sz, + cudaMemcpyDeviceToHost, + cuda_stream->cuda_stream); + PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); + } +#endif + } + } + } + return PARSEC_SUCCESS; +} +#endif /* defined(DPLASMA_HAVE_CUDA) */ diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c index 09eab0d0..86a6b189 100644 --- a/src/dplasmaaux.c +++ b/src/dplasmaaux.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -109,3 +109,4 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A ) return dplasma_imax( ceil( alpha ), 2 ); } } + diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h index f4df23f5..28ae2039 100644 --- a/src/dplasmaaux.h +++ b/src/dplasmaaux.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2021 The University of Tennessee and The University + * Copyright (c) 2011-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -112,5 +112,7 @@ extern void *dplasma_pcomm; #if defined(DPLASMA_HAVE_CUDA) #include "dplasmaaux_cuda.h" #endif - +#if defined(DPLASMA_HAVE_HIP) +#include "dplasmaaux_hip.h" +#endif #endif /* _DPLASMAAUX_H_INCLUDED */ diff --git a/src/dplasmaaux_cuda.c b/src/dplasmaaux_cuda.c index e4c0ceb8..c85242b0 100644 --- a/src/dplasmaaux_cuda.c +++ b/src/dplasmaaux_cuda.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT @@ -8,21 +8,23 @@ #include "dplasma/config.h" #include "parsec/utils/zone_malloc.h" #include "parsec/utils/show_help.h" +#include "potrf_gpu_workspaces.h" + #include +#include #include "dplasmaaux_cuda.h" -#include "potrf_cublas_utils.h" -/* +/* * Global info ID's for cublas handles and workspaces * Should be initialized in the tests * with the return of parsec_info_register * or parsec_info_lookup */ -parsec_info_id_t CuHI = -1; -parsec_info_id_t WoSI = -1; +parsec_info_id_t dplasma_dtd_cuda_infoid = -1; +parsec_info_id_t dplasma_dtd_cuda_workspace_infoid = -1; -/* Unfortunately, CUBLAS does not provide a error to string function */ -static char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) +/* Unfortunately, CUBLAS < 11.4.2 does not provide a error to string function */ +const char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) { switch(cublas_status) { @@ -38,8 +40,8 @@ static char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status) } } -/* Unfortunately, cuSolver does not provide a error to string function */ -char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status) +/* Unfortunately, cuSolver < 11.4.2 does not provide a error to string function */ +const char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status) { switch(cusolver_status) { case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS"; @@ -106,3 +108,4 @@ void dplasma_destroy_cuda_handles(void *_h, void *_n) cusolverDnDestroy(handles->cusolverDn_handle); free(handles); } + diff --git a/src/dplasmaaux_cuda.h b/src/dplasmaaux_cuda.h index aa1360ff..ceebd1cb 100644 --- a/src/dplasmaaux_cuda.h +++ b/src/dplasmaaux_cuda.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT @@ -9,16 +9,18 @@ #ifndef _DPLASMAAAUX_CUDA_H_ #define _DPLASMAAAUX_CUDA_H_ + +#if defined(DPLASMA_HAVE_CUDA) #include "parsec/mca/device/cuda/device_cuda.h" /** * DPLASMA currently supports a mix of cublas v1 and v2, but not in the same source file. Thus, * the simplest way to provide common headers is to require the developer to manually specify - * when cublas_v2 is needed by including the header before dplasmaaux.h. Otherwise, we will include - * cublas.h (v1) automatically if CUDA is enabled. + * when legacy cublas is needed by including the header before dplasmaaux.h. Otherwise, we will include + * cublas_v2.h (v2) automatically if CUDA is enabled. */ -#if !defined(CUBLAS_V2_H_) -#include +#if !defined(CUBLAS_H_) +#include #endif /* !defined(CUBLAS_V2_H_) */ #define dplasma_cublas_side(side) \ @@ -57,8 +59,8 @@ trans = (trans == dplasmaNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; #endif /* PRECISION_z || PRECISION_c */ -extern parsec_info_id_t CuHI; -extern parsec_info_id_t WoSI; +extern parsec_info_id_t dplasma_dtd_cuda_infoid; +extern parsec_info_id_t dplasma_dtd_cuda_workspace_infoid; typedef struct { cublasHandle_t cublas_handle; @@ -68,12 +70,14 @@ typedef struct { void *dplasma_create_cuda_handles(void *obj, void *user); void dplasma_destroy_cuda_handles(void *_h, void *_n); +const char *dplasma_cublas_error_to_string(cublasStatus_t cublas_status); + #define DPLASMA_CUBLAS_CHECK_STATUS( STR, STATUS, CODE ) \ do { \ cublasStatus_t __cublas_status = (cublasStatus_t) (STATUS); \ if( CUBLAS_STATUS_SUCCESS != __cublas_status ) { \ parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ - (STR), cublasGetStatusString(__cublas_status) ); \ + (STR), dplasma_cublas_error_to_string(__cublas_status) ); \ CODE; \ } \ } while(0) @@ -82,7 +86,7 @@ void dplasma_destroy_cuda_handles(void *_h, void *_n); /* Support for cusolve requires cublas_v2 */ #include -char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); +const char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); #define DPLASMA_CUSOLVER_CHECK_STATUS( STR, STATUS, CODE ) \ do { \ @@ -95,4 +99,7 @@ char *dplasma_cusolver_error_to_string(cusolverStatus_t cusolver_status); } while(0) #endif /* defined(CUBLAS_V2_H_) */ +#else +#warning "DPLASMA_HAVE_CUDA not defined, this file should not be included then." +#endif /* defined(DPLASMA_HAVE_CUDA) */ #endif /* __DPLAMAAUX_CUDA_H__ */ diff --git a/src/dplasmaaux_hip.c b/src/dplasmaaux_hip.c new file mode 100644 index 00000000..40574ae7 --- /dev/null +++ b/src/dplasmaaux_hip.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT + * + */ +#include "dplasma/config.h" +#include "parsec/utils/zone_malloc.h" +#include "parsec/utils/show_help.h" +#include "dplasmaaux_hip.h" +#include "potrf_gpu_workspaces.h" + +#include +#include + +/* + * Global info ID's for cublas handles and workspaces + * Should be initialized in the tests + * with the return of parsec_info_register + * or parsec_info_lookup + */ +parsec_info_id_t dplasma_dtd_hip_infoid = -1; + +/* Unfortunately, hipSolver does not provide a error to string function */ +const char *dplasma_hipsolver_error_to_string(hipsolverStatus_t hipsolver_status) +{ + switch(hipsolver_status) { + case HIPSOLVER_STATUS_SUCCESS: return "HIPSOLVER_STATUS_SUCCESS"; + case HIPSOLVER_STATUS_NOT_INITIALIZED: return "HIPSOLVER_STATUS_NOT_INITIALIZED"; + case HIPSOLVER_STATUS_ALLOC_FAILED: return "HIPSOLVER_STATUS_ALLOC_FAILED"; + case HIPSOLVER_STATUS_INVALID_VALUE: return "HIPSOLVER_STATUS_INVALID_VALUE"; + case HIPSOLVER_STATUS_ARCH_MISMATCH: return "HIPSOLVER_STATUS_ARCH_MISMATCH"; + case HIPSOLVER_STATUS_EXECUTION_FAILED: return "HIPSOLVER_STATUS_EXECUTION_FAILED"; + case HIPSOLVER_STATUS_INTERNAL_ERROR: return "HIPSOLVER_STATUS_INTERNAL_ERROR"; + case HIPSOLVER_STATUS_MAPPING_ERROR: return "HIPSOLVER_STATUS_MAPPING_ERROR"; + case HIPSOLVER_STATUS_NOT_SUPPORTED: return "HIPSOLVER_STATUS_NOT_SUPPORTED"; + case HIPSOLVER_STATUS_HANDLE_IS_NULLPTR: return "HIPSOLVER_STATUS_HANDLE_IS_NULLPTR"; + case HIPSOLVER_STATUS_INVALID_ENUM: return "HIPSOLVER_STATUS_INVALID_ENUM"; + default: return "unknown hipsolver error"; + } +} + +void *dplasma_create_hip_handles(void *obj, void *_n) +{ + parsec_hip_exec_stream_t *stream = (parsec_hip_exec_stream_t *)obj; + dplasma_hip_handles_t *new; + hipblasHandle_t hipblas_handle; + hipblasStatus_t hipblas_status; + + (void)_n; + + /* No need to call hipSetDevice, as this has been done by PaRSEC before calling the task body */ + hipblas_status = hipblasCreate(&hipblas_handle); + if(HIPBLAS_STATUS_SUCCESS != hipblas_status) { + if( HIPBLAS_STATUS_ALLOC_FAILED == hipblas_status) { + parsec_show_help("help-dplasma.txt", "gpu_alloc_failed", 1, "HIPBLAS"); + } + parsec_fatal("Unable to create HIPBLAS Handle: %s", + hipblasStatusToString(hipblas_status)); + return NULL; + } + hipblas_status = hipblasSetStream(hipblas_handle, stream->hip_stream); + assert(HIPBLAS_STATUS_SUCCESS == hipblas_status); + + new = malloc(sizeof(dplasma_hip_handles_t)); + new->hipblas_handle = hipblas_handle; + + return new; +} + +void dplasma_destroy_hip_handles(void *_h, void *_n) +{ + dplasma_hip_handles_t *handles = (dplasma_hip_handles_t*)_h; + (void)_n; + hipblasDestroy(handles->hipblas_handle); + free(handles); +} diff --git a/src/dplasmaaux_hip.h b/src/dplasmaaux_hip.h new file mode 100644 index 00000000..5dc24460 --- /dev/null +++ b/src/dplasmaaux_hip.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * $COPYRIGHT + * + */ + +#ifndef _DPLASMAAAUX_HIP_H_ +#define _DPLASMAAAUX_HIP_H_ + +#if defined(DPLASMA_HAVE_HIP) +#include "parsec/mca/device/hip/device_hip.h" + +#include + +#define dplasma_hipblas_side(side) \ + assert( (side == dplasmaRight) || (side == dplasmaLeft) ); \ + side = (side == dplasmaRight) ? HIPBLAS_SIDE_RIGHT : HIPBLAS_SIDE_LEFT; + + +#define dplasma_hipblas_diag(diag) \ + assert( (diag == dplasmaNonUnit) || (diag == dplasmaUnit) ); \ + diag = (diag == dplasmaNonUnit) ? HIPBLAS_DIAG_NON_UNIT : HIPBLAS_DIAG_UNIT; + +#define dplasma_hipblas_fill(fill) \ + assert( (fill == dplasmaLower) || (fill == dplasmaUpper) ); \ + fill = (fill == dplasmaLower) ? HIPBLAS_FILL_MODE_LOWER : HIPBLAS_FILL_MODE_UPPER; + +#if defined(PRECISION_z) || defined(PRECISION_c) +#define dplasma_hipblas_op(trans) \ + assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) || (trans == dplasmaConjTrans) ); \ + switch(trans){ \ + case dplasmaNoTrans: \ + trans = HIPBLAS_OP_N; \ + break; \ + case dplasmaTrans: \ + trans = HIPBLAS_OP_T; \ + break; \ + case dplasmaConjTrans: \ + trans = HIPBLAS_OP_C; \ + break; \ + default: \ + trans = HIPBLAS_OP_N; \ + break; \ + } +#else +#define dplasma_hipblas_op(trans) \ + assert( (trans == dplasmaNoTrans) || (trans == dplasmaTrans) ); \ + trans = (trans == dplasmaNoTrans) ? HIPBLAS_OP_N : HIPBLAS_OP_T; +#endif /* PRECISION_z || PRECISION_c */ + +extern parsec_info_id_t dplasma_dtd_hip_infoid; + +typedef struct { + hipblasHandle_t hipblas_handle; +} dplasma_hip_handles_t; + +void *dplasma_create_hip_handles(void *obj, void *user); +void dplasma_destroy_hip_handles(void *_h, void *_n); + +#define DPLASMA_ROCBLAS_CHECK_ERROR(STR, ERROR, CODE) \ + do { \ + rocblas_status __error = (rocblas_status) (ERROR); \ + if(rocblas_status_success != __error) { \ + parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ + (STR), rocblas_status_to_string(__error)); \ + CODE; \ + } \ + } while(0) + +/* For some reason the error values are not the same... */ +#define DPLASMA_HIPBLAS_CHECK_ERROR(STR, ERROR, CODE) \ + do { \ + hipblasStatus_t __error = (hipblasStatus_t) (ERROR); \ + if(HIPBLAS_STATUS_SUCCESS != __error) { \ + parsec_warning( "%s:%d %s%s", __FILE__, __LINE__, \ + (STR), hipblasStatusToString(__error)); \ + CODE; \ + } \ + } while(0) + +#else +#warning "DPLASMA_HAVE_HIP not defined, this file should not be included then." +#endif /* defined(DPLASMA_HAVE_HIP */ +#endif /* __DPLAMAAUX_HIP_H__ */ diff --git a/src/dplasmajdf.h b/src/dplasmajdf.h index 4d2acbae..8d09cb5f 100644 --- a/src/dplasmajdf.h +++ b/src/dplasmajdf.h @@ -23,11 +23,11 @@ # include # define printlog(str, ...) fprintf(stderr, "thread %d VP %d " str "\n", \ es->th_id, es->virtual_process->vp_id, __VA_ARGS__) -# define printlogcuda(str, ...) fprintf(stderr, "cuda %d " str "\n", \ - gpu_device->cuda_index, __VA_ARGS__) +# define printloggpu(str, ...) fprintf(stderr, "GPU %s " str "\n", \ + gpu_device->super.device_name, __VA_ARGS__) #else # define printlog(...) do {} while(0) -# define printlogcuda(...) do {} while(0) +# define printloggpu(...) do {} while(0) #endif #ifndef PARSEC_HAVE_MPI diff --git a/src/dplasmajdf_lapack_dtt.h b/src/dplasmajdf_lapack_dtt.h index 4937fb72..21019004 100644 --- a/src/dplasmajdf_lapack_dtt.h +++ b/src/dplasmajdf_lapack_dtt.h @@ -1,11 +1,22 @@ -#ifndef _DPLASMAJDF_LAPACK_DTT_H_ -#define _DPLASMAJDF_LAPACK_DTT_H_ +/* + * Copyright (c) 2020-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * $COPYRIGHT + * + */ +#ifndef INCLUDE_DPLASMA_LAPACK_DTT_H +#define INCLUDE_DPLASMA_LAPACK_DTT_H +#include "dplasma/config.h" +#include +#include #include "dplasma/types.h" #include "dplasma/types_lapack.h" /* DON'T CHANGE SHAPE */ -#define SAME -1 +#define DPLASMA_SHAPE_SAME -1 /* Obtain location on matrix. */ @@ -80,7 +91,7 @@ ADTT_CP(parsec_data_copy_t *cp, const dplasma_data_collection_t *ddc, int target rc = dplasma_get_info_from_datatype(ddc, cp->dtt, &cp_info, &adt); assert(rc == 0); - if(( cp_info->shape == target_shape )||(target_shape == SAME)){ + if(( cp_info->shape == target_shape )||(target_shape == DPLASMA_SHAPE_SAME)){ PARSEC_DEBUG_VERBOSE(8, parsec_debug_output, "CP %p [type %p] -> target_shape %d target_loc %d dtt %p", cp, cp->dtt, target_shape, target_loc, adt->opaque_dtt); @@ -121,159 +132,32 @@ void ADTT_INFO_internal(parsec_data_copy_t *cp, const dplasma_data_collection_t ADTT_INFO_internal(_f_##FLOW_NAME, ddc, lda, rows, cols) -#if defined(DPLASMA_HAVE_CUDA) -/* Use cudaMemcpy2DAsync or loop with cudaMemcpyAsync for data transfers to device */ -#define CUDA_COPY_2D /* Functions to transfer data in and out of the GPU. * Assuming a full tiled has been allocated on the GPU (mb*nb*size(elem)) */ -static int -stage_in_lapack(parsec_gpu_task_t *gtask, +#if defined(DPLASMA_HAVE_CUDA) +int +dplasma_cuda_lapack_stage_in(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) -{ - cudaError_t ret; - parsec_data_copy_t * copy_in; - parsec_data_copy_t * copy_out; - parsec_device_cuda_module_t *in_elem_dev; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; - dplasma_data_collection_t * ddc; - parsec_task_t *task = gtask->ec; - int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ - if(flow_mask & (1U << i)){ - copy_in = task->data[i].data_in; - copy_out = task->data[i].data_out; - ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; - assert(ddc != NULL); - elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - in_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_in->device_index); - if( (in_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ - ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, - copy_in->device_private, - gtask->flow_nb_elts[i], - (in_elem_dev->super.super.type != PARSEC_DEV_CUDA)? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - }else{ - -#ifdef CUDA_COPY_2D - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); - size_t dpitch = ddc->dc_original->mb * elem_sz; - size_t spitch = ldd * elem_sz; - size_t width = nrows * elem_sz; - size_t height = ncols; - /* copy width bytes heigth times, skipping pitch - width bytes every time */ - ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, - dpitch, /*dst pitch bytes*/ - copy_in->device_private, - spitch, /*src pitch bytes*/ - width, height, - cudaMemcpyHostToDevice, - cuda_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); - - -#else - - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols); - - int j; - for(j=0; jdevice_private) + j * ldd * elem_sz; - char*dst = ((char*)copy_out->device_private) + j * ddc->dc_original->mb * elem_sz; - ret = cudaMemcpyAsync(dst, - src, - nrows * elem_sz, - cudaMemcpyHostToDevice, - gpu_stream->cuda_stream ); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - - } -#endif - + parsec_gpu_exec_stream_t *gpu_stream); - } - } - } - return PARSEC_SUCCESS; -} - -static int -stage_out_lapack(parsec_gpu_task_t *gtask, +int +dplasma_cuda_lapack_stage_out(parsec_gpu_task_t *gtask, uint32_t flow_mask, - parsec_gpu_exec_stream_t *gpu_stream) -{ - cudaError_t ret; - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; - parsec_data_copy_t * copy_in; - parsec_data_copy_t * copy_out; - parsec_device_cuda_module_t *out_elem_dev; - parsec_task_t *task = gtask->ec; - dplasma_data_collection_t * ddc; - int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ - if(flow_mask & (1U << i)){ - copy_in = task->data[i].data_out; - copy_out = copy_in->original->device_copies[0]; - ddc = (dplasma_data_collection_t*)gtask->flow_dc[i]; - assert(ddc != NULL); - elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype); - out_elem_dev = (parsec_device_cuda_module_t*)parsec_mca_device_get( copy_out->device_index); - - if( (out_elem_dev->super.super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){ - ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, - copy_in->device_private, - gtask->flow_nb_elts[i], - out_elem_dev->super.super.type != PARSEC_DEV_CUDA ? - cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - }else{ - -#ifdef CUDA_COPY_2D - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); - size_t dpitch = ldd * elem_sz; - size_t spitch = ddc->dc_original->mb * elem_sz; - size_t width = nrows * elem_sz; - size_t height = ncols; - /* copy width bytes heigth times, skipping pitch - width bytes every time */ - ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private, - dpitch, /*dst pitch bytes*/ - copy_in->device_private, - spitch, /*src pitch bytes*/ - width, height, - cudaMemcpyDeviceToHost, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } ); -#else - int ldd, nrows, ncols; - ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols); - int j; - for(j=0; jdevice_private) + j * ddc->dc_original->mb * elem_sz; - char*dst = ((char*)copy_out->device_private) + j * ldd * elem_sz; - ret = cudaMemcpyAsync(dst, - src, - nrows * elem_sz, - cudaMemcpyDeviceToHost, - cuda_stream->cuda_stream); - PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } ); - } -#endif - } - } - } - return PARSEC_SUCCESS; -} + parsec_gpu_exec_stream_t *gpu_stream); #endif /* defined(DPLASMA_HAVE_CUDA) */ +#if defined(DPLASMA_HAVE_HIP) +int +dplasma_hip_lapack_stage_in(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream); + +int +dplasma_hip_lapack_stage_out(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream); +#endif /* defined(DPLASMA_HAVE_HIP) */ -#endif /* _DPLASMAJDF_LAPACK_DTT_H_ */ +#endif /* INCLUDE_DPLASMA_LAPACK_DTT_H */ diff --git a/src/dtd_wrappers/dplasma_z_dtd.h b/src/dtd_wrappers/dplasma_z_dtd.h index 97c906da..e6db6208 100644 --- a/src/dtd_wrappers/dplasma_z_dtd.h +++ b/src/dtd_wrappers/dplasma_z_dtd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -19,7 +19,7 @@ #include "parsec/mca/device/cuda/device_cuda.h" #include "parsec/utils/zone_malloc.h" #include "dplasmaaux.h" -#include "potrf_cublas_utils.h" +#include "potrf_gpu_workspaces.h" /* probably need to add this to substitions */ #if defined(PRECISION_s) diff --git a/src/dtd_wrappers/zgemm.c b/src/dtd_wrappers/zgemm.c index 9b39354d..1650b6f6 100644 --- a/src/dtd_wrappers/zgemm.c +++ b/src/dtd_wrappers/zgemm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int @@ -80,7 +75,7 @@ parsec_core_zgemm_cuda(parsec_device_gpu_module_t* gpu_device, } #endif /* defined(PARSEC_DEBUG_NOISIER) */ - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); assert(NULL != handles); parsec_cuda_exec_stream_t* cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream; diff --git a/src/dtd_wrappers/zherk.c b/src/dtd_wrappers/zherk.c index f0aed388..3de61529 100644 --- a/src/dtd_wrappers/zherk.c +++ b/src/dtd_wrappers/zherk.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int @@ -60,7 +55,7 @@ parsec_core_zherk_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_op(trans); dplasma_cublas_fill(uplo); - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); #if defined(PARSEC_DEBUG_NOISIER) { diff --git a/src/dtd_wrappers/zpotrf.c b/src/dtd_wrappers/zpotrf.c index 0b979be4..521845c2 100644 --- a/src/dtd_wrappers/zpotrf.c +++ b/src/dtd_wrappers/zpotrf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int @@ -39,7 +34,7 @@ zpotrf_dtd_create_workspace(void *obj, void *user) cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; zpotrf_dtd_workspace_info_t *infos = (zpotrf_dtd_workspace_info_t*) user; - dplasma_potrf_workspace_t *wp = NULL; + dplasma_potrf_gpu_workspaces_t *wp = NULL; size_t workspace_size; size_t host_size; int mb = infos->mb; @@ -66,7 +61,7 @@ zpotrf_dtd_create_workspace(void *obj, void *user) cusolverDnDestroy(cusolverDnHandle); - wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); assert(NULL != wp->tmpmem); wp->lwork = workspace_size; @@ -81,7 +76,7 @@ zpotrf_dtd_create_workspace(void *obj, void *user) void zpotrf_dtd_destroy_workspace(void *_ws, void *_n) { - dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; cusolverDnParams_t* params = ws->params; cusolverStatus_t status = cusolverDnDestroyParams(*params); assert(CUSOLVER_STATUS_SUCCESS == status); @@ -105,7 +100,7 @@ parsec_core_zpotrf_cuda(parsec_device_gpu_module_t* gpu_device, parsec_task_t* this_task = gpu_task->ec; cusolverStatus_t status; dplasma_cuda_handles_t* handles; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; cuDoubleComplex *workspace; cusolverDnParams_t* params; int *d_iinfo; @@ -116,9 +111,9 @@ parsec_core_zpotrf_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_fill(uplo); - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, WoSI); + wp = parsec_info_get(&gpu_device->super.infos, dplasma_dtd_cuda_workspace_infoid); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; diff --git a/src/dtd_wrappers/ztrsm.c b/src/dtd_wrappers/ztrsm.c index 612ec99e..7e5a7ac6 100644 --- a/src/dtd_wrappers/ztrsm.c +++ b/src/dtd_wrappers/ztrsm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023- The University of Tennessee and The University + * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -7,11 +7,6 @@ * */ #include "dplasma/config.h" - -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasma_z_dtd.h" int @@ -62,7 +57,7 @@ parsec_core_ztrsm_cuda(parsec_device_gpu_module_t* gpu_device, dplasma_cublas_op(trans); dplasma_cublas_diag(diag); - handles = parsec_info_get(&gpu_stream->infos, CuHI); + handles = parsec_info_get(&gpu_stream->infos, dplasma_dtd_cuda_infoid); #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex alphag = make_cuDoubleComplex( creal(alpha), cimag(alpha)); diff --git a/src/include/dplasma/config.h.in b/src/include/dplasma/config.h.in index 1b180b84..b72b34de 100644 --- a/src/include/dplasma/config.h.in +++ b/src/include/dplasma/config.h.in @@ -6,6 +6,7 @@ /* GPU Backends */ #cmakedefine DPLASMA_HAVE_CUDA +#cmakedefine DPLASMA_HAVE_HIP /* system feature tests */ #cmakedefine DPLASMA_HAVE_COMPLEX_H diff --git a/src/potrf_cublas_utils.h b/src/potrf_gpu_workspaces.h similarity index 66% rename from src/potrf_cublas_utils.h rename to src/potrf_gpu_workspaces.h index 8ff02a9b..5366e80a 100644 --- a/src/potrf_cublas_utils.h +++ b/src/potrf_gpu_workspaces.h @@ -4,10 +4,8 @@ * reserved. * */ -#ifndef DPLASMA_POTRF_CUBLAS_UTILS_H -#define DPLASMA_POTRF_CUBLAS_UTILS_H - -#if defined(DPLASMA_HAVE_CUDA) +#ifndef DPLASMA_POTRF_GPU_WORKSPACES_H +#define DPLASMA_POTRF_GPU_WORKSPACES_H typedef struct { char *tmpmem; @@ -16,8 +14,6 @@ typedef struct { void* params; size_t host_size; void* host_buffer; -} dplasma_potrf_workspace_t; - -#endif +} dplasma_potrf_gpu_workspaces_t; -#endif //DPLASMA_POTRF_CUBLAS_UTILS_H +#endif //DPLASMA_POTRF_GPU_WORKSPACES_H diff --git a/src/zgemm_NN.jdf b/src/zgemm_NN.jdf index fe04a879..fa273d40 100644 --- a/src/zgemm_NN.jdf +++ b/src/zgemm_NN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 diff --git a/src/zgemm_NN_gpu.jdf b/src/zgemm_NN_gpu.jdf index eb723c32..bf087dd7 100644 --- a/src/zgemm_NN_gpu.jdf +++ b/src/zgemm_NN_gpu.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2017-2023 The University of Tennessee and The University + * Copyright (c) 2017-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -9,15 +9,14 @@ extern "C" %{ * */ -#include -#include "dplasmajdf.h" -#include "dplasmaaux.h" -#include "parsec/data_dist/matrix/matrix.h" -#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include static void succ(int *x, int *y, int *z, int xMax, int yMax, int zMax, int l) { @@ -151,6 +150,8 @@ xMax [ type = int default = "-1" hidden=on ] yMax [ type = int default = "-1" hidden=on ] zMax [ type = int default = "-1" hidden=on ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /********************************************************* * READ_A * * A is broadcast to all target GEMMs from the beginning * @@ -425,6 +426,70 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha = { creal(alpha), cimag(alpha) }; + hipblasDoubleComplex lbeta = { 1., 0. }; + if( k == 0 ) { lbeta.x = creal(beta); lbeta.y = cimag(beta); }; +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int cAmb = descA->mb; + int cAnb = descA->nb; + int cBmb = descB->nb; + int cBnb = descB->nb; + int cCmb = cAmb; + int cCnb = cBnb; + + int tempmm = cCmb; + int tempnn = cCnb; + int tempkk = cAnb; + int ldam = cAmb; + int ldbk = cBmb; + int ldcm = cCmb; + + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, + "HIP: gemm( %d, %d, %d ) > A(%d,%d) * B(%d,%d) C(%d,%d)\n", + m, n, k, cAmb, cAnb, cBmb, cBnb, cCmb, cCnb); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, A, ldam, + B, ldbk, + &lbeta, C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); + + /* Quick and dirty emulation of the next GEMM */ + if( k == descC->mt -1 ) { + unsigned int chore_id = 0; + for(chore_id = 0; chore_id < 8*sizeof(this_task->chore_mask); chore_id++) { + if( (this_task->chore_mask & (1<chore_mask)); + __parsec_zgemm_NN_gpu_GEMM_task_t next_gemm; + memcpy(&next_gemm, this_task, sizeof(__parsec_zgemm_NN_gpu_GEMM_task_t)); + next_gemm.locals.k.value = descC->mt -1; + assert( PARSEC_DEV_HIP == next_gemm.task_class->incarnations[chore_id].type ); + if(NULL != next_gemm.task_class->incarnations[chore_id].evaluate) { + if( next_gemm.task_class->incarnations[chore_id].evaluate((parsec_task_t*)&next_gemm) == + PARSEC_HOOK_RETURN_NEXT ) { + /* The next GEMM wants to run on the CPUs... */ + gpu_task->pushout |= (1 << 0); + } + } + } +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_NN_summa.jdf b/src/zgemm_NN_summa.jdf index f6a60d6e..c27a5846 100644 --- a/src/zgemm_NN_summa.jdf +++ b/src/zgemm_NN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -85,6 +84,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -211,8 +212,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); @@ -229,6 +230,15 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), @@ -241,6 +251,56 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->n - k * descA->nb : descA->nb; + int ldam = descA->mb; + int ldbk = descB->mb; + int ldcm = descC->mb; + + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + k, n, ldbk, + (k==0)? creal(beta): 1.0, m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { diff --git a/src/zgemm_NT.jdf b/src/zgemm_NT.jdf index 8fc8da8f..b6fa8c89 100644 --- a/src/zgemm_NT.jdf +++ b/src/zgemm_NT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -169,8 +168,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_NT_summa.jdf b/src/zgemm_NT_summa.jdf index 476c36d7..d66e5171 100644 --- a/src/zgemm_NT_summa.jdf +++ b/src/zgemm_NT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -85,6 +84,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -211,8 +212,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); @@ -229,6 +230,15 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), @@ -241,6 +251,56 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->n - k * descA->nb : descA->nb; + int ldam = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), m, k, ldam, + n, k, ldbn, + (k==0)? creal(beta): 1.0, m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldam, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_TN.jdf b/src/zgemm_TN.jdf index f573ecda..aa529d91 100644 --- a/src/zgemm_TN.jdf +++ b/src/zgemm_TN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -169,8 +168,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TN_summa.jdf b/src/zgemm_TN_summa.jdf index c82251d2..b095ac61 100644 --- a/src/zgemm_TN_summa.jdf +++ b/src/zgemm_TN_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -85,6 +84,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -210,8 +211,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); @@ -228,6 +229,15 @@ BODY [type=CUDA int ldbk = descB->mb; int ldcm = descC->mb; + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + k, n, ldbk, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), @@ -240,6 +250,56 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->nt-1 ? descA->m - k * descA->nb : descA->nb; + int ldak = descA->mb; + int ldbk = descB->mb; + int ldcm = descC->mb; + + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + k, n, ldbk, + (k==0)? creal(beta): 1.0, m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldak, + (hipblasDoubleComplex*)B, ldbk, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_TT.jdf b/src/zgemm_TT.jdf index f0baccb8..4a4effdc 100644 --- a/src/zgemm_TT.jdf +++ b/src/zgemm_TT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -169,8 +168,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); diff --git a/src/zgemm_TT_summa.jdf b/src/zgemm_TT_summa.jdf index 0ea2708c..5b3eb373 100644 --- a/src/zgemm_TT_summa.jdf +++ b/src/zgemm_TT_summa.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -8,16 +8,15 @@ extern "C" %{ * $COPYRIGHT * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" -#include "dplasmaaux.h" #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include -#if defined(DPLASMA_HAVE_CUDA) -#include -#endif /* defined(DPLASMA_HAVE_CUDA) */ - /* Define the different shapes this JDF is using */ #define A_SHAPE 0 #define B_SHAPE 1 @@ -85,6 +84,8 @@ Q [type = "int" hidden=on default="((parsec_matrix_block_cyclic_t*)descC)-> lookP [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] lookQ [type = "int" hidden=on default="dplasma_aux_getGEMMLookahead(descC)"] +hip_handles_infokey [type = "int" hidden = on default = -1 ] + /************************************************** * READ_A * **************************************************/ @@ -210,8 +211,8 @@ BODY [type=CUDA B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} A.dc=ddescA B.dc=ddescB C.dc=ddescC - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex lalpha = make_cuDoubleComplex(creal(alpha), cimag(alpha)); @@ -228,6 +229,15 @@ BODY [type=CUDA int ldbn = descB->mb; int ldcm = descC->mb; + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + n, k, ldbn, + creal(lbeta), m, n, ldcm ); + cublasStatus_t status; cublasSetKernelStream( parsec_body.stream ); cublasZgemm( dplasma_lapack_const(transA), dplasma_lapack_const(transB), @@ -243,6 +253,56 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descB->mb*descB->nb*parsec_datadist_getsizeoftype(descB->mtype);%} + C.size=%{ return descC->mb*descC->nb*parsec_datadist_getsizeoftype(descC->mtype);%} + A.dc=ddescA B.dc=ddescB C.dc=ddescC + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex lalpha; + lalpha.x = creal(alpha); lalpha.y = cimag(alpha); + hipblasDoubleComplex lbeta = { 1., 0. }; + if(k == 0) { + lbeta.x = creal(beta); lbeta.y = cimag(beta); + } +#else + double lalpha = alpha; + double lbeta = (k == 0) ? beta : 1.0; +#endif + int tempmm = m == descC->mt-1 ? descC->m - m * descC->mb : descC->mb; + int tempnn = n == descC->nt-1 ? descC->n - n * descC->nb : descC->nb; + int tempkk = k == descA->mt-1 ? descA->m - k * descA->nb : descA->nb; + int ldak = descA->mb; + int ldbn = descB->mb; + int ldcm = descC->mb; + + printloggpu("gemm( %d, %d, %d )\n" + " ( %s, %s, %d, %d, %d, %f, A(%d,%d), %d, B(%d,%d), %d, %f, C(%d,%d), %d)\n", + m, n, k, + &dplasma_lapack_const( transA ), &dplasma_lapack_const( transB ), + tempmm, tempnn, tempkk, + creal(alpha), k, m, ldak, + n, k, ldbn, + (k==0)? creal(beta): 1.0, m, n, ldcm ); + + hipblasStatus_t status; + hipblasOperation_t opA = dplasmaNoTrans == transA? HIPBLAS_OP_N: HIPBLAS_OP_T; + hipblasOperation_t opB = dplasmaNoTrans == transB? HIPBLAS_OP_N: HIPBLAS_OP_T; + dplasma_hip_handles_t *handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + opA, opB, + tempmm, tempnn, tempkk, + &lalpha, (hipblasDoubleComplex*)A, ldak, + (hipblasDoubleComplex*)B, ldbn, + &lbeta, (hipblasDoubleComplex*)C, ldcm ); + DPLASMA_HIPBLAS_CHECK_ERROR( "hipblasZgemm ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); +} +END + BODY { dplasma_complex64_t lbeta = (k == 0) ? beta : (dplasma_complex64_t)1.0; diff --git a/src/zgemm_wrapper.c b/src/zgemm_wrapper.c index e2d5ebb7..701db625 100644 --- a/src/zgemm_wrapper.c +++ b/src/zgemm_wrapper.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -14,9 +14,7 @@ #include "dplasma/types_lapack.h" #include "dplasmaaux.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" -#if defined(DPLASMA_HAVE_CUDA) -#include "parsec/mca/device/cuda/device_cuda.h" -#endif +#include "parsec/mca/device/device_gpu.h" #include "utils/dplasma_info.h" #include "zgemm_NN.h" @@ -80,12 +78,26 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_NN_summa_taskpool_t* tp; tp = parsec_zgemm_NN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } else { PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_NT_summa\n"); parsec_zgemm_NT_summa_taskpool_t* tp; tp = parsec_zgemm_NT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } } else { @@ -94,6 +106,13 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, parsec_zgemm_TN_summa_taskpool_t* tp; tp = parsec_zgemm_TN_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } else { PARSEC_DEBUG_VERBOSE(3, parsec_debug_output, "zgemm_TT_summa\n"); @@ -101,6 +120,13 @@ dplasma_zgemm_summa_new(dplasma_enum_t transA, dplasma_enum_t transB, tp = parsec_zgemm_TT_summa_new(transA, transB, alpha, beta, ddc_A, ddc_B, ddc_C, (parsec_data_collection_t*)Cdist); +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif zgemm_tp = (parsec_taskpool_t*)tp; } } @@ -188,7 +214,7 @@ dplasma_zgemm_default_new(dplasma_enum_t transA, dplasma_enum_t transB, return zgemm_tp; } -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) static parsec_taskpool_t* dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, dplasma_complex64_t alpha, const parsec_tiled_matrix_t* A, const parsec_tiled_matrix_t* B, @@ -222,13 +248,13 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { - parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nbgpu++; if( 0 == gpu_mem_block_size ) - gpu_mem_block_size = cuda_device->super.mem_block_size; - if( -1 == gpu_mem_nb_blocks || cuda_device->super.mem_nb_blocks < gpu_mem_nb_blocks ) - gpu_mem_nb_blocks = cuda_device->super.mem_nb_blocks; + gpu_mem_block_size = gpu_device->mem_block_size; + if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks ) + gpu_mem_nb_blocks = gpu_device->mem_nb_blocks; } } if(nbgpu == 0) { @@ -239,7 +265,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, nbgpu= 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { dev_index[nbgpu++] = device->device_index; } } @@ -358,8 +384,15 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, K = B->mt; tp->_g_zMax = (K + d - 1) / d - 1; - zgemm_tp = (parsec_taskpool_t *) tp; +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + tp->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); +#else + tp->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif + zgemm_tp = (parsec_taskpool_t *) tp; return zgemm_tp; } @@ -368,7 +401,7 @@ dplasma_zgemm_gpu_new( dplasma_enum_t transA, dplasma_enum_t transB, free(dev_index); return NULL; } -#endif /* DPLASMA_HAVE_CUDA */ +#endif /* DPLASMA_HAVE_CUDA || DPLASMA_HAVE_HIP */ /** ******************************************************************************* @@ -453,7 +486,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, } if ( C->dtype & parsec_matrix_block_cyclic_type ) { -#if defined(DPLASMA_HAVE_CUDA) +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) int nb_gpu_devices = 0, devid; int p = ((parsec_matrix_block_cyclic_t*)C)->grid.rows; int q = ((parsec_matrix_block_cyclic_t*)C)->grid.cols; @@ -461,13 +494,13 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, int64_t gpu_mem_nb_blocks = -1; for(devid = 0; devid < (int)parsec_nb_devices; devid++) { parsec_device_module_t *device = parsec_mca_device_get(devid); - if( PARSEC_DEV_CUDA == device->type ) { - parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)device; + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device; nb_gpu_devices++; if( 0 == gpu_mem_block_size ) - gpu_mem_block_size = cuda_device->super.mem_block_size; - if( -1 == gpu_mem_nb_blocks || cuda_device->super.mem_nb_blocks < gpu_mem_nb_blocks ) - gpu_mem_nb_blocks = cuda_device->super.mem_nb_blocks; + gpu_mem_block_size = gpu_device->mem_block_size; + if( -1 == gpu_mem_nb_blocks || gpu_device->mem_nb_blocks < gpu_mem_nb_blocks ) + gpu_mem_nb_blocks = gpu_device->mem_nb_blocks; } } if(0 < nb_gpu_devices) { @@ -484,7 +517,7 @@ dplasma_zgemm_New_ex( dplasma_enum_t transA, dplasma_enum_t transB, return zgemm_tp; } } -#endif /* DPLASMA_HAVE_CUDA */ +#endif /* DPLASMA_HAVE_CUDA || DPLASMA_HAVE_HIP */ zgemm_tp = dplasma_zgemm_summa_new(transA, transB, alpha, A, B, beta, C, opt); return zgemm_tp; } diff --git a/src/zgeqrf.jdf b/src/zgeqrf.jdf index e6af9469..0874b87c 100644 --- a/src/zgeqrf.jdf +++ b/src/zgeqrf.jdf @@ -478,8 +478,8 @@ BODY [type=CUDA device=%{ return n; %} V.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} T.size=%{ return descT->mb*descT->nb*parsec_datadist_getsizeoftype(descT->mtype);%} A1.dc=ddescA A2.dc=ddescA V.dc=ddescA T.dc=ddescT - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { dplasma_complex64_t *WORK, *WORKC; int tempmm = ((m)==(descA->mt-1)) ? (descA->m-(m*descA->mb)) : descA->mb; diff --git a/src/zgetrf_nopiv.jdf b/src/zgetrf_nopiv.jdf index c457333d..04113181 100644 --- a/src/zgetrf_nopiv.jdf +++ b/src/zgetrf_nopiv.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -10,12 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" /* Define the different shapes this JDF is using */ #define DEFAULT 0 @@ -200,8 +200,8 @@ BODY [type=CUDA B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); @@ -219,7 +219,7 @@ BODY [type=CUDA cublasStatus_t status; - printlogcuda("CUDA_zgemm(%d, %d, %d)\n" + printloggpu("zgemm(%d, %d, %d)\n" "\t(dplasmaNoTrans, dplasmaNoTrans, tempmm, tempnn, descA->mb, -1, A(%d,%d)[%p], ldam %d, A(%d,%d)[%p], ldak %d, 1.000000, A(%d,%d)[%p], ldam %d)\n", k, n, m, m, k, A, ldam_A, k, n, B, ldak_B, m, n, C, ldam_C); diff --git a/src/zpoinv_L.jdf b/src/zpoinv_L.jdf index 309e4b86..4dc3e79c 100644 --- a/src/zpoinv_L.jdf +++ b/src/zpoinv_L.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -10,13 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" %} descA [type = "parsec_tiled_matrix_t*"] diff --git a/src/zpoinv_U.jdf b/src/zpoinv_U.jdf index 3db2a761..bdfc7459 100644 --- a/src/zpoinv_U.jdf +++ b/src/zpoinv_U.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -10,13 +10,12 @@ extern "C" %{ * @precisions normal z -> s d c * */ -#include "dplasmajdf.h" -#include "parsec/data_dist/matrix/matrix.h" - +#include "dplasma/config.h" #if defined(DPLASMA_HAVE_CUDA) #include #endif /* defined(DPLASMA_HAVE_CUDA) */ - +#include "dplasmajdf.h" +#include "parsec/data_dist/matrix/matrix.h" %} descA [type = "parsec_tiled_matrix_t*"] diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 4e4fbedf..5a2e335c 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -1,22 +1,19 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. + * * $COPYRIGHT * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#include "potrf_cublas_utils.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ - #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" +#include "potrf_gpu_workspaces.h" #if defined(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT) #include "parsec/data_dist/matrix/subtile.h" @@ -84,8 +81,11 @@ INFO [type = "int*"] PRI_CHANGE [type = "int" hidden = on default = 0 ] PRI_MAX [type = "int" hidden = on default = "(descA->mt * ( 6 + descA->mt * ( 6 + descA->mt )))" ] smallnb [type = "int" hidden = on default = "descA->mb" ] -CuHandlesID [type = "int" hidden = on default = -1 ] -POWorkspaceID [type = "int" hidden = on default = -1 ] + +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] +hip_workspaces_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -122,7 +122,7 @@ BODY [type=CUDA] cusolverStatus_t status; cublasFillMode_t cublas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; cuDoubleComplex *workspace; int *d_iinfo; dplasma_cuda_handles_t *handles; @@ -132,9 +132,9 @@ BODY [type=CUDA] if( PlasmaUpper == uplo ) cublas_uplo = CUBLAS_FILL_MODE_UPPER; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; @@ -145,6 +145,34 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ + int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; + int ldak = LDA(descA, T); + + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_gpu_workspaces_t *wp; + int *d_iinfo; + dplasma_hip_handles_t *handles; + + if( PlasmaLower == uplo ) + rocblas_uplo = rocblas_fill_lower; + if( PlasmaUpper == uplo ) + rocblas_uplo = rocblas_fill_upper; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkm, T, ldak, d_iinfo); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempkm = k == descA->mt-1 ? descA->m - k*descA->mb : descA->mb; @@ -230,7 +258,7 @@ BODY [type=CUDA] double zone = 1.; #endif cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZtrsm_v2(handles->cublas_handle, @@ -242,6 +270,29 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ + int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + int ldak = LDA(ddescA, T); + int ldam = LDA(ddescA, C); + dplasma_hip_handles_t *handles; +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + hipblasStatus_t status; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm(handles->hipblas_handle, + HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, + tempmm, descA->nb, + &zone, T, ldak, C, ldam); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; @@ -336,7 +387,7 @@ BODY [type=CUDA] double mzone = -1.; cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZherk_v2( handles->cublas_handle, @@ -348,6 +399,27 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ + int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; + int ldam_A = LDA(ddescA, A); + int ldam_T = LDA(ddescA, T); + dplasma_hip_handles_t *handles; + double zone = 1.; + double mzone = -1.; + hipblasStatus_t status; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk( handles->hipblas_handle, + HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, + tempmm, descA->mb, + &mzone, A, ldam_A, + &zone, T, ldam_T); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempmm = m == descA->mt-1 ? descA->m - m*descA->mb : descA->mb; @@ -390,8 +462,7 @@ BODY tempmm, descA->mb, (double)-1.0, A /*A(m, k)*/, ldam_A, (double) 1.0, T /*A(m, m)*/, ldam_T); - printlog( - "CORE_zherk( %d, %d )\n\t( %s, %s, %d, %d, %f, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", + printlog("CORE_zherk( %d, %d )\n\t( %s, %s, %d, %d, %f, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", k, m, &dplasma_lapack_const( dplasmaLower ), &dplasma_lapack_const( dplasmaNoTrans ), tempmm, descA->mb, @@ -434,8 +505,8 @@ BODY [type=CUDA B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); @@ -448,15 +519,13 @@ BODY [type=CUDA int ldam_A = LDA(ddescA, A); int ldan_B = LDA(ddescA, B); int ldam_C = LDA(ddescA, C); - - cublasStatus_t status; - dplasma_cuda_handles_t *handles; - assert( ldam_A <= descA->mb ); assert( ldan_B <= descA->mb ); assert( ldam_C <= descA->mb ); - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + cublasStatus_t status; + dplasma_cuda_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); @@ -470,6 +539,43 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A.dc=ddescA B.dc=ddescA C.dc=ddescA + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; + int ldam_A = LDA(ddescA, A); + int ldan_B = LDA(ddescA, B); + int ldam_C = LDA(ddescA, C); + assert( ldam_A <= descA->mb ); + assert( ldan_B <= descA->mb ); + assert( ldam_C <= descA->mb ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + HIPBLAS_OP_N, HIPBLAS_OP_C, + tempmm, descA->mb, descA->mb, + &mzone, A, ldam_A, + B, ldan_B, + &zone, C, ldam_C ); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempmm = m == descA->mt-1 ? descA->m - m * descA->mb : descA->mb; diff --git a/src/zpotrf_U.jdf b/src/zpotrf_U.jdf index 3f69415e..67846484 100644 --- a/src/zpotrf_U.jdf +++ b/src/zpotrf_U.jdf @@ -1,19 +1,17 @@ extern "C" %{ /* - * Copyright (c) 2010-2023 The University of Tennessee and The University + * Copyright (c) 2010-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. * + * $COPYRIGHT + * * @precisions normal z -> s d c * */ #include "dplasma/config.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#include "potrf_cublas_utils.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ - +#include "potrf_gpu_workspaces.h" #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" @@ -84,8 +82,10 @@ PRI_CHANGE [type = "int" hidden = on default = 0 ] PRI_MAX [type = "int" hidden = on default = "(descA->mt * ( 6 + descA->mt * ( 6 + descA->mt )))" ] smallnb [type = "int" hidden = on default = "descA->mb" ] -CuHandlesID [type = "int" hidden = on default = -1 ] -POWorkspaceID [type = "int" hidden = on default = -1 ] +cuda_handles_infokey [type = "int" hidden = on default = -1 ] +cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] +hip_handles_infokey [type = "int" hidden = on default = -1 ] +hip_workspaces_infokey [type = "int" hidden = on default = -1 ] /************************************************** * potrf_zpotrf * @@ -121,7 +121,7 @@ BODY [type=CUDA] cusolverStatus_t status; cublasFillMode_t cublas_uplo; - dplasma_potrf_workspace_t *wp; + dplasma_potrf_gpu_workspaces_t *wp; cuDoubleComplex *workspace; int *d_iinfo; dplasma_cuda_handles_t *handles; @@ -131,9 +131,9 @@ BODY [type=CUDA] if( PlasmaUpper == uplo ) cublas_uplo = CUBLAS_FILL_MODE_UPPER; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); - wp = parsec_info_get(&gpu_device->super.infos, POWorkspaceID); + wp = parsec_info_get(&gpu_device->super.infos, cuda_workspaces_infokey); assert(NULL != wp); workspace = (cuDoubleComplex*)wp->tmpmem; @@ -144,6 +144,33 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ + int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; + int ldak = LDA(ddescA, T); + + rocblas_status status; + rocblas_fill rocblas_uplo; + dplasma_potrf_gpu_workspaces_t *wp; + int *d_iinfo; + dplasma_hip_handles_t *handles; + + if( PlasmaLower == uplo ) + rocblas_uplo = rocblas_fill_lower; + if( PlasmaUpper == uplo ) + rocblas_uplo = rocblas_fill_upper; + + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + wp = parsec_info_get(&gpu_device->super.infos, hip_workspaces_infokey); + assert(NULL != wp); + d_iinfo = (int*)wp->tmpmem; + + status = rocsolver_zpotrf( handles->hipblas_handle, rocblas_uplo, tempkn, T, ldak, d_iinfo); + DPLASMA_ROCBLAS_CHECK_ERROR("rocsolver_zpotrf", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempkn = k == descA->nt-1 ? descA->n - k*descA->nb : descA->nb; @@ -232,7 +259,7 @@ BODY [type=CUDA] #endif cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZtrsm_v2(handles->cublas_handle, @@ -244,6 +271,30 @@ BODY [type=CUDA] } END +BODY [type=HIP] +{ + int tempnn = n == descA->nt - 1 ? descA->n - n * descA->nb : descA->nb; + int ldak_T = LDA(ddescA, T); + int ldak_C = LDA(ddescA, C); +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; +#else + double zone = 1.; +#endif + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZtrsm(handles->hipblas_handle, + HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, + HIPBLAS_OP_C, HIPBLAS_DIAG_NON_UNIT, + descA->mb, tempnn, + &zone, T, ldak_T, C, ldak_C); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZtrsm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; @@ -339,7 +390,7 @@ BODY [type=CUDA] double mzone = -1.; cublasStatus_t status; - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); status = cublasZherk_v2( handles->cublas_handle, @@ -347,11 +398,31 @@ BODY [type=CUDA] tempnn, descA->mb, &mzone, A, ldak, &zone, T, ldan); - PARSEC_CUDA_CHECK_ERROR( "cublasZherk_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); } END +BODY [type=HIP] +{ + int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; + int ldak = LDA(ddescA, A ); + int ldan = LDA(ddescA, T ); + double zone = 1.; + double mzone = -1.; + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZherk(handles->hipblas_handle, + HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, + tempnn, descA->mb, + &mzone, A, ldak, + &zone, T, ldan); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZherk", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? descA->n - n*descA->nb : descA->nb; @@ -438,8 +509,8 @@ BODY [type=CUDA B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} A.dc=ddescA B.dc=ddescA C.dc=ddescA - stage_in=stage_in_lapack - stage_out=stage_out_lapack] + stage_in=dplasma_cuda_lapack_stage_in + stage_out=dplasma_cuda_lapack_stage_out] { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zone = make_cuDoubleComplex( 1., 0.); @@ -461,7 +532,7 @@ BODY [type=CUDA assert( ldak_B <= descA->mb ); assert( ldam_C <= descA->mb ); - handles = parsec_info_get(&gpu_stream->infos, CuHandlesID); + handles = parsec_info_get(&gpu_stream->infos, cuda_handles_infokey); assert(NULL != handles); cublasSetStream( handles->cublas_handle, parsec_body.stream ); @@ -473,7 +544,7 @@ BODY [type=CUDA &zone, (cuDoubleComplex*)C, ldam_C); PARSEC_CUDA_CHECK_ERROR( "cublasZgemm_v2 ", status, {return PARSEC_HOOK_RETURN_ERROR;} ); - printlogcuda("CUDA_zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", + printloggpu("zgemm( %d, %d, %d )\n\t( %s, %s, %d, %d, %d, %f, A(%d,%d)[%p], %d, A(%d,%d)[%p], %d, %f, A(%d,%d)[%p], %d)\n", m, n, k, &dplasma_lapack_const( dplasmaConjTrans ), &dplasma_lapack_const( dplasmaNoTrans ), descA->mb, tempnn, descA->nb, @@ -483,6 +554,44 @@ BODY [type=CUDA } END +BODY [type=HIP + A.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + B.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + C.size=%{ return descA->mb*descA->nb*parsec_datadist_getsizeoftype(descA->mtype);%} + A.dc=ddescA B.dc=ddescA C.dc=ddescA + stage_in=dplasma_hip_lapack_stage_in + stage_out=dplasma_hip_lapack_stage_out] +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + hipblasDoubleComplex zone = { 1., 0. }; + hipblasDoubleComplex mzone = { -1., 0. }; +#else + double zone = 1.; + double mzone = -1.; +#endif + + int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; + int ldak_A = LDA(ddescA, A); + int ldak_B = LDA(ddescA, B); + int ldam_C = LDA(ddescA, C); + assert( ldak_A <= descA->mb ); + assert( ldak_B <= descA->mb ); + assert( ldam_C <= descA->mb ); + + hipblasStatus_t status; + dplasma_hip_handles_t *handles; + handles = parsec_info_get(&gpu_stream->infos, hip_handles_infokey); + assert(NULL != handles); + status = hipblasZgemm( handles->hipblas_handle, + HIPBLAS_OP_C, HIPBLAS_OP_N, + descA->mb, tempnn, descA->nb, + &mzone, A, ldak_A, + B, ldak_B, + &zone, C, ldam_C); + DPLASMA_HIPBLAS_CHECK_ERROR("hipblasZgemm", status, {return PARSEC_HOOK_RETURN_ERROR;}); +} +END + BODY [type=RECURSIVE] { int tempnn = n == descA->nt-1 ? descA->n - n * descA->nb : descA->nb; diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 8699ee0d..381f3a6f 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -12,12 +12,9 @@ #include "dplasma.h" #include "dplasma/types.h" #include "dplasma/types_lapack.h" -#if defined(DPLASMA_HAVE_CUDA) -#include -#include "potrf_cublas_utils.h" -#include "parsec/utils/zone_malloc.h" -#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmaaux.h" +#include "potrf_gpu_workspaces.h" +#include "parsec/utils/zone_malloc.h" #include "zpotrf_U.h" #include "zpotrf_L.h" @@ -59,14 +56,16 @@ dplasma_zpotrf_setrecursive( parsec_taskpool_t *tp, int hmb ) } #if defined(DPLASMA_HAVE_CUDA) -void *zpotrf_create_workspace(void *obj, void *user) +#include + +static void *zpotrf_create_cuda_workspace(void *obj, void *user) { parsec_device_module_t *mod = (parsec_device_module_t *)obj; - zone_malloc_t *memory = ((parsec_device_cuda_module_t*)mod)->super.memory; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; cusolverDnHandle_t cusolverDnHandle; cusolverStatus_t status; parsec_zpotrf_U_taskpool_t *tp = (parsec_zpotrf_U_taskpool_t*)user; - dplasma_potrf_workspace_t *wp = NULL; + dplasma_potrf_gpu_workspaces_t *wp = NULL; int workspace_size; int mb = tp->_g_descA->mb; int nb = tp->_g_descA->nb; @@ -88,7 +87,7 @@ void *zpotrf_create_workspace(void *obj, void *user) cusolverDnDestroy(cusolverDnHandle); - wp = (dplasma_potrf_workspace_t*)malloc(sizeof(dplasma_potrf_workspace_t)); + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); wp->tmpmem = zone_malloc(memory, workspace_size * elt_size + sizeof(int)); assert(NULL != wp->tmpmem); wp->lwork = workspace_size; @@ -97,9 +96,35 @@ void *zpotrf_create_workspace(void *obj, void *user) return wp; } -static void destroy_workspace(void *_ws, void *_n) +static void zpotrf_destroy_cuda_workspace(void *_ws, void *_n) { - dplasma_potrf_workspace_t *ws = (dplasma_potrf_workspace_t*)_ws; + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; + zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); + free(ws); + (void)_n; +} +#endif + +#if defined(DPLASMA_HAVE_HIP) +static void *zpotrf_create_hip_workspace(void *obj, void *user) +{ + parsec_device_module_t *mod = (parsec_device_module_t *)obj; + zone_malloc_t *memory = ((parsec_device_gpu_module_t*)mod)->memory; + dplasma_potrf_gpu_workspaces_t *wp = NULL; + (void)user; + + wp = (dplasma_potrf_gpu_workspaces_t*)malloc(sizeof(dplasma_potrf_gpu_workspaces_t)); + wp->tmpmem = zone_malloc(memory, sizeof(int)); + assert(NULL != wp->tmpmem); + wp->lwork = 0; + wp->memory = memory; + + return wp; +} + +static void zpotrf_destroy_hip_workspace(void *_ws, void *_n) +{ + dplasma_potrf_gpu_workspaces_t *ws = (dplasma_potrf_gpu_workspaces_t*)_ws; zone_free((zone_malloc_t*)ws->memory, ws->tmpmem); free(ws); (void)_n; @@ -178,8 +203,10 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, int *info ) { parsec_zpotrf_L_taskpool_t *parsec_zpotrf = NULL; +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) char workspace_info_name[64]; static int uid = 0; +#endif parsec_taskpool_t *tp = NULL; dplasma_data_collection_t * ddc_A = dplasma_wrap_data_collection(A); @@ -204,16 +231,28 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, #if defined(DPLASMA_HAVE_CUDA) /* It doesn't cost anything to define these infos if we have CUDA but * don't have GPUs on the current machine, so we do it non-conditionally */ - parsec_zpotrf->_g_CuHandlesID = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); + parsec_zpotrf->_g_cuda_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); snprintf(workspace_info_name, 64, "DPLASMA::ZPOTRF(%d)::WS", uid++); - parsec_zpotrf->_g_POWorkspaceID = parsec_info_register(&parsec_per_device_infos, workspace_info_name, - destroy_workspace, NULL, - zpotrf_create_workspace, parsec_zpotrf, + parsec_zpotrf->_g_cuda_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, + zpotrf_destroy_cuda_workspace, NULL, + zpotrf_create_cuda_workspace, parsec_zpotrf, NULL); #else - parsec_zpotrf->_g_CuHandlesID = PARSEC_INFO_ID_UNDEFINED; - parsec_zpotrf->_g_POWorkspaceID = PARSEC_INFO_ID_UNDEFINED; - (void)uid; (void)workspace_info_name; + parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; +#endif +#if defined(DPLASMA_HAVE_HIP) + /* It doesn't cost anything to define these infos if we have HIP but + * don't have GPUs on the current machine, so we do it non-conditionally */ + parsec_zpotrf->_g_hip_handles_infokey = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + snprintf(workspace_info_name, 64, "DPLASMA::ZPOTRF(%d)::WS", uid++); + parsec_zpotrf->_g_hip_workspaces_infokey = parsec_info_register(&parsec_per_device_infos, workspace_info_name, + zpotrf_destroy_hip_workspace, NULL, + zpotrf_create_hip_workspace, parsec_zpotrf, + NULL); +#else + parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; + parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; #endif int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, @@ -253,7 +292,10 @@ dplasma_zpotrf_Destruct( parsec_taskpool_t *tp ) dplasma_data_collection_t * ddc_A = parsec_zpotrf->_g_ddescA; #if defined(DPLASMA_HAVE_CUDA) - parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_POWorkspaceID, NULL); + parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_cuda_workspaces_infokey, NULL); +#endif +#if defined(DPLASMA_HAVE_HIP) + parsec_info_unregister(&parsec_per_device_infos, parsec_zpotrf->_g_hip_workspaces_infokey, NULL); #endif parsec_taskpool_free(tp); diff --git a/src/ztrsm_LLN.jdf b/src/ztrsm_LLN.jdf index b814593a..40661a3b 100644 --- a/src/ztrsm_LLN.jdf +++ b/src/ztrsm_LLN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LLT.jdf b/src/ztrsm_LLT.jdf index 2c0d708e..fec3febc 100644 --- a/src/ztrsm_LLT.jdf +++ b/src/ztrsm_LLT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LUN.jdf b/src/ztrsm_LUN.jdf index 838cc835..e71a4c0b 100644 --- a/src/ztrsm_LUN.jdf +++ b/src/ztrsm_LUN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_LUT.jdf b/src/ztrsm_LUT.jdf index d8eb0950..a009a7f2 100644 --- a/src/ztrsm_LUT.jdf +++ b/src/ztrsm_LUT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RLN.jdf b/src/ztrsm_RLN.jdf index 2cd6cc5e..759e95ad 100644 --- a/src/ztrsm_RLN.jdf +++ b/src/ztrsm_RLN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RLT.jdf b/src/ztrsm_RLT.jdf index bf799e47..12657a6a 100644 --- a/src/ztrsm_RLT.jdf +++ b/src/ztrsm_RLT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RUN.jdf b/src/ztrsm_RUN.jdf index 9d2a59f6..e864bc9c 100644 --- a/src/ztrsm_RUN.jdf +++ b/src/ztrsm_RUN.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/src/ztrsm_RUT.jdf b/src/ztrsm_RUT.jdf index 6e4385f3..7b8c5e54 100644 --- a/src/ztrsm_RUT.jdf +++ b/src/ztrsm_RUT.jdf @@ -1,6 +1,6 @@ extern "C" %{ /* - * Copyright (c) 2010-2020 + * Copyright (c) 2010-2024 * * The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -9,6 +9,10 @@ extern "C" %{ * @precisions normal z -> s d c * */ +#include "dplasma/config.h" +#if defined(DPLASMA_HAVE_CUDA) +#include +#endif /* defined(DPLASMA_HAVE_CUDA) */ #include "dplasmajdf.h" #include "parsec/data_dist/matrix/matrix.h" diff --git a/tests/Testings.cmake b/tests/Testings.cmake index 6b5ee22a..666fd7f7 100644 --- a/tests/Testings.cmake +++ b/tests/Testings.cmake @@ -147,14 +147,24 @@ foreach(prec ${DPLASMA_PRECISIONS} ) # GPU tests if (DPLASMA_HAVE_CUDA) - dplasma_add_test(potrf potrf 1gpu_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 1gpu_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 21) - dplasma_add_test(potrf potrf 1gpu_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 2gpu_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 1gpu_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 1gpu_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_cuda_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 2gpu_cuda_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_cuda_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_cuda_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_cuda_memory_number_of_blocks 4096) endif (DPLASMA_HAVE_CUDA) + if (DPLASMA_HAVE_HIP) + dplasma_add_test(potrf potrf 1gpu_hip_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_hip_lowmem_shm -N 3200 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_hip_~knb_shm -N 1700 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 2gpu_hip_shm -N 4600 -t 320 ${OPTIONS} -g 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_hip_shm -N 1280 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 1gpu_hip_~knb_shm -N 1000 -t 320 ${OPTIONS} -g 1 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_shm -N 1940 -t 320 ${OPTIONS} -g 2 -- --mca device_hip_memory_number_of_blocks 4096) + endif (DPLASMA_HAVE_HIP) + # if ( ${prec} STREQUAL "c" OR ${prec} STREQUAL "z" ) # dplasma_add_test(heev "" ${PTG2DTD}_shm -N 4000 ${OPTIONS}) @@ -254,12 +264,19 @@ if( MPI_C_FOUND ) # GPU Cholesky tests if (DPLASMA_HAVE_CUDA AND MPI_C_FOUND) - dplasma_add_test(potrf potrf 1gpu_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf 1gpu_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(potrf potrf_1gpu 2gpu_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) - dplasma_add_test(gemm gemm 2gpu_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 21) + dplasma_add_test(potrf potrf 1gpu_cuda_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_cuda_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf_1gpu 2gpu_cuda_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_cuda_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_cuda_memory_number_of_blocks 21) endif (DPLASMA_HAVE_CUDA AND MPI_C_FOUND) + if (DPLASMA_HAVE_HIP AND MPI_C_FOUND) + dplasma_add_test(potrf potrf 1gpu_hip_mpi:${PROCS} -N 3200 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf 1gpu_hip_~knb_mpi:${PROCS} -N 1700 -t 320 ${OPTIONS} -g 1 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(potrf potrf_1gpu 2gpu_hip_mpi:${PROCS} -N 4600 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 4096) + dplasma_add_test(gemm gemm 2gpu_hip_lowmem_mpi:${PROCS} -N 1940 -t 320 ${OPTIONS} -g 2 -P 2 -- --mca device_hip_memory_number_of_blocks 21) + endif (DPLASMA_HAVE_HIP AND MPI_C_FOUND) # dplasma_add_test(potrf_pbq "" mpi:${PROCS} -N 4000 ${OPTIONS} -o PBQ) # dplasma_add_test(geqrf_pbq "" mpi:${PROCS} -N 4000 ${OPTIONS} -o PBQ) diff --git a/tests/common.c b/tests/common.c index 4e390732..feef5a29 100644 --- a/tests/common.c +++ b/tests/common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2021 The University of Tennessee and The University + * Copyright (c) 2009-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -26,11 +26,14 @@ #ifdef PARSEC_HAVE_MPI #include #endif +#include "dplasmaaux.h" #if defined(DPLASMA_HAVE_CUDA) #include -#include "dplasmaaux.h" #include #endif +#if defined(DPLASMA_HAVE_HIP) +#include +#endif char *PARSEC_SCHED_NAME[] = { "", /* default */ @@ -315,7 +318,7 @@ static void read_arguments(int *_argc, char*** _argv, int* iparam) break; case 'g': -#if !defined(DPLASMA_HAVE_CUDA) +#if !defined(DPLASMA_HAVE_CUDA) && !defined(DPLASMA_HAVE_HIP) iparam[IPARAM_NGPUS] = DPLASMA_ERR_NOT_SUPPORTED; /* force an error message */ #endif if(iparam[IPARAM_NGPUS] == DPLASMA_ERR_NOT_SUPPORTED) { @@ -446,6 +449,7 @@ static void parse_arguments(int *iparam) { if(iparam[IPARAM_NGPUS] != DPLASMA_ERR_NOT_INITIALIZED) { rc = asprintf(&value, "%d", iparam[IPARAM_NGPUS]); (void)rc; parsec_setenv_mca_param( "device_cuda_enabled", value, &environ ); + parsec_setenv_mca_param( "device_hip_enabled", value, &environ ); free(value); } @@ -694,26 +698,41 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) if(iparam[IPARAM_VERBOSE] >= 4) { parsec_setenv_mca_param( "device_show_capabilities", "1", &environ ); } -#if defined(DPLASMA_HAVE_CUDA) - int dev, nbgpu = 0; + +#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) + int dev, nb_cuda_gpu = 0, nb_hip_gpu = 0; for(dev = 0; dev < (int)parsec_nb_devices; dev++) { parsec_device_module_t *device = parsec_mca_device_get(dev); if( PARSEC_DEV_CUDA == device->type ) { - nbgpu++; + nb_cuda_gpu++; + } + else if( PARSEC_DEV_HIP == device->type ) { + nb_hip_gpu++; } } - if( nbgpu > 0 ) { - CuHI = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", - dplasma_destroy_cuda_handles, NULL, - dplasma_create_cuda_handles, NULL, - NULL); - assert(-1 != CuHI); - } - iparam[IPARAM_NGPUS] = nbgpu; + iparam[IPARAM_NGPUS] = nb_cuda_gpu + nb_hip_gpu; if(iparam[IPARAM_NGPUS] > 0 && iparam[IPARAM_VERBOSE] >= 3) { parsec_setenv_mca_param( "device_show_statistics", "1", &environ ); } +#if defined(DPLASMA_HAVE_CUDA) + if( nb_cuda_gpu > 0 ) { + dplasma_dtd_cuda_infoid = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", + dplasma_destroy_cuda_handles, NULL, + dplasma_create_cuda_handles, NULL, + NULL); + assert(-1 != dplasma_dtd_cuda_infoid); + } #endif +#if defined(DPLASMA_HAVE_HIP) + if( nb_hip_gpu > 0 ) { + dplasma_dtd_hip_infoid = parsec_info_register(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", + dplasma_destroy_hip_handles, NULL, + dplasma_create_hip_handles, NULL, + NULL); + assert(-1 != dplasma_dtd_hip_infoid); + } +#endif +#endif /* defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP) */ print_arguments(iparam); if(verbose > 2) TIME_PRINT(iparam[IPARAM_RANK], ("PaRSEC initialized\n")); @@ -723,8 +742,14 @@ parsec_context_t* setup_parsec(int argc, char **argv, int *iparam) void cleanup_parsec(parsec_context_t* parsec, int *iparam) { #if defined(DPLASMA_HAVE_CUDA) - parsec_info_id_t CuHI = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); - parsec_info_unregister(&parsec_per_stream_infos, CuHI, NULL); + { parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::CUDA::HANDLES", NULL); + parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); + } +#endif +#if defined(DPLASMA_HAVE_HIP) + { parsec_info_id_t iid = parsec_info_lookup(&parsec_per_stream_infos, "DPLASMA::HIP::HANDLES", NULL); + parsec_info_unregister(&parsec_per_stream_infos, iid, NULL); + } #endif if(NULL != dev_stats) parsec_devices_free_statistics(&dev_stats); diff --git a/tests/testing_zgemm_dtd.c b/tests/testing_zgemm_dtd.c index 6161bba4..a9e5a602 100644 --- a/tests/testing_zgemm_dtd.c +++ b/tests/testing_zgemm_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2023 The University of Tennessee and The University + * Copyright (c) 2015-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * diff --git a/tests/testing_zpotrf_dtd.c b/tests/testing_zpotrf_dtd.c index 0ecaee5e..2b6e1841 100644 --- a/tests/testing_zpotrf_dtd.c +++ b/tests/testing_zpotrf_dtd.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2023 The University of Tennessee and The University + * Copyright (c) 2013-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * @@ -118,7 +118,7 @@ int main(int argc, char **argv) infos->mb = dcA.super.mb; infos->nb = dcA.super.nb; infos->uplo = uplo; - WoSI = parsec_info_register( &parsec_per_stream_infos, "DPLASMA::ZPOTRF::WS", + dplasma_dtd_cuda_workspace_infoid = parsec_info_register( &parsec_per_stream_infos, "DPLASMA::ZPOTRF::WS", zpotrf_dtd_destroy_workspace, NULL, zpotrf_dtd_create_workspace, infos, NULL); @@ -219,7 +219,7 @@ int main(int argc, char **argv) infos->mb = dcA.super.mb; infos->nb = dcA.super.nb; infos->uplo = uplo; - WoSI = parsec_info_register( &parsec_per_device_infos, "DPLASMA::ZPOTRF::WS", + dplasma_dtd_cuda_workspace_infoid = parsec_info_register( &parsec_per_device_infos, "DPLASMA::ZPOTRF::WS", zpotrf_dtd_destroy_workspace, NULL, zpotrf_dtd_create_workspace, infos, NULL); @@ -322,7 +322,7 @@ int main(int argc, char **argv) /* Cleaning up the parsec handle */ parsec_taskpool_free( dtd_tp ); #if defined(DPLASMA_HAVE_CUDA) - parsec_info_unregister(&parsec_per_device_infos, WoSI, NULL); + parsec_info_unregister(&parsec_per_device_infos, dplasma_dtd_cuda_workspace_infoid, NULL); free(infos); #endif diff --git a/tools/PrecisionGenerator/subs.py b/tools/PrecisionGenerator/subs.py index 0d81f61a..e16a64cb 100644 --- a/tools/PrecisionGenerator/subs.py +++ b/tools/PrecisionGenerator/subs.py @@ -69,7 +69,7 @@ ('#undef COMPLEX', '#undef COMPLEX', '#undef REAL', '#undef REAL' ), ('#define SINGLE', '#define DOUBLE', '#define SINGLE', '#define DOUBLE' ), ('#undef DOUBLE', '#undef SINGLE', '#undef DOUBLE', '#undef SINGLE' ), - ('float', 'double', 'dplasma_complex32_t', 'dplasma_complex64_t' ), + ('float', 'double', 'dplasma_complex32_t', 'dplasma_complex64_t'), ('PARSEC_MATRIX_FLOAT', 'PARSEC_MATRIX_DOUBLE', 'PARSEC_MATRIX_COMPLEX_FLOAT', 'PARSEC_MATRIX_COMPLEX_DOUBLE'), ('dplasma_float', 'dplasma_double', 'dplasma_complex32', 'dplasma_complex64' ), ## for doxygen categories ('dplasma_cores_float', 'dplasma_cores_double','dplasma_cores_complex32', 'dplasma_cores_complex64'), ## for doxygen categories @@ -81,11 +81,15 @@ ('smatrix', 'dmatrix', 'cmatrix', 'zmatrix' ), ('stwoDBC', 'dtwoDBC', 'ctwoDBC', 'ztwoDBC' ), ('float', 'double', 'cuFloatComplex', 'cuDoubleComplex' ), + ('float', 'double', 'hipComplex', 'hipDoubleComplex' ), + ('float', 'double', 'hipblasComplex', 'hipblasDoubleComplex'), ## both needed for make_hipComplex() ('float', 'double', 'cuCdivf', 'cuCdiv' ), + ('float', 'double', 'hipCdivf', 'hipCdiv' ), ('', '', 'crealf', 'creal' ), ('', '', 'cimagf', 'cimag' ), ('', '', 'conjf', 'conj' ), ('', '', 'cuCfmaf', 'cuCfma' ), + ('', '', 'hipCfmaf', 'hipCfma' ), ('cblas_snrm2','cblas_dnrm2','cblas_scnrm2','cblas_dznrm2'), ('cblas_sasum','cblas_dasum','cblas_scasum','cblas_dzasum'), @@ -279,6 +283,10 @@ ('cuda_s', 'cuda_d', 'cuda_c', 'cuda_z' ), ('cublasS', 'cublasD', 'cublasS', 'cublasD' ), ('cublasS', 'cublasD', 'cublasC', 'cublasZ' ), + ('hip_s', 'hip_d', 'hip_s', 'hip_d' ), + ('hip_s', 'hip_d', 'hip_c', 'hip_z' ), + ('hipblasS', 'hipblasD', 'hipblasS', 'hipblasD' ), + ('hipblasS', 'hipblasD', 'hipblasC', 'hipblasZ' ), ('example_s', 'example_d', 'example_c', 'example_z' ), ('FLOPS_SSY', 'FLOPS_DSY', 'FLOPS_CHE', 'FLOPS_ZHE' ), ('FLOPS_S', 'FLOPS_D', 'FLOPS_C', 'FLOPS_Z' ),