From e0a77ea8a8100d657631e81542799b9883267e44 Mon Sep 17 00:00:00 2001
From: Qinglei Cao
Date: Fri, 27 Sep 2024 14:01:58 +0000
Subject: [PATCH] Add device advice support in DPLASMA

---
 src/dplasmaaux.c       | 108 +++++++++++++++++++++++++++++++++++++++++
 src/dplasmaaux.h       |  35 +++++++++++++
 src/zpotrf_L.jdf       |  57 ++++------------------
 src/zpotrf_wrapper.c   |  54 ---------------------
 tests/testing_zgemm.c  |  21 ++++++++
 tests/testing_zpotrf.c |   9 +++-
 6 files changed, 181 insertions(+), 103 deletions(-)

diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c
index 86a6b189..3793f9a2 100644
--- a/src/dplasmaaux.c
+++ b/src/dplasmaaux.c
@@ -14,6 +14,8 @@
 #include
 #include "dplasmaaux.h"
 #include "parsec/utils/show_help.h"
+#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
+#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
 #if defined(PARSEC_HAVE_MPI)
 /*
@@ -110,3 +112,109 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
     }
 }
 
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+
+/* Find all CUDA/HIP devices */
+void dplasma_find_nb_devices(int **dev_index, int *nb) {
+    *nb = 0;
+    for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
+            (*nb)++;
+        }
+    }
+#if defined(DPLASMA_DEBUG)
+    if((*nb) == 0) {
+        char hostname[256];
+        gethostname(hostname, 256);
+        fprintf(stderr, "No CUDA/HIP device found on rank %d on %s\n",
+                parsec->my_rank, hostname);
+    }
+#endif
+    *dev_index = (int *)malloc((*nb) * sizeof(int));
+    *nb = 0;
+    for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
+            (*dev_index)[(*nb)++] = device->device_index;
+        }
+    }
+}
+
+/* Compute the most suitable (closest to square) process/GPU grid */
+int dplasma_grid_calculation( int nb_process ) {
+    int P;
+    for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
+        if( 0 == nb_process % P ) break;
+    }
+    return P;
+}
+
+/* 2D operator: advise the preferred device for tile (m, n) */
+int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
+                                         const parsec_tiled_matrix_t *A,
+                                         void *_A, parsec_matrix_uplo_t uplo,
+                                         int m, int n, void *op_args) {
+    dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;
+
+    if( args->nb_gpu_devices > 0 ) {
+        /* Nested 2D grid on GPU */
+        int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
+        parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
+                                     args->gpu_device_index[g],
+                                     PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+    }
+
+    (void)es; (void)uplo;
+    return 0;
+}
+
+/* Advise data placement on devices
+ *
+ * If op_args == NULL, a default dplasma_advise_data_on_device_t is built from the detected devices
+ */
+int dplasma_advise_data_on_device(parsec_context_t *parsec,
+                                  parsec_matrix_uplo_t uplo,
+                                  parsec_tiled_matrix_t *A,
+                                  parsec_tiled_matrix_unary_op_t operation,
+                                  void *op_args) {
+
+    if(NULL != op_args) {
+        parsec_apply(parsec, uplo, A, operation, op_args);
+    } else {
+        /* Find the number of devices */
+        dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
+        dplasma_find_nb_devices(&args->gpu_device_index, &args->nb_gpu_devices);
+
+        /* Calculate the nested grid for the multiple GPUs on one process, as square
+         * as possible (gpu_cols >= gpu_rows for Upper, gpu_rows >= gpu_cols otherwise) */
+        if(dplasmaUpper == uplo) {
+            args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
+            args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
+        } else {
+            args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
+            args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
+        }
+
+        if(dplasmaUpper == uplo || dplasmaLower == uplo) {
+            args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
+            args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
+        } else if(dplasmaUpperLower == uplo) {
+            args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
+            args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
+        } else {
+            dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
+        }
+
+#if defined(DPLASMA_DEBUG)
+        printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
+               args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
+#endif
+
+        parsec_apply(parsec, uplo, A, operation, (void *)args);
+    }
+
+    return 0;
+}
+
+#endif
diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h
index 28ae2039..681777dc 100644
--- a/src/dplasmaaux.h
+++ b/src/dplasmaaux.h
@@ -115,4 +115,39 @@ extern void *dplasma_pcomm;
 #if defined(DPLASMA_HAVE_HIP)
 #include "dplasmaaux_hip.h"
 #endif
+
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+/* Arguments for advising data placement on devices */
+typedef struct dplasma_advise_data_on_device_s {
+    int nb_gpu_devices;
+    int *gpu_device_index;
+    int gpu_rows;
+    int gpu_cols;
+    int grid_rows;
+    int grid_cols;
+} dplasma_advise_data_on_device_t;
+
+/* Find all CUDA/HIP devices */
+void dplasma_find_nb_devices(int **dev_index, int *nb);
+
+/* Compute the most suitable (closest to square) process/GPU grid */
+int dplasma_grid_calculation( int nb_process );
+
+/* 2D operator: advise the preferred device for tile (m, n) */
+int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
+                                         const parsec_tiled_matrix_t *descA,
+                                         void *_A, parsec_matrix_uplo_t uplo,
+                                         int m, int n, void *args);
+
+/* Advise data placement on devices
+ *
+ * If op_args == NULL, a default dplasma_advise_data_on_device_t is built from the detected devices
+ */
+int dplasma_advise_data_on_device( parsec_context_t *parsec,
+                                   parsec_matrix_uplo_t uplo,
+                                   parsec_tiled_matrix_t *A,
+                                   parsec_tiled_matrix_unary_op_t operation,
+                                   void *op_args );
+#endif
+
 #endif /* _DPLASMAAUX_H_INCLUDED */
diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf
index d564b211..5a2e335c 100644
--- a/src/zpotrf_L.jdf
+++ b/src/zpotrf_L.jdf
@@ -87,49 +87,6 @@ cuda_workspaces_infokey [type = "int" hidden = on default = -1 ]
 hip_handles_infokey    [type = "int" hidden = on default = -1 ]
 hip_workspaces_infokey [type = "int" hidden = on default = -1 ]
 
-nb_gpu_devices   [ type = "int" hidden = on default = 0 ]
-gpu_device_index [ type = "int *" hidden = on default = "NULL"]
-gpu_rows         [ type = "int" hidden = on default = 1]
-gpu_cols         [ type = "int" hidden = on default = 1]
-grid_rows        [ type = "int" hidden = on default = 1]
-grid_cols        [ type = "int" hidden = on default = 1]
-
-
-/**************************************************
- *                 potrf_bind_A                   *
- **************************************************/
-potrf_bind_A(m, n)
-
-// Execution space
-m = 0 .. descA->mt-1
-n = 0 .. m
-
-loc_A = %{ return LOC(descA, m, n); %}
-
-// Parallel partitioning
-:descA(m, n)
-
-READ A <- ddescA(m, n)   [ type      = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, TILED); %}
-                           type_data = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, LAPACK); %} ]
-       -> (m == 0 && n == 0) ? T potrf_zpotrf(0)
-       -> (n == 0)? C potrf_ztrsm(m, n)
-       -> (m == n && n > 0) ? T potrf_zherk(0, m)
-       -> (m != n && n > 0) ? C potrf_zgemm(m, n, 0)
-
-BODY
-{
-#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
-    if( nb_gpu_devices > 0 ) {
-        int g = (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols;
-        parsec_advise_data_on_device( _f_A->original,
-                                      gpu_device_index[g],
-                                      PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
-    }
-#endif
-}
-END
-
-
 /**************************************************
  *                 potrf_zpotrf                   *
  **************************************************/
@@ -149,7 +106,8 @@ loc_T = %{ return LOC(descA, k, k); %}
 
 // Parameters
 
-RW T <- (k == 0) ? A potrf_bind_A(k, k)  [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
+RW T <- (k == 0) ? ddescA(k, k)          [ type      = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
+                                           type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
     <- (k != 0) ? T potrf_zherk(k-1, k)  [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
     -> T potrf_ztrsm(k+1..descA->mt-1, k) /* dep OUT: rely on datacopy dtt for sending */
     -> ddescA(k, k)                      [ type = %{ return ADTT_CP(_f_T, ddescA, loc_T, DEFAULT); %}
@@ -277,7 +235,8 @@ loc_C = %{ return LOC(descA, m, k); %}
 
 // Parameters
 READ T <- T potrf_zpotrf(k)               [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-RW   C <- (k == 0) ? A potrf_bind_A(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
+RW   C <- (k == 0) ? ddescA(m, k)         [ type      = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
        <- (k != 0) ? C potrf_zgemm(m, k, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
        -> A potrf_zherk(k, m) /* dep OUT: rely on datacopy dtt for sending */
        -> A potrf_zgemm(m, k+1..m-1, k) /* dep OUT: rely on datacopy dtt for sending */
@@ -411,8 +370,9 @@ loc_T = %{ return LOC(descA, m, m); %}
 
 //Parameters
 READ A <- C potrf_ztrsm(m, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
-RW   T <- (k == 0) ? A potrf_bind_A(m, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-       <- (k != 0) ? T potrf_zherk(k-1, m) /* dep OUT: rely on datacopy dtt for sending */
+RW   T <- (k == 0) ? ddescA(m, m)         [ type      = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
+       <- (k != 0) ? T potrf_zherk(k-1, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
        -> (m == k+1) ? T potrf_zpotrf(m) : T potrf_zherk(k+1, m) /* dep OUT: rely on datacopy dtt for sending */
 
 ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX
@@ -533,7 +493,8 @@ loc_C = %{ return LOC(descA, m, n); %}
 
 // Parameters
 READ A <- C potrf_ztrsm(m, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
 READ B <- C potrf_ztrsm(n, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_B, DEFAULT, TILED); %} ]
-RW   C <- (k == 0) ? A potrf_bind_A(m, n) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
+RW   C <- (k == 0) ? ddescA(m, n)         [ type      = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
        <- (k != 0) ? C potrf_zgemm(m, n, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
        -> (n == k+1) ? C potrf_ztrsm(m, n) : C potrf_zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */
diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c
index 6ae0a5d4..381f3a6f 100644
--- a/src/zpotrf_wrapper.c
+++ b/src/zpotrf_wrapper.c
@@ -19,7 +19,6 @@
 #include "zpotrf_U.h"
 #include "zpotrf_L.h"
 #include "cores/dplasma_plasmatypes.h"
-#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
 
 #define MAX_SHAPES 1
 
@@ -130,44 +129,7 @@ static void zpotrf_destroy_hip_workspace(void *_ws, void *_n)
     free(ws);
     (void)_n;
 }
-
-#endif
-
-/* Find all devices */
-static void parsec_find_nb_devices(int **dev_index, int *nb) {
-    for(int i = 0; i < (int)parsec_nb_devices; i++) {
-        parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
-            (*nb)++;
-        }
-    }
-#if defined(DPLASMA_DEBUG)
-    if((*nb) == 0) {
-        char hostname[256];
-        gethostname(hostname, 256);
-        fprintf(stderr, "No CUDA device found on rank %d on %s\n",
-                parsec->my_rank, hostname);
-    }
 #endif
-    *dev_index = (int *)malloc((*nb) * sizeof(int));
-    *nb = 0;
-    for(int i = 0; i < (int)parsec_nb_devices; i++) {
-        parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
-            (*dev_index)[(*nb)++] = device->device_index;
-        }
-    }
-}
-
-/* Get the most suitable process/gpu grid */
-static int parsec_grid_calculation( int nb_process ) {
-    int P;
-    for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
-        if( 0 == nb_process % P ) break;
-    }
-    return P;
-}
-
 /**
 *******************************************************************************
 *
@@ -279,7 +241,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
     parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
     parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
-
 #if defined(DPLASMA_HAVE_HIP)
     /* It doesn't cost anything to define these infos if we have HIP but
      * don't have GPUs on the current machine, so we do it non-conditionally */
@@ -293,21 +254,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
     parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
     parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
-
-    int nb = 0, *dev_index;
-    parsec_find_nb_devices(&dev_index, &nb);
-    parsec_zpotrf->_g_nb_gpu_devices = nb;
-    parsec_zpotrf->_g_gpu_device_index = dev_index;
-    parsec_zpotrf->_g_gpu_cols = parsec_grid_calculation(nb);
-    parsec_zpotrf->_g_gpu_rows = nb/parsec_zpotrf->_g_gpu_cols;
-    parsec_zpotrf->_g_grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
-    parsec_zpotrf->_g_grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
-#if defined(DPLASMA_DEBUG)
-    printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
-           parsec_zpotrf->_g_nb_gpu_devices, parsec_zpotrf->_g_gpu_rows,
-           parsec_zpotrf->_g_gpu_cols, parsec_zpotrf->_g_grid_rows, parsec_zpotrf->_g_grid_cols);
-#endif
-
     int shape = 0;
     dplasma_setup_adtt_all_loc( ddc_A,
                                 parsec_datatype_double_complex_t,
diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c
index c3f08647..cf6829df 100644
--- a/tests/testing_zgemm.c
+++ b/tests/testing_zgemm.c
@@ -8,6 +8,7 @@
  */
 
 #include "common.h"
+#include "dplasmaaux.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
 static int check_solution( parsec_context_t *parsec, int loud,
@@ -76,6 +77,16 @@ int main(int argc, char ** argv)
         dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
     if(loud > 2) printf("Done\n");
 
+    /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
     int t;
     for(t = 0; t < nruns; t++) {
         parsec_devices_release_memory();
@@ -142,6 +153,16 @@ int main(int argc, char ** argv)
             parsec_devices_release_memory();
             parsec_devices_reset_load(parsec);
 
+            /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
             /* Create GEMM PaRSEC */
             if(loud) printf("Compute ... ... ");
             PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,
diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c
index ed6a4b10..2f327978 100644
--- a/tests/testing_zpotrf.c
+++ b/tests/testing_zpotrf.c
@@ -9,6 +9,7 @@
 
 #include "common.h"
 #include "flops.h"
+#include "dplasmaaux.h"
 #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
@@ -18,7 +19,6 @@ int main(int argc, char ** argv)
 {
     parsec_context_t* parsec;
     int iparam[IPARAM_SIZEOF];
-    //dplasma_enum_t uplo = dplasmaUpper;
     dplasma_enum_t uplo = dplasmaLower;
     int info = 0;
     int ret = 0;
@@ -44,6 +44,13 @@ int main(int argc, char ** argv)
         parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE,
                                          rank, MB, NB, LDA, N, 0, 0,
                                          N, N, P, nodes/P, uplo));
+
+    /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+    dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
     int t;
     for(t = 0; t < nruns; t++) {
         /* matrix (re)generation */
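The tests above exercise only the op_args == NULL path, where the GPU grid is derived automatically. A minimal sketch of the explicit path follows; it is not part of the patch. It fills a dplasma_advise_data_on_device_t by hand to pin tiles to an assumed 2 x 2 GPU grid (i.e. it assumes exactly four visible GPUs per process); the parsec context and the block-cyclic matrix dcA are assumed to be initialized as in tests/testing_zgemm.c, and error handling is omitted.

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
    /* Sketch only: assumes 4 visible GPUs per process, arranged as 2 x 2 */
    dplasma_advise_data_on_device_t adv;
    dplasma_find_nb_devices(&adv.gpu_device_index, &adv.nb_gpu_devices);
    adv.gpu_rows  = 2;
    adv.gpu_cols  = 2;
    /* Mirror the process grid of the (non-symmetric) block-cyclic matrix */
    adv.grid_rows = ((parsec_matrix_block_cyclic_t *)&dcA)->grid.rows;
    adv.grid_cols = ((parsec_matrix_block_cyclic_t *)&dcA)->grid.cols;
    dplasma_advise_data_on_device(parsec, dplasmaUpperLower,
                                  (parsec_tiled_matrix_t *)&dcA,
                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D,
                                  &adv);
    free(adv.gpu_device_index);  /* array was malloc'd by dplasma_find_nb_devices */
#endif

Because op_args is non-NULL here, dplasma_advise_data_on_device() forwards the structure to parsec_apply() unchanged, so the caller keeps ownership of the device-index array and must free it.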
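For intuition, the nested mapping computed by dplasma_advise_data_on_device_ops_2D can be checked in isolation. The self-contained toy below is not part of the patch and its grid sizes are illustrative only: with a 2 x 2 process grid and two GPUs per process in a 1 x 2 grid, the formula sends tiles with block-column index n in {0, 1} to local GPU 0 and those with n in {2, 3} to local GPU 1.

#include <stdio.h>

int main(void) {
    /* Illustrative values only */
    int grid_rows = 2, grid_cols = 2;   /* process grid (P x Q) */
    int gpu_rows  = 1, gpu_cols  = 2;   /* per-process GPU grid */
    for (int m = 0; m < 4; m++) {
        for (int n = 0; n < 4; n++) {
            /* Same index formula as dplasma_advise_data_on_device_ops_2D */
            int g = (m / grid_rows % gpu_rows) * gpu_cols
                  + n / grid_cols % gpu_cols;
            printf("tile(%d,%d) -> local GPU %d\n", m, n, g);
        }
    }
    return 0;
}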