diff --git a/parsec b/parsec index adabbd4d..a5f49ab2 160000 --- a/parsec +++ b/parsec @@ -1 +1 @@ -Subproject commit adabbd4d1fb580358a32d489df19fa9c05a316e1 +Subproject commit a5f49ab2ac1d4341efadc134df45ebedb2a7eb55 diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf index 5a2e335c..d564b211 100644 --- a/src/zpotrf_L.jdf +++ b/src/zpotrf_L.jdf @@ -87,6 +87,49 @@ cuda_workspaces_infokey [type = "int" hidden = on default = -1 ] hip_handles_infokey [type = "int" hidden = on default = -1 ] hip_workspaces_infokey [type = "int" hidden = on default = -1 ] +nb_gpu_devices [ type = "int" hidden = on default = 0 ] +gpu_device_index [ type = "int *" hidden = on default = "NULL"] +gpu_rows [ type = "int" hidden = on default = 1] +gpu_cols [ type = "int" hidden = on default = 1] +grid_rows [ type = "int" hidden = on default = 1] +grid_cols [ type = "int" hidden = on default = 1] + + +/************************************************** + * potrf_bind_A * + **************************************************/ +potrf_bind_A(m, n) + +// Execution space +m = 0 .. descA->mt-1 +n = 0 .. m + +loc_A = %{ return LOC(descA, m, n); %} + +// Parallel partitioning +:descA(m, n) + +READ A <- ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, TILED); %} + type_data = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, LAPACK); %} ] + -> (m == 0 && n == 0) ? T potrf_zpotrf(0) + -> (n == 0)? C potrf_ztrsm(m, n) + -> (m == n && n > 0) ? T potrf_zherk(0, m) + -> (m != n && n > 0) ? C potrf_zgemm(m, n, 0) + +BODY +{ +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + if( nb_gpu_devices > 0 ) { + int g = (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols; + parsec_advise_data_on_device( _f_A->original, + gpu_device_index[g], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE ); + } +#endif +} +END + + /************************************************** * potrf_zpotrf * **************************************************/ @@ -106,8 +149,7 @@ loc_T = %{ return LOC(descA, k, k); %} // Parameters -RW T <- (k == 0) ? ddescA(k, k) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %} - type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ] +RW T <- (k == 0) ? A potrf_bind_A(k, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ] <- (k != 0) ? T potrf_zherk(k-1, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ] -> T potrf_ztrsm(k+1..descA->mt-1, k) /* dep OUT: rely on datacopy dtt for sending */ -> ddescA(k, k) [ type = %{ return ADTT_CP(_f_T, ddescA, loc_T, DEFAULT); %} @@ -235,8 +277,7 @@ loc_C = %{ return LOC(descA, m, k); %} // Parameters READ T <- T potrf_zpotrf(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ] -RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %} - type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ] +RW C <- (k == 0) ? A potrf_bind_A(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ] <- (k != 0) ? C potrf_zgemm(m, k, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ] -> A potrf_zherk(k, m) /* dep OUT: rely on datacopy dtt for sending */ -> A potrf_zgemm(m, k+1..m-1, k) /* dep OUT: rely on datacopy dtt for sending */ @@ -370,9 +411,8 @@ loc_T = %{ return LOC(descA, m, m); %} //Parameters READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ] -RW T <- (k == 0) ? ddescA(m, m) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %} - type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ] - <- (k != 0) ? T potrf_zherk(k-1, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ] +RW T <- (k == 0) ? A potrf_bind_A(m, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ] + <- (k != 0) ? T potrf_zherk(k-1, m) /* dep OUT: rely on datacopy dtt for sending */ -> (m == k+1) ? T potrf_zpotrf(m) : T potrf_zherk(k+1, m) /* dep OUT: rely on datacopy dtt for sending */ ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX @@ -493,8 +533,7 @@ loc_C = %{ return LOC(descA, m, n); %} // Parameters READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ] READ B <- C potrf_ztrsm(n, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_B, DEFAULT, TILED); %} ] -RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %} - type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ] +RW C <- (k == 0) ? A potrf_bind_A(m, n) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ] <- (k != 0) ? C potrf_zgemm(m, n, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ] -> (n == k+1) ? C potrf_ztrsm(m, n) : C potrf_zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */ diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c index 381f3a6f..6ae0a5d4 100644 --- a/src/zpotrf_wrapper.c +++ b/src/zpotrf_wrapper.c @@ -19,6 +19,7 @@ #include "zpotrf_U.h" #include "zpotrf_L.h" #include "cores/dplasma_plasmatypes.h" +#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h" #define MAX_SHAPES 1 @@ -129,7 +130,44 @@ static void zpotrf_destroy_hip_workspace(void *_ws, void *_n) free(ws); (void)_n; } + +#endif + +/* Find all devices */ +static void parsec_find_nb_devices(int **dev_index, int *nb) { + for(int i = 0; i < (int)parsec_nb_devices; i++) { + parsec_device_module_t *device = parsec_mca_device_get(i); + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { + (*nb)++; + } + } +#if defined(DPLASMA_DEBUG) + if((*nb) == 0) { + char hostname[256]; + gethostname(hostname, 256); + fprintf(stderr, "No CUDA device found on rank %d on %s\n", + parsec->my_rank, hostname); + } #endif + *dev_index = (int *)malloc((*nb) * sizeof(int)); + *nb = 0; + for(int i = 0; i < (int)parsec_nb_devices; i++) { + parsec_device_module_t *device = parsec_mca_device_get(i); + if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) { + (*dev_index)[(*nb)++] = device->device_index; + } + } +} + +/* Get the most suitable process/gpu grid */ +static int parsec_grid_calculation( int nb_process ) { + int P; + for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) { + if( 0 == nb_process % P ) break; + } + return P; +} + /** ******************************************************************************* @@ -241,6 +279,7 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED; parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; #endif + #if defined(DPLASMA_HAVE_HIP) /* It doesn't cost anything to define these infos if we have HIP but * don't have GPUs on the current machine, so we do it non-conditionally */ @@ -254,6 +293,21 @@ dplasma_zpotrf_New( dplasma_enum_t uplo, parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED; parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED; #endif + + int nb = 0, *dev_index; + parsec_find_nb_devices(&dev_index, &nb); + parsec_zpotrf->_g_nb_gpu_devices = nb; + parsec_zpotrf->_g_gpu_device_index = dev_index; + parsec_zpotrf->_g_gpu_cols = parsec_grid_calculation(nb); + parsec_zpotrf->_g_gpu_rows = nb/parsec_zpotrf->_g_gpu_cols; + parsec_zpotrf->_g_grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows; + parsec_zpotrf->_g_grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols; +#if defined(DPLASMA_DEBUG) + printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n", + parsec_zpotrf->_g_nb_gpu_devices, parsec_zpotrf->_g_gpu_rows, + parsec_zpotrf->_g_gpu_cols, parsec_zpotrf->_g_grid_rows, parsec_zpotrf->_g_grid_cols); +#endif + int shape = 0; dplasma_setup_adtt_all_loc( ddc_A, parsec_datatype_double_complex_t, diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c index 7d0fa3b7..ed6a4b10 100644 --- a/tests/testing_zpotrf.c +++ b/tests/testing_zpotrf.c @@ -18,7 +18,8 @@ int main(int argc, char ** argv) { parsec_context_t* parsec; int iparam[IPARAM_SIZEOF]; - dplasma_enum_t uplo = dplasmaUpper; + //dplasma_enum_t uplo = dplasmaUpper; + dplasma_enum_t uplo = dplasmaLower; int info = 0; int ret = 0;