Add advice device support in dplasma
Qinglei Cao committed Sep 27, 2024
1 parent 8c45110 commit e0a77ea
Showing 6 changed files with 181 additions and 103 deletions.
108 changes: 108 additions & 0 deletions src/dplasmaaux.c
@@ -14,6 +14,8 @@
#include <string.h>
#include "dplasmaaux.h"
#include "parsec/utils/show_help.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

#if defined(PARSEC_HAVE_MPI)
/*
@@ -110,3 +112,109 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
}
}

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)

/* Find all devices */
void dplasma_find_nb_devices(int **dev_index, int *nb) {
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
(*nb)++;
}
}
#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
fprintf(stderr, "No CUDA device found on rank %d on %s\n",
parsec->my_rank, hostname);
}
#endif
*dev_index = (int *)malloc((*nb) * sizeof(int));
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
(*dev_index)[(*nb)++] = device->device_index;
}
}
}

/* Get the most suitable process/gpu grid */
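/* e.g. (illustrative): 4 devices -> P = 2, a 2 x 2 grid; 6 devices -> P = 2, paired with 3 on the other dimension */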
int dplasma_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *A,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *op_args) {
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;

if( args->nb_gpu_devices > 0 ) {
/* Nested 2D grid on GPU */
int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
args->gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}

(void)es; (void)uplo;
return 0;
}

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device(parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args) {

if(NULL != op_args) {
parsec_apply(parsec, uplo, A, operation, op_args);
} else {
/* Find the number of devices */
dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
dplasma_find_nb_devices(&args->gpu_device_index, &args->nb_gpu_devices);

/* Calculate the nested grid for the multiple GPUs on one process
* gpu_rows >= gpu_cols and as square as possible */
if(dplasmaUpper == uplo) {
args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
} else {
args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
}

if(dplasmaUpper == uplo || dplasmaLower == uplo) {
args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
} else if(dplasmaUpperLower == uplo) {
args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
} else {
dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
}

#if defined(DPLASMA_DEBUG)
printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
#endif

parsec_apply(parsec, uplo, A, operation, (void *)args);
}

return 0;
}

#endif
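
For reference, the nested tile-to-device mapping computed by dplasma_advise_data_on_device_ops_2D can be exercised with a small standalone sketch. This is illustrative only and not part of the commit; the 2 x 2 process grid and 3 x 2 local GPU grid below are assumed values.

/* Illustrative sketch of the nested 2D mapping above (not part of this commit).
 * Assumptions: 6 GPUs per process arranged as gpu_rows x gpu_cols = 3 x 2
 * (the dplasmaLower convention), on a grid_rows x grid_cols = 2 x 2 process grid. */
#include <stdio.h>

int main(void)
{
    int gpu_rows = 3, gpu_cols = 2;    /* assumed local GPU grid */
    int grid_rows = 2, grid_cols = 2;  /* assumed process grid */

    for (int m = 0; m < 6; m++) {
        for (int n = 0; n <= m; n++) {
            /* Same formula as dplasma_advise_data_on_device_ops_2D */
            int g = (m / grid_rows % gpu_rows) * gpu_cols
                  + n / grid_cols % gpu_cols;
            printf("tile (%d,%d) -> local device slot %d\n", m, n, g);
        }
    }
    return 0;
}

In effect, a process at position (r, c) of the process grid sees its local tile (i, j) assigned to GPU (i % gpu_rows, j % gpu_cols), so each process's local tiles are spread cyclically over its own GPU grid.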
35 changes: 35 additions & 0 deletions src/dplasmaaux.h
@@ -115,4 +115,39 @@ extern void *dplasma_pcomm
#if defined(DPLASMA_HAVE_HIP)
#include "dplasmaaux_hip.h"
#endif

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
/* Advise data on device arguments */
typedef struct dplasma_advise_data_on_device_s {
int nb_gpu_devices;
int *gpu_device_index;
int gpu_rows;
int gpu_cols;
int grid_rows;
int grid_cols;
} dplasma_advise_data_on_device_t;

/* Find all devices */
void dplasma_find_nb_devices(int **dev_index, int *nb);

/* Get the most suitable process/gpu grid */
int dplasma_grid_calculation( int nb_process );

/* Operator 2D */
int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
const parsec_tiled_matrix_t *descA,
void *_A, parsec_matrix_uplo_t uplo,
int m, int n, void *args);

/* Set advise data on device
*
* If op_args == NULL, use dplasma_advise_data_on_device_t by default
*/
int dplasma_advise_data_on_device( parsec_context_t *parsec,
parsec_matrix_uplo_t uplo,
parsec_tiled_matrix_t *A,
parsec_tiled_matrix_unary_op_t operation,
void *op_args );
#endif

#endif /* _DPLASMAAUX_H_INCLUDED */
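
Besides the NULL default used in the tests below, a caller can supply its own dplasma_advise_data_on_device_t. A minimal sketch of that path, assuming a parsec_matrix_block_cyclic_t descriptor named dcA, an initialized parsec context, and a hypothetical helper name advise_A_example:

/* Illustrative sketch (assumed names: advise_A_example, dcA): the explicit
 * op_args path of dplasma_advise_data_on_device, mirroring what the NULL
 * default sets up internally. */
#include <stdlib.h>
#include "dplasma.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

static void advise_A_example(parsec_context_t *parsec, parsec_matrix_block_cyclic_t *dcA)
{
    dplasma_advise_data_on_device_t args;
    dplasma_find_nb_devices(&args.gpu_device_index, &args.nb_gpu_devices);
    if (0 == args.nb_gpu_devices) {
        free(args.gpu_device_index);
        return;                        /* no GPU to advise */
    }
    args.gpu_cols  = dplasma_grid_calculation(args.nb_gpu_devices);
    args.gpu_rows  = args.nb_gpu_devices / args.gpu_cols;
    args.grid_rows = dcA->grid.rows;
    args.grid_cols = dcA->grid.cols;
    dplasma_advise_data_on_device(parsec, dplasmaUpperLower,
                                  (parsec_tiled_matrix_t *)dcA,
                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D,
                                  (void *)&args);
    free(args.gpu_device_index);
}

Here args lives on the stack, which assumes dplasma_advise_data_on_device is done with it before returning, as the NULL path's own usage suggests.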
57 changes: 9 additions & 48 deletions src/zpotrf_L.jdf
@@ -87,49 +87,6 @@ cuda_workspaces_infokey [type = "int" hidden = on default = -1 ]
hip_handles_infokey [type = "int" hidden = on default = -1 ]
hip_workspaces_infokey [type = "int" hidden = on default = -1 ]

nb_gpu_devices [ type = "int" hidden = on default = 0 ]
gpu_device_index [ type = "int *" hidden = on default = "NULL"]
gpu_rows [ type = "int" hidden = on default = 1]
gpu_cols [ type = "int" hidden = on default = 1]
grid_rows [ type = "int" hidden = on default = 1]
grid_cols [ type = "int" hidden = on default = 1]


/**************************************************
* potrf_bind_A *
**************************************************/
potrf_bind_A(m, n)

// Execution space
m = 0 .. descA->mt-1
n = 0 .. m

loc_A = %{ return LOC(descA, m, n); %}

// Parallel partitioning
:descA(m, n)

READ A <- ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, LAPACK); %} ]
-> (m == 0 && n == 0) ? T potrf_zpotrf(0)
-> (n == 0)? C potrf_ztrsm(m, n)
-> (m == n && n > 0) ? T potrf_zherk(0, m)
-> (m != n && n > 0) ? C potrf_zgemm(m, n, 0)

BODY
{
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
if( nb_gpu_devices > 0 ) {
int g = (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols;
parsec_advise_data_on_device( _f_A->original,
gpu_device_index[g],
PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
}
#endif
}
END


/**************************************************
* potrf_zpotrf *
**************************************************/
@@ -149,7 +106,8 @@ loc_T = %{ return LOC(descA, k, k); %}

// Parameters

RW T <- (k == 0) ? A potrf_bind_A(k, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
RW T <- (k == 0) ? ddescA(k, k) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
<- (k != 0) ? T potrf_zherk(k-1, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-> T potrf_ztrsm(k+1..descA->mt-1, k) /* dep OUT: rely on datacopy dtt for sending */
-> ddescA(k, k) [ type = %{ return ADTT_CP(_f_T, ddescA, loc_T, DEFAULT); %}
@@ -277,7 +235,8 @@ loc_C = %{ return LOC(descA, m, k); %}

// Parameters
READ T <- T potrf_zpotrf(k) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? A potrf_bind_A(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? ddescA(m, k) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
<- (k != 0) ? C potrf_zgemm(m, k, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
-> A potrf_zherk(k, m) /* dep OUT: rely on datacopy dtt for sending */
-> A potrf_zgemm(m, k+1..m-1, k) /* dep OUT: rely on datacopy dtt for sending */
@@ -411,8 +370,9 @@ loc_T = %{ return LOC(descA, m, m); %}

//Parameters
READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
RW T <- (k == 0) ? A potrf_bind_A(m, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
<- (k != 0) ? T potrf_zherk(k-1, m) /* dep OUT: rely on datacopy dtt for sending */
RW T <- (k == 0) ? ddescA(m, m) [ type = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
<- (k != 0) ? T potrf_zherk(k-1, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-> (m == k+1) ? T potrf_zpotrf(m) : T potrf_zherk(k+1, m) /* dep OUT: rely on datacopy dtt for sending */

; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX
@@ -533,7 +493,8 @@ loc_C = %{ return LOC(descA, m, n); %}
// Parameters
READ A <- C potrf_ztrsm(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
READ B <- C potrf_ztrsm(n, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_B, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? A potrf_bind_A(m, n) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
RW C <- (k == 0) ? ddescA(m, n) [ type = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
<- (k != 0) ? C potrf_zgemm(m, n, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
-> (n == k+1) ? C potrf_ztrsm(m, n) : C potrf_zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */

54 changes: 0 additions & 54 deletions src/zpotrf_wrapper.c
@@ -19,7 +19,6 @@
#include "zpotrf_U.h"
#include "zpotrf_L.h"
#include "cores/dplasma_plasmatypes.h"
#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"

#define MAX_SHAPES 1

@@ -130,44 +129,7 @@ static void zpotrf_destroy_hip_workspace(void *_ws, void *_n)
free(ws);
(void)_n;
}

#endif

/* Find all devices */
static void parsec_find_nb_devices(int **dev_index, int *nb) {
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
(*nb)++;
}
}
#if defined(DPLASMA_DEBUG)
if((*nb) == 0) {
char hostname[256];
gethostname(hostname, 256);
fprintf(stderr, "No CUDA device found on rank %d on %s\n",
parsec->my_rank, hostname);
}
#endif
*dev_index = (int *)malloc((*nb) * sizeof(int));
*nb = 0;
for(int i = 0; i < (int)parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_mca_device_get(i);
if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
(*dev_index)[(*nb)++] = device->device_index;
}
}
}

/* Get the most suitable process/gpu grid */
static int parsec_grid_calculation( int nb_process ) {
int P;
for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
if( 0 == nb_process % P ) break;
}
return P;
}


/**
*******************************************************************************
@@ -279,7 +241,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
#endif

#if defined(DPLASMA_HAVE_HIP)
/* It doesn't cost anything to define these infos if we have HIP but
* don't have GPUs on the current machine, so we do it non-conditionally */
@@ -293,21 +254,6 @@
parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
#endif

int nb = 0, *dev_index;
parsec_find_nb_devices(&dev_index, &nb);
parsec_zpotrf->_g_nb_gpu_devices = nb;
parsec_zpotrf->_g_gpu_device_index = dev_index;
parsec_zpotrf->_g_gpu_cols = parsec_grid_calculation(nb);
parsec_zpotrf->_g_gpu_rows = nb/parsec_zpotrf->_g_gpu_cols;
parsec_zpotrf->_g_grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
parsec_zpotrf->_g_grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
#if defined(DPLASMA_DEBUG)
printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
parsec_zpotrf->_g_nb_gpu_devices, parsec_zpotrf->_g_gpu_rows,
parsec_zpotrf->_g_gpu_cols, parsec_zpotrf->_g_grid_rows, parsec_zpotrf->_g_grid_cols);
#endif

int shape = 0;
dplasma_setup_adtt_all_loc( ddc_A,
parsec_datatype_double_complex_t,
21 changes: 21 additions & 0 deletions tests/testing_zgemm.c
@@ -8,6 +8,7 @@
*/

#include "common.h"
#include "dplasmaaux.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"

static int check_solution( parsec_context_t *parsec, int loud,
@@ -76,6 +77,16 @@ int main(int argc, char ** argv)
dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
if(loud > 2) printf("Done\n");

/* Advise data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

int t;
for(t = 0; t < nruns; t++) {
parsec_devices_release_memory();
@@ -142,6 +153,16 @@
parsec_devices_release_memory();
parsec_devices_reset_load(parsec);

/* Advise data on device */
#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
(parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
#endif

/* Create GEMM PaRSEC */
if(loud) printf("Compute ... ... ");
PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,