From e0a77ea8a8100d657631e81542799b9883267e44 Mon Sep 17 00:00:00 2001
From: Qinglei Cao
Date: Fri, 27 Sep 2024 14:01:58 +0000
Subject: [PATCH] Add device advice support in DPLASMA

---
 src/dplasmaaux.c       | 108 +++++++++++++++++++++++++++++++++++++++++
 src/dplasmaaux.h       |  35 +++++++++++++
 src/zpotrf_L.jdf       |  57 ++++------------------
 src/zpotrf_wrapper.c   |  54 ---------------------
 tests/testing_zgemm.c  |  21 ++++++++
 tests/testing_zpotrf.c |   9 +++-
 6 files changed, 181 insertions(+), 103 deletions(-)

diff --git a/src/dplasmaaux.c b/src/dplasmaaux.c
index 86a6b189..3793f9a2 100644
--- a/src/dplasmaaux.c
+++ b/src/dplasmaaux.c
@@ -14,6 +14,8 @@
 #include
 #include "dplasmaaux.h"
 #include "parsec/utils/show_help.h"
+#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
+#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
 #if defined(PARSEC_HAVE_MPI)
 /*
@@ -110,3 +112,109 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
     }
 }
 
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+
+/* Find all CUDA/HIP devices */
+void dplasma_find_nb_devices(int **dev_index, int *nb) {
+    *nb = 0;
+    for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
+            (*nb)++;
+        }
+    }
+#if defined(DPLASMA_DEBUG)
+    if((*nb) == 0) {
+        char hostname[256];
+        gethostname(hostname, 256);
+        fprintf(stderr, "No CUDA/HIP device found on rank %d on %s\n",
+                parsec->my_rank, hostname);
+    }
+#endif
+    *dev_index = (int *)malloc((*nb) * sizeof(int));
+    *nb = 0;
+    for(int i = 0; i < (int)parsec_nb_devices; i++) {
+        parsec_device_module_t *device = parsec_mca_device_get(i);
+        if( PARSEC_DEV_CUDA & device->type || PARSEC_DEV_HIP & device->type ) {
+            (*dev_index)[(*nb)++] = device->device_index;
+        }
+    }
+}
+
+/* Compute the most suitable (closest to square) process/GPU grid */
+int dplasma_grid_calculation( int nb_process ) {
+    int P;
+    for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
+        if( 0 == nb_process % P ) break;
+    }
+    return P;
+}
+
+/* 2D operator: advise the preferred device for tile (m, n) */
+int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
+                                         const parsec_tiled_matrix_t *A,
+                                         void *_A, parsec_matrix_uplo_t uplo,
+                                         int m, int n, void *op_args) {
+    dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)op_args;
+
+    if( args->nb_gpu_devices > 0 ) {
+        /* Nested 2D grid on GPU */
+        int g = (m / args->grid_rows % args->gpu_rows) * args->gpu_cols + n / args->grid_cols % args->gpu_cols;
+        parsec_advise_data_on_device(A->super.data_of((parsec_data_collection_t*)A, m, n),
+                                     args->gpu_device_index[g],
+                                     PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
+    }
+
+    (void)es; (void)uplo;
+    return 0;
+}
+
+/* Advise data placement on devices
+ *
+ * If op_args == NULL, a default dplasma_advise_data_on_device_t is built from the detected devices
+ */
+int dplasma_advise_data_on_device(parsec_context_t *parsec,
+                                  parsec_matrix_uplo_t uplo,
+                                  parsec_tiled_matrix_t *A,
+                                  parsec_tiled_matrix_unary_op_t operation,
+                                  void *op_args) {
+
+    if(NULL != op_args) {
+        parsec_apply(parsec, uplo, A, operation, op_args);
+    } else {
+        /* Find the number of devices */
+        dplasma_advise_data_on_device_t *args = (dplasma_advise_data_on_device_t *)malloc(sizeof(dplasma_advise_data_on_device_t));
+        dplasma_find_nb_devices(&args->gpu_device_index, &args->nb_gpu_devices);
+
+        /* Calculate the nested grid for the multiple GPUs on one process, as square
+         * as possible (gpu_cols >= gpu_rows for Upper, gpu_rows >= gpu_cols otherwise) */
+        if(dplasmaUpper == uplo) {
+            args->gpu_rows = dplasma_grid_calculation(args->nb_gpu_devices);
+            args->gpu_cols = args->nb_gpu_devices/args->gpu_rows;
+        } else {
+            args->gpu_cols = dplasma_grid_calculation(args->nb_gpu_devices);
+            args->gpu_rows = args->nb_gpu_devices/args->gpu_cols;
+        }
+
+        if(dplasmaUpper == uplo || dplasmaLower == uplo) {
+            args->grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
+            args->grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
+        } else if(dplasmaUpperLower == uplo) {
+            args->grid_rows = ((parsec_matrix_block_cyclic_t *)A)->grid.rows;
+            args->grid_cols = ((parsec_matrix_block_cyclic_t *)A)->grid.cols;
+        } else {
+            dplasma_error("dplasma_advise_data_on_device", "illegal value of uplo");
+        }
+
+#if defined(DPLASMA_DEBUG)
+        printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
+               args->nb_gpu_devices, args->gpu_rows, args->gpu_cols, args->grid_rows, args->grid_cols);
+#endif
+
+        parsec_apply(parsec, uplo, A, operation, (void *)args);
+    }
+
+    return 0;
+}
+
+#endif
diff --git a/src/dplasmaaux.h b/src/dplasmaaux.h
index 28ae2039..681777dc 100644
--- a/src/dplasmaaux.h
+++ b/src/dplasmaaux.h
@@ -115,4 +115,39 @@ extern void *dplasma_pcomm;
 #if defined(DPLASMA_HAVE_HIP)
 #include "dplasmaaux_hip.h"
 #endif
+
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+/* Arguments for advising data placement on devices */
+typedef struct dplasma_advise_data_on_device_s {
+    int nb_gpu_devices;
+    int *gpu_device_index;
+    int gpu_rows;
+    int gpu_cols;
+    int grid_rows;
+    int grid_cols;
+} dplasma_advise_data_on_device_t;
+
+/* Find all CUDA/HIP devices */
+void dplasma_find_nb_devices(int **dev_index, int *nb);
+
+/* Compute the most suitable (closest to square) process/GPU grid */
+int dplasma_grid_calculation( int nb_process );
+
+/* 2D operator: advise the preferred device for tile (m, n) */
+int dplasma_advise_data_on_device_ops_2D(parsec_execution_stream_t *es,
+                                         const parsec_tiled_matrix_t *descA,
+                                         void *_A, parsec_matrix_uplo_t uplo,
+                                         int m, int n, void *args);
+
+/* Advise data placement on devices
+ *
+ * If op_args == NULL, a default dplasma_advise_data_on_device_t is built from the detected devices
+ */
+int dplasma_advise_data_on_device( parsec_context_t *parsec,
+                                   parsec_matrix_uplo_t uplo,
+                                   parsec_tiled_matrix_t *A,
+                                   parsec_tiled_matrix_unary_op_t operation,
+                                   void *op_args );
+#endif
+
 #endif /* _DPLASMAAUX_H_INCLUDED */
diff --git a/src/zpotrf_L.jdf b/src/zpotrf_L.jdf
index d564b211..5a2e335c 100644
--- a/src/zpotrf_L.jdf
+++ b/src/zpotrf_L.jdf
@@ -87,49 +87,6 @@ cuda_workspaces_infokey [type = "int" hidden = on default = -1 ]
 hip_handles_infokey    [type = "int" hidden = on default = -1 ]
 hip_workspaces_infokey [type = "int" hidden = on default = -1 ]
 
-nb_gpu_devices   [ type = "int" hidden = on default = 0 ]
-gpu_device_index [ type = "int *" hidden = on default = "NULL"]
-gpu_rows         [ type = "int" hidden = on default = 1]
-gpu_cols         [ type = "int" hidden = on default = 1]
-grid_rows        [ type = "int" hidden = on default = 1]
-grid_cols        [ type = "int" hidden = on default = 1]
-
-
-/**************************************************
- *                 potrf_bind_A                   *
- **************************************************/
-potrf_bind_A(m, n)
-
-// Execution space
-m = 0 .. descA->mt-1
-n = 0 .. m
-
-loc_A = %{ return LOC(descA, m, n); %}
-
-// Parallel partitioning
-:descA(m, n)
-
-READ A <- ddescA(m, n)   [ type      = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, TILED); %}
-                           type_data = %{ return ADTT_READ(ddescA, loc_A, DEFAULT, LAPACK); %} ]
-       -> (m == 0 && n == 0) ? T potrf_zpotrf(0)
-       -> (n == 0)? C potrf_ztrsm(m, n)
-       -> (m == n && n > 0) ? T potrf_zherk(0, m)
-       -> (m != n && n > 0) ? C potrf_zgemm(m, n, 0)
-
-BODY
-{
-#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
-    if( nb_gpu_devices > 0 ) {
-        int g = (m / grid_rows % gpu_rows) * gpu_cols + n / grid_cols % gpu_cols;
-        parsec_advise_data_on_device( _f_A->original,
-                                      gpu_device_index[g],
-                                      PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE );
-    }
-#endif
-}
-END
-
-
 /**************************************************
  *                 potrf_zpotrf                   *
  **************************************************/
@@ -149,7 +106,8 @@ loc_T = %{ return LOC(descA, k, k); %}
 
 // Parameters
 
-RW T <- (k == 0) ? A potrf_bind_A(k, k)  [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
+RW T <- (k == 0) ? ddescA(k, k)          [ type      = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
+                                           type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
     <- (k != 0) ? T potrf_zherk(k-1, k)  [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
     -> T potrf_ztrsm(k+1..descA->mt-1, k) /* dep OUT: rely on datacopy dtt for sending */
     -> ddescA(k, k)                      [ type = %{ return ADTT_CP(_f_T, ddescA, loc_T, DEFAULT); %}
@@ -277,7 +235,8 @@ loc_C = %{ return LOC(descA, m, k); %}
 
 // Parameters
 READ T <- T potrf_zpotrf(k)               [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-RW   C <- (k == 0) ? A potrf_bind_A(m, k) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
+RW   C <- (k == 0) ? ddescA(m, k)         [ type      = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
        <- (k != 0) ? C potrf_zgemm(m, k, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
        -> A potrf_zherk(k, m) /* dep OUT: rely on datacopy dtt for sending */
        -> A potrf_zgemm(m, k+1..m-1, k) /* dep OUT: rely on datacopy dtt for sending */
@@ -411,8 +370,9 @@ loc_T = %{ return LOC(descA, m, m); %}
 
 //Parameters
 READ A <- C potrf_ztrsm(m, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
-RW   T <- (k == 0) ? A potrf_bind_A(m, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
-       <- (k != 0) ? T potrf_zherk(k-1, m) /* dep OUT: rely on datacopy dtt for sending */
+RW   T <- (k == 0) ? ddescA(m, m)         [ type      = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_T, DEFAULT, LAPACK); %} ]
+       <- (k != 0) ? T potrf_zherk(k-1, m) [ type_remote = %{ return ADTT_DC(ddescA, loc_T, DEFAULT, TILED); %} ]
        -> (m == k+1) ? T potrf_zpotrf(m) : T potrf_zherk(k+1, m) /* dep OUT: rely on datacopy dtt for sending */
 
 ; (m >= (descA->mt - PRI_CHANGE)) ? (descA->mt - m) * (descA->mt - m) * (descA->mt - m) + 3 * (m - k) : PRI_MAX
@@ -533,7 +493,8 @@ loc_C = %{ return LOC(descA, m, n); %}
 
 // Parameters
 READ A <- C potrf_ztrsm(m, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_A, DEFAULT, TILED); %} ]
 READ B <- C potrf_ztrsm(n, k)             [ type_remote = %{ return ADTT_DC(ddescA, loc_B, DEFAULT, TILED); %} ]
-RW   C <- (k == 0) ? A potrf_bind_A(m, n) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
+RW   C <- (k == 0) ? ddescA(m, n)         [ type      = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, TILED); %}
+                                            type_data = %{ return ADTT_READ(ddescA, loc_C, DEFAULT, LAPACK); %} ]
        <- (k != 0) ? C potrf_zgemm(m, n, k-1) [ type_remote = %{ return ADTT_DC(ddescA, loc_C, DEFAULT, TILED); %} ]
        -> (n == k+1) ? C potrf_ztrsm(m, n) : C potrf_zgemm(m, n, k+1) /* dep OUT: rely on datacopy dtt for sending */
diff --git a/src/zpotrf_wrapper.c b/src/zpotrf_wrapper.c
index 6ae0a5d4..381f3a6f 100644
--- a/src/zpotrf_wrapper.c
+++ b/src/zpotrf_wrapper.c
@@ -19,7 +19,6 @@
 #include "zpotrf_U.h"
 #include "zpotrf_L.h"
 #include "cores/dplasma_plasmatypes.h"
-#include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
 
 #define MAX_SHAPES 1
 
@@ -130,44 +129,7 @@ static void zpotrf_destroy_hip_workspace(void *_ws, void *_n)
     free(ws);
     (void)_n;
 }
-
-#endif
-
-/* Find all devices */
-static void parsec_find_nb_devices(int **dev_index, int *nb) {
-    for(int i = 0; i < (int)parsec_nb_devices; i++) {
-        parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
-            (*nb)++;
-        }
-    }
-#if defined(DPLASMA_DEBUG)
-    if((*nb) == 0) {
-        char hostname[256];
-        gethostname(hostname, 256);
-        fprintf(stderr, "No CUDA device found on rank %d on %s\n",
-                parsec->my_rank, hostname);
-    }
 #endif
-    *dev_index = (int *)malloc((*nb) * sizeof(int));
-    *nb = 0;
-    for(int i = 0; i < (int)parsec_nb_devices; i++) {
-        parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type || PARSEC_DEV_HIP == device->type ) {
-            (*dev_index)[(*nb)++] = device->device_index;
-        }
-    }
-}
-
-/* Get the most suitable process/gpu grid */
-static int parsec_grid_calculation( int nb_process ) {
-    int P;
-    for( P = (int)(sqrt(nb_process + 1.0)); P > 0; P-- ) {
-        if( 0 == nb_process % P ) break;
-    }
-    return P;
-}
-
 /**
 *******************************************************************************
 *
@@ -279,7 +241,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
     parsec_zpotrf->_g_cuda_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
     parsec_zpotrf->_g_cuda_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
-
 #if defined(DPLASMA_HAVE_HIP)
     /* It doesn't cost anything to define these infos if we have HIP but
      * don't have GPUs on the current machine, so we do it non-conditionally */
@@ -293,21 +254,6 @@ dplasma_zpotrf_New( dplasma_enum_t uplo,
     parsec_zpotrf->_g_hip_handles_infokey = PARSEC_INFO_ID_UNDEFINED;
     parsec_zpotrf->_g_hip_workspaces_infokey = PARSEC_INFO_ID_UNDEFINED;
 #endif
-
-    int nb = 0, *dev_index;
-    parsec_find_nb_devices(&dev_index, &nb);
-    parsec_zpotrf->_g_nb_gpu_devices = nb;
-    parsec_zpotrf->_g_gpu_device_index = dev_index;
-    parsec_zpotrf->_g_gpu_cols = parsec_grid_calculation(nb);
-    parsec_zpotrf->_g_gpu_rows = nb/parsec_zpotrf->_g_gpu_cols;
-    parsec_zpotrf->_g_grid_rows = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.rows;
-    parsec_zpotrf->_g_grid_cols = ((parsec_matrix_sym_block_cyclic_t *)A)->grid.cols;
-#if defined(DPLASMA_DEBUG)
-    printf("nb_gpu_devices %d gpu_rows %d gpu_cols %d grid_rows %d grid_cols %d\n",
-           parsec_zpotrf->_g_nb_gpu_devices, parsec_zpotrf->_g_gpu_rows,
-           parsec_zpotrf->_g_gpu_cols, parsec_zpotrf->_g_grid_rows, parsec_zpotrf->_g_grid_cols);
-#endif
-
     int shape = 0;
     dplasma_setup_adtt_all_loc( ddc_A,
                                 parsec_datatype_double_complex_t,
diff --git a/tests/testing_zgemm.c b/tests/testing_zgemm.c
index c3f08647..cf6829df 100644
--- a/tests/testing_zgemm.c
+++ b/tests/testing_zgemm.c
@@ -8,6 +8,7 @@
  */
 
 #include "common.h"
+#include "dplasmaaux.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
 static int check_solution( parsec_context_t *parsec, int loud,
@@ -76,6 +77,16 @@ int main(int argc, char ** argv)
         dplasma_zplrnt( parsec, 0, (parsec_tiled_matrix_t *)&dcC, Cseed);
     if(loud > 2) printf("Done\n");
 
+    /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+    dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
     int t;
     for(t = 0; t < nruns; t++) {
         parsec_devices_release_memory();
@@ -142,6 +153,16 @@ int main(int argc, char ** argv)
             parsec_devices_release_memory();
             parsec_devices_reset_load(parsec);
 
+            /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcA,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcB,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+            dplasma_advise_data_on_device(parsec, dplasmaUpperLower, (parsec_tiled_matrix_t*)&dcC,
+                                          (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
             /* Create GEMM PaRSEC */
             if(loud) printf("Compute ... ... ");
             PASTE_CODE_ENQUEUE_PROGRESS_DESTRUCT_KERNEL(parsec, zgemm,
diff --git a/tests/testing_zpotrf.c b/tests/testing_zpotrf.c
index ed6a4b10..2f327978 100644
--- a/tests/testing_zpotrf.c
+++ b/tests/testing_zpotrf.c
@@ -9,6 +9,7 @@
 
 #include "common.h"
 #include "flops.h"
+#include "dplasmaaux.h"
 #include "parsec/data_dist/matrix/sym_two_dim_rectangle_cyclic.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 
@@ -18,7 +19,6 @@ int main(int argc, char ** argv)
 {
     parsec_context_t* parsec;
     int iparam[IPARAM_SIZEOF];
-    //dplasma_enum_t uplo = dplasmaUpper;
     dplasma_enum_t uplo = dplasmaLower;
     int info = 0;
     int ret = 0;
@@ -44,6 +44,13 @@ int main(int argc, char ** argv)
         parsec_matrix_sym_block_cyclic, (&dcA, PARSEC_MATRIX_COMPLEX_DOUBLE,
                                          rank, MB, NB, LDA, N, 0, 0,
                                          N, N, P, nodes/P, uplo));
+
+    /* Advise data on device */
+#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
+    dplasma_advise_data_on_device(parsec, uplo, (parsec_tiled_matrix_t*)&dcA,
+                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D, NULL);
+#endif
+
     int t;
     for(t = 0; t < nruns; t++) {
         /* matrix (re)generation */
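The tests above exercise only the op_args == NULL path, where the GPU grid is derived automatically. A minimal sketch of the explicit path follows; it is not part of the patch. It fills a dplasma_advise_data_on_device_t by hand to pin tiles to an assumed 2 x 2 GPU grid (i.e. it assumes exactly four visible GPUs per process); the parsec context and the block-cyclic matrix dcA are assumed to be initialized as in tests/testing_zgemm.c, and error handling is omitted.

#if defined(DPLASMA_HAVE_CUDA) || defined(DPLASMA_HAVE_HIP)
    /* Sketch only: assumes 4 visible GPUs per process, arranged as 2 x 2 */
    dplasma_advise_data_on_device_t adv;
    dplasma_find_nb_devices(&adv.gpu_device_index, &adv.nb_gpu_devices);
    adv.gpu_rows  = 2;
    adv.gpu_cols  = 2;
    /* Mirror the process grid of the (non-symmetric) block-cyclic matrix */
    adv.grid_rows = ((parsec_matrix_block_cyclic_t *)&dcA)->grid.rows;
    adv.grid_cols = ((parsec_matrix_block_cyclic_t *)&dcA)->grid.cols;
    dplasma_advise_data_on_device(parsec, dplasmaUpperLower,
                                  (parsec_tiled_matrix_t *)&dcA,
                                  (parsec_tiled_matrix_unary_op_t)dplasma_advise_data_on_device_ops_2D,
                                  &adv);
    free(adv.gpu_device_index);  /* array was malloc'd by dplasma_find_nb_devices */
#endif

Because op_args is non-NULL here, dplasma_advise_data_on_device() forwards the structure to parsec_apply() unchanged, so the caller keeps ownership of the device-index array and must free it.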
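For intuition, the nested mapping computed by dplasma_advise_data_on_device_ops_2D can be checked in isolation. The self-contained toy below is not part of the patch and its grid sizes are illustrative only: with a 2 x 2 process grid and two GPUs per process in a 1 x 2 grid, the formula sends tiles with block-column index n in {0, 1} to local GPU 0 and those with n in {2, 3} to local GPU 1.

#include <stdio.h>

int main(void) {
    /* Illustrative values only */
    int grid_rows = 2, grid_cols = 2;   /* process grid (P x Q) */
    int gpu_rows  = 1, gpu_cols  = 2;   /* per-process GPU grid */
    for (int m = 0; m < 4; m++) {
        for (int n = 0; n < 4; n++) {
            /* Same index formula as dplasma_advise_data_on_device_ops_2D */
            int g = (m / grid_rows % gpu_rows) * gpu_cols
                  + n / grid_cols % gpu_cols;
            printf("tile(%d,%d) -> local GPU %d\n", m, n, g);
        }
    }
    return 0;
}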