ICLDisco · bosilca · Jul 26, 2024
@@ -181,6 +181,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE
 mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
 option(PARSEC_GPU_WITH_CUDA
   "Enable GPU support using CUDA kernels" ON)
+option(PARSEC_GPU_WITH_CUDA_BATCH
+  "Enable the runtime support for batched kernels" ON)
 option(PARSEC_GPU_WITH_HIP
     "Enable GPU support using HIP kernels" ON)
 option(PARSEC_GPU_WITH_LEVEL_ZERO
@@ -729,6 +731,12 @@ int main(int argc, char *argv[]) {
     endif (CUDAToolkit_FOUND)
     set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
   endif( PARSEC_GPU_WITH_CUDA )
+  if( PARSEC_GPU_WITH_CUDA_BATCH AND PARSEC_HAVE_CUDA )
+    set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
+  elseif( PARSEC_GPU_WITH_CUDA_BATCH )
+    # PARSEC_GPU_WITH_CUDA_BATCH defaults to ON: when CUDA support is unavailable, disable batching instead of failing the configure step.
+    message(WARNING "PARSEC_GPU_WITH_CUDA_BATCH requires CUDA support (PARSEC_GPU_WITH_CUDA and a usable CUDA toolkit). Batched CUDA support is disabled")
+  endif()
 
   if( PARSEC_GPU_WITH_HIP )
     # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents

@@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@)
 if(@PARSEC_HAVE_CUDA@)
   find_package(CUDAToolkit REQUIRED)
   set(PARSEC_HAVE_CUDA TRUE)
+  # Batched CUDA support is only exported when CUDA support itself was enabled in the PaRSEC build.
+  if(@PARSEC_HAVE_CUDA_BATCH@)
+    set(PARSEC_HAVE_CUDA_BATCH TRUE)
+  endif(@PARSEC_HAVE_CUDA_BATCH@)
 endif(@PARSEC_HAVE_CUDA@)
 
 if(@PARSEC_HAVE_HIP@)

@@ -130,6 +130,7 @@
 #cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
 #cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
 #cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
+#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT
 #cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
 #cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
 #cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT

@@ -2,7 +2,7 @@
  * Copyright (c) 2013-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
- * Copyright (c) 2023      NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2023-2024 NVIDIA Corporation.  All rights reserved.
  */
 
 /* **************************************************************************** */
@@ -1477,7 +1477,7 @@ parsec_dtd_startup(parsec_context_t *context,
         if( !(tp->devices_index_mask & (1 << device->device_index))) continue;  /* not supported */
         // If CUDA is enabled, let the CUDA device activated for this
         // taskpool.
-        if( PARSEC_DEV_CUDA == device->type ) continue;
+        if( PARSEC_DEV_CUDA & device->type ) continue;
         if( NULL != device->taskpool_register )
             if( PARSEC_SUCCESS !=
                 device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
@@ -2355,8 +2355,8 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
     /* We assume that incarnations is big enough, because it has been pre-allocated
      * with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
     incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
-    for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) {
-        if( incarnations[i].type == device_type ) {
+    for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
+        if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) {
             parsec_warning("A chore for this device type has already been added to task class '%s'\n",
                            tc->name);
             return PARSEC_ERROR;
@@ -2369,7 +2369,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
     }
 
     incarnations[i].type = device_type;
-    if(PARSEC_DEV_CUDA == device_type) {
+    if(PARSEC_DEV_CUDA & device_type) {
         incarnations[i].hook = parsec_dtd_gpu_task_submit;
         dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
     }
@@ -3258,19 +3258,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
             dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params);
             tc = &dtd_tc->super;
 
-            __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
-            (*incarnations)[0].type = device_type;
-            if( device_type == PARSEC_DEV_CUDA ) {
+            __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations;
+            incarnations[0].type = device_type;
+            if( device_type & PARSEC_DEV_CUDA ) {
                 /* Special case for CUDA: we need an intermediate */
-                (*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
+                incarnations[0].hook = parsec_dtd_gpu_task_submit;
                 dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
             }
             else {
                 /* Default case: the user-provided function is directly the hook to call */
-                (*incarnations)[0].hook = fpointer; // We can directly call the CPU hook
+                incarnations[0].hook = fpointer; // We can directly call the CPU hook
                 dtd_tc->cpu_func_ptr = fpointer;
             }
-            (*incarnations)[1].type = PARSEC_DEV_NONE;
+            incarnations[1].type = PARSEC_DEV_NONE;
+            incarnations[1].hook = NULL;
 
             /* Bookkeeping of the task class */
             parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc);

@@ -2,6 +2,7 @@
  * Copyright (c) 2009-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  */
 
 #include "parsec/parsec_config.h"
@@ -3938,25 +3939,40 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf,
     jdf_def_list_t* dyld_property;
     jdf_def_list_t* evaluate_property = NULL;
     jdf_def_list_t* device_property = NULL;
+    jdf_def_list_t* batch_property = NULL;
 
     (void)jdf;
     string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name);
     do {
         jdf_find_property(body->properties, "type", &type_property);
         jdf_find_property(body->properties, "dyld", &dyld_property);
         jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property);
-        if( NULL == type_property) {
+        jdf_find_property(body->properties, "batch", &batch_property);
+        if (NULL == type_property)
+        {
             string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)\n");
             string_arena_add_string(sa, "    { .type     = PARSEC_DEV_CPU,\n");
             string_arena_add_string(sa, "      .evaluate = (parsec_evaluate_function_t*)%s,\n",
                                     (NULL == evaluate_property) ? "NULL" : evaluate_property->expr->jdf_c_code.fname);
             string_arena_add_string(sa, "      .hook     = (parsec_hook_t*)hook_of_%s },\n", base_name);
             string_arena_add_string(sa, "#endif  /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */\n");
-        } else {
+            if( NULL != batch_property ) {
+                fprintf(stderr,
+                        "Error: batched property (%s) not allowed for devices other than accelerators in body of task %s at line %d\n",
+                        batch_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO(body));
+                exit(1);  /* a body without a device type cannot be batched: stop generating code (an assert on this condition could never fire here) */
+            }
+        }
+        else
+        {
             char* dev_upper = strdup_upper(type_property->expr->jdf_var);
 
             string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper);
-            string_arena_add_string(sa, "    { .type     = PARSEC_DEV_%s,\n", dev_upper);
+            string_arena_add_string(sa, "    { .type     = PARSEC_DEV_%s", dev_upper);
+            if( NULL != batch_property) {
+                string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH");
+            }
+            string_arena_add_string(sa, ",\n");
             if( NULL == dyld_property ) {
                 string_arena_add_string(sa, "      .dyld     = NULL,\n");
             } else {
@@ -4491,7 +4507,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf )
             "    parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n"
             "    __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n"
             "    uint32_t idx = 0, j;\n"
-            "    for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n"
+            "    for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n"
             "      if( !(supported_dev & chores[j].type) ) continue;\n"
             "      if( j != idx ) {\n"
             "        chores[idx] = chores[j];\n"
@@ -4680,7 +4696,7 @@ static void jdf_generate_constructor( const jdf_t* jdf )
     coutput("  for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n"
             "    __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n"
             "    memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n"
-            "    for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++);  /* compute the number of incarnations */\n"
+            "    for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++);  /* compute the number of incarnations */\n"
             "    tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n    "
             "    memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n"
             "    /* Add a placeholder for initialization and startup task */\n"
@@ -6731,8 +6747,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
         coutput("  /* Pointer to dynamic gpu function */\n"
                 "  {\n"
                 "    int chore_idx = 0;\n"
-                "    for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n"
-                "      if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n"
+                "    for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n"
+                "      if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n"
                 "    }\n"
                 "    /* The void* cast prevents the compiler from complaining about the type change */\n"
                 "    parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n"
@@ -6983,7 +6999,7 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
     coutput("#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", type_upper);
     if( NULL != type_property) {
 
-        if (!strcasecmp(type_property->expr->jdf_var, "cuda")
+        if (!strncasecmp(type_property->expr->jdf_var, "cuda", 4)  /* for batched */
          || !strcasecmp(type_property->expr->jdf_var, "hip")) {
             jdf_generate_code_hook_gpu(jdf, f, body, name);
             goto hook_end_block;

@@ -14,6 +14,9 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL  "PaRSEC has support for Recu
 if(PARSEC_HAVE_CUDA)
   set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
 endif(PARSEC_HAVE_CUDA)
+if(PARSEC_HAVE_CUDA_BATCH)  # exposes batched CUDA support to the generated config header via PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT
+  set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA")
+endif(PARSEC_HAVE_CUDA_BATCH)
 if(PARSEC_HAVE_HIP)
   set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP")
 endif(PARSEC_HAVE_HIP)

@@ -276,7 +276,7 @@ static int device_cuda_component_close(void)
     /* Check that no CUDA devices are still registered with PaRSEC */
     for(i = 0; i < parsec_mca_device_enabled(); i++) {
         if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue;
-        if(PARSEC_DEV_CUDA != cdev->super.super.type) continue;
+        if( !(PARSEC_DEV_CUDA & cdev->super.super.type) ) continue;  /* skip devices that are not CUDA; only leftover CUDA devices are reported below */
 
         PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream,
                              "GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n"

@@ -3,6 +3,7 @@
  * Copyright (c) 2013-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  */
 
 #include "parsec/parsec_config.h"
@@ -107,7 +108,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
 
     /* Run the evaluates for the incarnation types to determine if they can
      * execute this task */
-    for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
+    for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
         if( 0 == (this_task->chore_mask & (1<<chore_id)) ) continue;
         if( NULL == tc->incarnations[chore_id].hook ) continue; /* dyld hook not found during initialization */
 
@@ -116,15 +117,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             if( PARSEC_HOOK_RETURN_DONE != rc ) {
                 if( PARSEC_HOOK_RETURN_NEXT != rc ) {
                     PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
-                                         tmp, tc->incarnations[chore_id].type,
+                                         tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
                                          chore_id);
                 }
                 /* Mark this chore as tested */
                 this_task->chore_mask &= ~( 1<<chore_id );
                 continue;
             }
         }
-        valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
+        valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */
         /* Evaluate may have picked a device, abide by it */
         if( NULL != this_task->selected_device ) {
             assert( this_task->selected_device->type & valid_types );
@@ -140,7 +141,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
     if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */
         this_task->selected_device = dev = parsec_mca_device_get(0);
         this_task->load = 0;
-        for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++);
+        for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++);
         this_task->selected_chore = chore_id;
         PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s",
                              __func__, tmp, dev->device_index, dev->name);
@@ -226,7 +227,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             /* Skip the device if no incarnations for its type */
             if(!(dev->type & valid_types)) continue;
             /* Skip recursive devices: time estimates are computed on the associated CPU device */
-            if(dev->type == PARSEC_DEV_RECURSIVE) continue;
+            if(dev->type & PARSEC_DEV_RECURSIVE) continue;
 
             eta = dev->device_load + time_estimate(this_task, dev);
             if( best_eta > eta ) {
@@ -244,14 +245,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             goto no_valid_device;
 
         this_task->selected_device = parsec_mca_device_get(best_index);
-        assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE );
+        assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) );
     }
 
 device_selected:
     dev = this_task->selected_device;
     assert( NULL != dev );
     assert( tp->devices_index_mask & (1 << dev->device_index) );
-    for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++)
+    for(chore_id = 0; !(tc->incarnations[chore_id].type & dev->type); chore_id++)
         assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */);
     this_task->selected_chore = chore_id;
     this_task->load = time_estimate(this_task, dev);
@@ -748,8 +749,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
     for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
         parsec_device_module_t* device = parsec_devices[i];
         if( NULL == device ) continue;
-        if( PARSEC_DEV_RECURSIVE == device->type ) continue;
-        if( PARSEC_DEV_CPU == device->type ) {
+        if( PARSEC_DEV_RECURSIVE & device->type ) continue;
+        if( PARSEC_DEV_CPU & device->type ) {
             c = 0;
             for(int p = 0; p < context->nb_vp; p++)
                 c += context->virtual_processes[p]->nb_cores;
@@ -768,7 +769,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
     for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
         parsec_device_module_t* device = parsec_devices[i];
         if( NULL == device ) continue;
-        if( PARSEC_DEV_RECURSIVE == device->type ) continue;
+        if( PARSEC_DEV_RECURSIVE & device->type ) continue;
         device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64;
         parsec_debug_verbose(6, parsec_device_output, "  Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s",
                              i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? "GUESSED": "");
@@ -933,7 +934,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_
             continue;
         __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
         for( j = 0; NULL != chores[j].hook; j++ ) {
-            if( chores[j].type != device->type )
+            if( !(chores[j].type & device->type) )
                 continue;
             if(  NULL != chores[j].dyld_fn ) {
                 continue;  /* the function has been set for another device of the same type */

@@ -2,6 +2,7 @@
  * Copyright (c) 2013-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  */
 
 /** @addtogroup parsec_device
@@ -65,10 +66,13 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t
 #define PARSEC_DEV_CUDA       ((uint8_t)(1 << 2))
 #define PARSEC_DEV_HIP        ((uint8_t)(1 << 3))
 #define PARSEC_DEV_LEVEL_ZERO ((uint8_t)(1 << 4))
+#define PARSEC_DEV_CUDA_BATCH ((uint8_t)(1 << 5))
 #define PARSEC_DEV_TEMPLATE   ((uint8_t)(1 << 7))
 #define PARSEC_DEV_ANY_TYPE   ((uint8_t)    0x3f)
 #define PARSEC_DEV_ALL        ((uint8_t)    0x3f)
 #define PARSEC_DEV_MAX_NB_TYPE                (7)
+/* Flags OR-ed into a chore's type above the PARSEC_DEV_ANY_TYPE bits; mask with PARSEC_DEV_ANY_TYPE to recover the device type. NOTE(review): needs a type field wider than 8 bits - confirm */
+#define PARSEC_DEV_CHORE_ALLOW_BATCH  ((uint32_t)0x00000100)
 
 #define PARSEC_DEV_GPU_MASK   (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO)
 #define PARSEC_DEV_IS_GPU(t)  (0 != ((t) & PARSEC_DEV_GPU_MASK))