Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for batched tasks. #668

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE
mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
option(PARSEC_GPU_WITH_CUDA
"Enable GPU support using CUDA kernels" ON)
option(PARSEC_GPU_WITH_CUDA_BATCH
"Enable the runtime support for batched kernels" ON)
option(PARSEC_GPU_WITH_HIP
"Enable GPU support using HIP kernels" ON)
option(PARSEC_GPU_WITH_LEVEL_ZERO
Expand Down Expand Up @@ -729,6 +731,12 @@ int main(int argc, char *argv[]) {
endif (CUDAToolkit_FOUND)
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
endif( PARSEC_GPU_WITH_CUDA )
# Batched kernels are an extension of the CUDA device support. Because
# PARSEC_GPU_WITH_CUDA_BATCH defaults to ON, only an explicit mismatch between
# the two options is treated as a configuration error; when CUDA was requested
# but is not available on this host, the batched support is simply left
# disabled so that a default configure still succeeds.
if( PARSEC_GPU_WITH_CUDA_BATCH )
  if( NOT PARSEC_GPU_WITH_CUDA )
    message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. Enable both or none")
  elseif( NOT PARSEC_HAVE_CUDA )
    message(WARNING "PARSEC_GPU_WITH_CUDA_BATCH is enabled but CUDA support is not available. Batched CUDA support is disabled")
  else()
    set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
  endif()
endif( PARSEC_GPU_WITH_CUDA_BATCH )

if( PARSEC_GPU_WITH_HIP )
# This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
Expand Down
4 changes: 4 additions & 0 deletions cmake_modules/PaRSECConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@)
if(@PARSEC_HAVE_CUDA@)
find_package(CUDAToolkit REQUIRED)
set(PARSEC_HAVE_CUDA TRUE)

if(@PARSEC_HAVE_CUDA_BATCH@)
set(PARSEC_HAVE_CUDA_BATCH TRUE)
endif(@PARSEC_HAVE_CUDA_BATCH@)
endif(@PARSEC_HAVE_CUDA@)

if(@PARSEC_HAVE_HIP@)
Expand Down
1 change: 1 addition & 0 deletions parsec/include/parsec/parsec_options.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@
#cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT
Expand Down
23 changes: 12 additions & 11 deletions parsec/interfaces/dtd/insert_function.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2013-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2023 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2023-2024 NVIDIA Corporation. All rights reserved.
*/

/* **************************************************************************** */
Expand Down Expand Up @@ -1477,7 +1477,7 @@ parsec_dtd_startup(parsec_context_t *context,
if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */
// If CUDA is enabled, let the CUDA device activated for this
// taskpool.
if( PARSEC_DEV_CUDA == device->type ) continue;
if( PARSEC_DEV_CUDA & device->type ) continue;
if( NULL != device->taskpool_register )
if( PARSEC_SUCCESS !=
device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
Expand Down Expand Up @@ -2355,8 +2355,8 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
/* We assume that incarnations is big enough, because it has been pre-allocated
* with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) {
if( incarnations[i].type == device_type ) {
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) {
parsec_warning("A chore for this device type has already been added to task class '%s'\n",
tc->name);
return PARSEC_ERROR;
Expand All @@ -2369,7 +2369,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
}

incarnations[i].type = device_type;
if(PARSEC_DEV_CUDA == device_type) {
if(PARSEC_DEV_CUDA & device_type) {
incarnations[i].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
}
Expand Down Expand Up @@ -3258,19 +3258,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params);
tc = &dtd_tc->super;

__parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
(*incarnations)[0].type = device_type;
if( device_type == PARSEC_DEV_CUDA ) {
__parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations;
incarnations[0].type = device_type;
if( device_type & PARSEC_DEV_CUDA ) {
/* Special case for CUDA: we need an intermediate */
(*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
incarnations[0].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
}
else {
/* Default case: the user-provided function is directly the hook to call */
(*incarnations)[0].hook = fpointer; // We can directly call the CPU hook
incarnations[0].hook = fpointer; // We can directly call the CPU hook
dtd_tc->cpu_func_ptr = fpointer;
}
(*incarnations)[1].type = PARSEC_DEV_NONE;
incarnations[1].type = PARSEC_DEV_NONE;
incarnations[1].hook = NULL;

/* Bookkeeping of the task class */
parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc);
Expand Down
32 changes: 24 additions & 8 deletions parsec/interfaces/ptg/ptg-compiler/jdf2c.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (c) 2009-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
*/

#include "parsec/parsec_config.h"
Expand Down Expand Up @@ -3938,25 +3939,40 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf,
jdf_def_list_t* dyld_property;
jdf_def_list_t* evaluate_property = NULL;
jdf_def_list_t* device_property = NULL;
jdf_def_list_t* batch_property = NULL;

(void)jdf;
string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name);
do {
jdf_find_property(body->properties, "type", &type_property);
jdf_find_property(body->properties, "dyld", &dyld_property);
jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property);
if( NULL == type_property) {
jdf_find_property(body->properties, "batch", &batch_property);
if (NULL == type_property)
{
string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)\n");
string_arena_add_string(sa, " { .type = PARSEC_DEV_CPU,\n");
string_arena_add_string(sa, " .evaluate = (parsec_evaluate_function_t*)%s,\n",
(NULL == evaluate_property) ? "NULL" : evaluate_property->expr->jdf_c_code.fname);
string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s },\n", base_name);
string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */\n");
} else {
if( NULL != batch_property ) {
    /* This branch handles a body without a "type" property (emitted as a
     * PARSEC_DEV_CPU chore), where the "batch" property is not allowed:
     * report the offending body and stop. */
    fprintf(stderr,
            "Error: batched property (%s) not allowed for devices other than accelerators in body of task %s at line %d\n",
            batch_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO(body));
    /* batch_property is known to be non-NULL here, so the assertion must
     * state the opposite in order to fire. Note that it is compiled out
     * when NDEBUG is defined, in which case only the message is printed. */
    assert( NULL == batch_property );
}
}
else
{
char* dev_upper = strdup_upper(type_property->expr->jdf_var);

string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper);
string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper);
string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper);
if( NULL != batch_property) {
string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH");
}
string_arena_add_string(sa, ",\n");
if( NULL == dyld_property ) {
string_arena_add_string(sa, " .dyld = NULL,\n");
} else {
Expand Down Expand Up @@ -4491,7 +4507,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf )
" parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n"
" __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n"
" uint32_t idx = 0, j;\n"
" for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n"
" for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n"
" if( !(supported_dev & chores[j].type) ) continue;\n"
" if( j != idx ) {\n"
" chores[idx] = chores[j];\n"
Expand Down Expand Up @@ -4680,7 +4696,7 @@ static void jdf_generate_constructor( const jdf_t* jdf )
coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n"
" __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n"
" memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n"
" for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n"
" for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n"
" tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n "
" memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n"
" /* Add a placeholder for initialization and startup task */\n"
Expand Down Expand Up @@ -6731,8 +6747,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
coutput(" /* Pointer to dynamic gpu function */\n"
" {\n"
" int chore_idx = 0;\n"
" for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n"
" if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n"
" for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n"
" if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n"
" }\n"
" /* The void* cast prevents the compiler from complaining about the type change */\n"
" parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n"
Expand Down Expand Up @@ -6983,7 +6999,7 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
coutput("#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", type_upper);
if( NULL != type_property) {

if (!strcasecmp(type_property->expr->jdf_var, "cuda")
if (!strncasecmp(type_property->expr->jdf_var, "cuda", 4) /* for batched */
|| !strcasecmp(type_property->expr->jdf_var, "hip")) {
jdf_generate_code_hook_gpu(jdf, f, body, name);
goto hook_end_block;
Expand Down
3 changes: 3 additions & 0 deletions parsec/mca/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recu
if(PARSEC_HAVE_CUDA)
set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
endif(PARSEC_HAVE_CUDA)
if(PARSEC_HAVE_CUDA_BATCH)
set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA")
endif(PARSEC_HAVE_CUDA_BATCH)
if(PARSEC_HAVE_HIP)
set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP")
endif(PARSEC_HAVE_HIP)
Expand Down
2 changes: 1 addition & 1 deletion parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ static int device_cuda_component_close(void)
/* Check that no CUDA devices are still registered with PaRSEC */
for(i = 0; i < parsec_mca_device_enabled(); i++) {
if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue;
if(PARSEC_DEV_CUDA != cdev->super.super.type) continue;
/* Only CUDA devices are reported below: skip every device whose type does
 * not carry the PARSEC_DEV_CUDA bit. The type is tested as a bitmask, so the
 * "is not CUDA" condition is the negation of the mask test. */
if( !(PARSEC_DEV_CUDA & cdev->super.super.type) ) continue;

PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream,
"GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n"
Expand Down
23 changes: 12 additions & 11 deletions parsec/mca/device/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Copyright (c) 2013-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
*/

#include "parsec/parsec_config.h"
Expand Down Expand Up @@ -107,7 +108,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {

/* Run the evaluates for the incarnation types to determine if they can
* execute this task */
for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
if( 0 == (this_task->chore_mask & (1<<chore_id)) ) continue;
if( NULL == tc->incarnations[chore_id].hook ) continue; /* dyld hook not found during initialization */

Expand All @@ -116,15 +117,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
if( PARSEC_HOOK_RETURN_DONE != rc ) {
if( PARSEC_HOOK_RETURN_NEXT != rc ) {
PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
tmp, tc->incarnations[chore_id].type,
tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
chore_id);
}
/* Mark this chore as tested */
this_task->chore_mask &= ~( 1<<chore_id );
continue;
}
}
valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */
/* Evaluate may have picked a device, abide by it */
if( NULL != this_task->selected_device ) {
assert( this_task->selected_device->type & valid_types );
Expand All @@ -140,7 +141,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */
this_task->selected_device = dev = parsec_mca_device_get(0);
this_task->load = 0;
for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++);
for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++);
this_task->selected_chore = chore_id;
PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s",
__func__, tmp, dev->device_index, dev->name);
Expand Down Expand Up @@ -226,7 +227,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
/* Skip the device if no incarnations for its type */
if(!(dev->type & valid_types)) continue;
/* Skip recursive devices: time estimates are computed on the associated CPU device */
if(dev->type == PARSEC_DEV_RECURSIVE) continue;
if(dev->type & PARSEC_DEV_RECURSIVE) continue;

eta = dev->device_load + time_estimate(this_task, dev);
if( best_eta > eta ) {
Expand All @@ -244,14 +245,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
goto no_valid_device;

this_task->selected_device = parsec_mca_device_get(best_index);
assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE );
assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) );
}

device_selected:
dev = this_task->selected_device;
assert( NULL != dev );
assert( tp->devices_index_mask & (1 << dev->device_index) );
for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++)
for(chore_id = 0; !(tc->incarnations[chore_id].type & dev->type); chore_id++)
assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */);
this_task->selected_chore = chore_id;
this_task->load = time_estimate(this_task, dev);
Expand Down Expand Up @@ -748,8 +749,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
parsec_device_module_t* device = parsec_devices[i];
if( NULL == device ) continue;
if( PARSEC_DEV_RECURSIVE == device->type ) continue;
if( PARSEC_DEV_CPU == device->type ) {
if( PARSEC_DEV_RECURSIVE & device->type ) continue;
if( PARSEC_DEV_CPU & device->type ) {
c = 0;
for(int p = 0; p < context->nb_vp; p++)
c += context->virtual_processes[p]->nb_cores;
Expand All @@ -768,7 +769,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
parsec_device_module_t* device = parsec_devices[i];
if( NULL == device ) continue;
if( PARSEC_DEV_RECURSIVE == device->type ) continue;
if( PARSEC_DEV_RECURSIVE & device->type ) continue;
device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64;
parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s",
i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? "GUESSED": "");
Expand Down Expand Up @@ -933,7 +934,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_
continue;
__parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
for( j = 0; NULL != chores[j].hook; j++ ) {
if( chores[j].type != device->type )
if( !(chores[j].type & device->type) )
continue;
if( NULL != chores[j].dyld_fn ) {
continue; /* the function has been set for another device of the same type */
Expand Down
4 changes: 4 additions & 0 deletions parsec/mca/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (c) 2013-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
*/

/** @addtogroup parsec_device
Expand Down Expand Up @@ -65,10 +66,13 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t
#define PARSEC_DEV_CUDA ((uint8_t)(1 << 2))
#define PARSEC_DEV_HIP ((uint8_t)(1 << 3))
#define PARSEC_DEV_LEVEL_ZERO ((uint8_t)(1 << 4))
/* Device type bit 5; presumably identifies the batched flavor of the CUDA
 * device - TODO confirm how it relates to PARSEC_DEV_CHORE_ALLOW_BATCH below. */
#define PARSEC_DEV_CUDA_BATCH ((uint8_t)(1 << 5))
#define PARSEC_DEV_TEMPLATE ((uint8_t)(1 << 7))
/* Both masks select the six low bits (0x3f): they include
 * PARSEC_DEV_CUDA_BATCH (bit 5) but not PARSEC_DEV_TEMPLATE (bit 7). */
#define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f)
#define PARSEC_DEV_ALL ((uint8_t) 0x3f)
#define PARSEC_DEV_MAX_NB_TYPE (7)
/* The following flags are extensions to the device type. They are OR-ed
 * with a PARSEC_DEV_* value, and PARSEC_DEV_ANY_TYPE strips them again when
 * only the device type is of interest.
 * NOTE(review): 0x100 lies above the 8 bits used by the uint8_t device type
 * values, so any field storing a flagged type must be wider than 8 bits -
 * confirm the width of the chore type field. */
#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100)

/* Set of the device types considered as GPUs.
 * NOTE(review): PARSEC_DEV_CUDA_BATCH is not part of this mask, so
 * PARSEC_DEV_IS_GPU(PARSEC_DEV_CUDA_BATCH) evaluates to false - confirm this
 * is intended. */
#define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO)
/* Non-zero when the type t has at least one bit of PARSEC_DEV_GPU_MASK set. */
#define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK))
Expand Down
Loading
Loading