Skip to content

Commit

Permalink
Merge pull request #663 from abouteiller/bugfix/cpu-features
Browse files Browse the repository at this point in the history
Add: mca control for cpu load balancing (and don't report Gflops figures for cpus we can't determine)
  • Loading branch information
abouteiller authored Sep 12, 2024
2 parents 73c3165 + f27a485 commit 07f7238
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 69 deletions.
164 changes: 98 additions & 66 deletions parsec/mca/device/device.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2023 The University of Tennessee and The University
* Copyright (c) 2013-2024 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
Expand Down Expand Up @@ -53,6 +53,11 @@ static mca_base_component_t **device_components = NULL;
*/
static int parsec_device_load_balance_skew = 20;
static float load_balance_skew;
/**
* Controls whether load balancing may schedule tasks that have GPU incarnations onto CPU cores.
* 0 means that such tasks execute on CPU **only if they cannot execute on GPUs**
*/
static int parsec_device_load_balance_allow_cpu = 0;

/**
* @brief Estimates how many nanoseconds this_task will run on devid
Expand Down Expand Up @@ -125,9 +130,9 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
}
}
valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
/* Evaluate may have picked a device, abide by it */
if( NULL != this_task->selected_device ) {
assert( this_task->selected_device->type & valid_types );
if( NULL != this_task->selected_device ) { /* When Evaluate picked a device, abide by it */
assert( (1<<this_task->selected_device->device_index) & tp->devices_index_mask /* only valid devices! */ );
assert( this_task->selected_device->type & valid_types /* only valid device types! */ );
PARSEC_DEBUG_VERBOSE(30, parsec_device_output, "%s: Task %s evaluate set selected_device %d:%s",
__func__, tmp, this_task->selected_device->device_index, this_task->selected_device->name);
goto device_selected;
Expand Down Expand Up @@ -205,7 +210,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
assert( NULL == this_task->selected_device );
{ /* lets consider the time_estimates to select the best device */
int best_index = -1;
int64_t eta, best_eta = INT64_MAX; /* dev->device_load + time_estimate(this_task, dev); this commented out because we don't count cpu loads */
int64_t eta, best_eta = INT64_MAX;

/* If we have a preferred device (from READ flows), start with it, but still consider
* other options to have some load balance */
Expand All @@ -219,14 +224,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {

/* Consider how adding the current task would change load balancing
* between devices */
if(!parsec_device_load_balance_allow_cpu)
valid_types &= ~PARSEC_DEV_CPU; /* automatic CPU / GPU load balancing disabled, remove the CPU type */
valid_types &= ~PARSEC_DEV_RECURSIVE; /* Recursive device time estimates are computed on the associated CPU device */
for( int dev_index = 0; dev_index < parsec_mca_device_enabled(); dev_index++ ) {
/* Skip the device if it is disabled for the taskpool */
if(!(tp->devices_index_mask & (1 << dev_index))) continue;
dev = parsec_mca_device_get(dev_index);
/* Skip the device if no incarnations for its type */
if(!(dev->type & valid_types)) continue;
/* Skip recursive devices: time estimates are computed on the associated CPU device */
if(dev->type == PARSEC_DEV_RECURSIVE) continue;

eta = dev->device_load + time_estimate(this_task, dev);
if( best_eta > eta ) {
Expand Down Expand Up @@ -313,10 +319,16 @@ int parsec_mca_device_init(void)
(void)parsec_mca_param_reg_int_name("device", "load_balance_skew",
"Allow load balancing to skew by x%% to favor data reuse",
false, false, parsec_device_load_balance_skew, NULL);
(void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu",
"Allow load balancing tasks with GPU incarnations to CPU cores",
false, false, parsec_device_load_balance_allow_cpu, NULL);
if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_skew")) ) {
parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_skew);
}
load_balance_skew = 1.f/(parsec_device_load_balance_skew/100.f+1.f);
if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_allow_cpu")) ) {
parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_allow_cpu);
}
if( 0 < (rc = parsec_mca_param_find("device", NULL, "verbose")) ) {
parsec_mca_param_lookup_int(rc, &parsec_device_verbose);
}
Expand Down Expand Up @@ -790,17 +802,17 @@ int parsec_mca_device_registration_completed(parsec_context_t* context)
#include <sys/sysctl.h>
#endif

#include "parsec/parsec_hwloc.h"

static int cpu_weights(parsec_device_module_t* device, int nstreams)
{
/* This is default value when it cannot be computed */
/* Crude estimate that holds for Nehalem era Xeon processors */
float freq = 2.5f;
float fp_ipc = 8.f;
float dp_ipc = 4.f;
float freq = 0.f;
float fp_ipc = 0.f;
float dp_ipc = 0.f;
char cpu_model[256]="Unkown";
char *cpu_flags = NULL;

#if defined(__linux__)
char *cpu_flags = NULL;
FILE* procinfo = fopen("/proc/cpuinfo", "r");
if( NULL == procinfo ) {
parsec_warning("CPU Features cannot be autodetected on this machine: %s", strerror(errno));
Expand All @@ -809,66 +821,28 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
cpu_flags = calloc(4096, sizeof(char));
char str[4096];
while( NULL != fgets(str, 4096, procinfo) ) {
#if defined(__x86_64__) || defined(__i386__)
/* Intel/AMD */
sscanf(str, "model name : %255[^\n]%*c", cpu_model);
if( 0 != sscanf(str, "cpu MHz : %f", &freq) )
freq *= 1e-3;
if( 0 != sscanf(str, "flags : %4095[^\n]%*c", cpu_flags) )
break; /* done reading for an x86 type CPU */
#elif defined(__PPC64__)
/* IBM: Power */
sscanf(str, "cpu : %255[^\n]%*c", cpu_model);
if( 0 != sscanf(str, "clock : %fMHz", &freq) ) {
freq *= 1e-3;
break; /* done reading for a Power type CPU */
}
#endif
}
fclose(procinfo);
#elif defined(__APPLE__)
size_t len = sizeof(cpu_model);
int rc = sysctlbyname("machdep.cpu.brand_string", cpu_model, &len, NULL, 0);
if( rc ) {
parsec_warning("CPU Features cannot be autodetected on this machine (Detected OSX): %s", strerror(errno));
goto notfound;
}
len = 0;
rc = sysctlbyname("machdep.cpu.features", NULL, &len, NULL, 0);
cpu_flags = malloc(len);
rc = sysctlbyname("machdep.cpu.features", cpu_flags, &len, NULL, 0);
if( rc ) {
parsec_warning("CPU Features cannot be autodetected on this machine (Detected OSX): %s", strerror(errno));
goto notfound;
}
#else
goto notfound;
#endif
/* prefer base frequency from model name when available (avoids power
* saving modes and dynamic frequency scaling issues) */
/* prefer base frequency from model name when available (e.g., Intel)
* this avoids reading dynamic frequency during power saving or boost modes */
sscanf(cpu_model, "%*[^@] @ %fGHz", &freq);

fp_ipc = 8;
dp_ipc = 4;
#if defined(__x86_64__) || defined(__i386__)
#if defined(PARSEC_HAVE_BUILTIN_CPU)
__builtin_cpu_init();
#if defined(__AVX__)
if(__builtin_cpu_supports("avx")) {
fp_ipc = 16;
dp_ipc = 8;
}
#endif /* defined(__AVX__) */
#if defined(__AVX2__)
if(__builtin_cpu_supports("avx2")) {
fp_ipc = 32;
dp_ipc = 16;
}
#endif /* defined(__AVX2__) */
#if defined(__AVX512F__)
if(__builtin_cpu_supports("avx512f")) {
fp_ipc = 64;
dp_ipc = 32;
}
#endif /* defined(__AVX512F__) */
#else
if( strstr(cpu_flags, " avx512f") ) {
fp_ipc = 64;
dp_ipc = 32;
Expand All @@ -881,36 +855,94 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
fp_ipc = 16;
dp_ipc = 8;
}
#endif
#endif /* defined(__x86_64__) || defined(__i386__) */
else {
fp_ipc = 8;
dp_ipc = 4;
}
#elif defined(__PPC64__)
fp_ipc = 16;
dp_ipc = 8;
#endif /* defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) */
free(cpu_flags);

#elif defined(__APPLE__)
size_t len = sizeof(cpu_model);
size_t val = 0;
int rc = sysctlbyname("machdep.cpu.brand_string", cpu_model, &len, NULL, 0);
if( rc ) {
parsec_warning("CPU Name cannot be autodetected on this machine (OSX): %s", strerror(errno));
goto notfound;
}
/* is it an arm64 mac? */
rc = sysctlbyname("hw.optional.arm64", &val, &len, NULL, 0);
if( 0 == rc && val ) {
/* vector length */
fp_ipc = 16;
dp_ipc = 8;
// TODO: figure out a way to obtain cpu freq on arm64 macos
freq = 0.0;
}
else { /* intel mac */
/* vector length */
int i;
char *keys[4] = {
"hw.optional.avx512f",
"hw.optional.avx2_0",
"hw.optional.avx1_0",
NULL
};
for(i = 0; keys[i] != NULL; i++) {
rc = sysctlbyname(keys[i], &val, &len, NULL, 0);
if( 0 == rc && val )
break;
}
fp_ipc = 8*(8>>i);
dp_ipc = 4*(8>>i);
/* frequency */
len = sizeof(size_t);
rc = sysctlbyname("hw.cpufrequency", &val, &len, NULL, 0);
if( rc ) {
parsec_warning("CPU Frequency cannot be autodetected on this machine (OSX): %s", strerror(errno));
goto notfound;
}
freq = val * 1e-9f;
}
#endif

notfound:
{
int show_caps = 0;
int show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities");
if(0 < show_caps_index) {
parsec_mca_param_lookup_int(show_caps_index, &show_caps);
}
/* this may show unknown/0.0 if the cpu capabilities couldn't be determined */
if( show_caps ) {
int ncores = parsec_hwloc_nb_real_cores();
parsec_inform("CPU Device: %s\n"
"\tParsec Streams : %d\n"
"\tParsec EUs / Cores : %d / %d\n"
"\tFrequency (GHz) : %.2f\n"
"\tPeak Tflop/s : fp64: %-8.3f fp32: %-8.3f",
cpu_model,
nstreams,
freq, nstreams*freq*dp_ipc*1e-3, nstreams*freq*fp_ipc*1e-3);
nstreams, ncores,
freq, (nstreams > ncores? ncores: nstreams)*freq*dp_ipc*1e-3, (nstreams > ncores? ncores: nstreams)*freq*fp_ipc*1e-3);
}
}
notfound:

/* compute capacity is per-core, not per-device, so as to account for the
* prevalent model where we use sequential, single threaded tasks on CPU devices.
* Advanced users can use the time_estimate property to override if using
* multi-core parallel tasks. */
device->gflops_fp16 = fp_ipc * freq; /* No processor have half precision for now */
device->gflops_tf32 = fp_ipc * freq; /* No processor support tensor operations for now */
device->gflops_fp32 = fp_ipc * freq;
device->gflops_fp64 = dp_ipc * freq;
/* If unset, set to 1 to avoid dividing by 0 when computing the time estimates,
* and mark that the time_estimates are guesses */
if(device->gflops_fp32 == 0)
device->gflops_fp32 = device->gflops_guess = 1;
if(device->gflops_fp64 == 0)
device->gflops_fp64 = device->gflops_guess = 1;
/* CPUs emulate these using normal fp */
device->gflops_fp16 = device->gflops_tf32 = device->gflops_fp32;

return PARSEC_SUCCESS;
}
Expand Down Expand Up @@ -969,7 +1001,7 @@ int parsec_mca_device_attach(parsec_context_t* context)
nb_total_comp_threads += context->virtual_processes[p]->nb_cores;
}

#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)
#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)
/* Add the predefined devices: one device for the CPUs */
{
parsec_device_cpus = (parsec_device_module_t*)calloc(1, sizeof(parsec_device_module_t));
Expand All @@ -981,7 +1013,7 @@ int parsec_mca_device_attach(parsec_context_t* context)
parsec_device_cpus->taskpool_register = device_taskpool_register_static;
parsec_mca_device_add(context, parsec_device_cpus);
}
#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */
#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */

#if defined(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT)
/* and one for the recursive kernels */
Expand Down
6 changes: 3 additions & 3 deletions parsec/scheduling.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2009-2023 The University of Tennessee and The University
* Copyright (c) 2009-2024 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*/
Expand Down Expand Up @@ -136,8 +136,8 @@ int __parsec_execute( parsec_execution_stream_t* es,
rc = parsec_select_best_device(task);
if( PARSEC_ERROR == rc ) return PARSEC_HOOK_RETURN_ERROR;
if( PARSEC_DEV_IS_GPU(task->selected_device->type) ) {
/* counting load on CPU is useless because it would move from 0->1->0 during the span of execute
* TODO: select_best_device during __parsec_schedule and adjust loads at that time to enable cpu/gpu load balancing. */
/* counting load on CPU is useless because it would move from 0->1->0 during the span of execute.
* When parsec_select_best_device runs, the calling core is available to run a task, so directly using time_estimate with a base load of 0 is accurate. */
parsec_atomic_fetch_add_int64(&task->selected_device->device_load, task->load);
}

Expand Down

0 comments on commit 07f7238

Please sign in to comment.