Merge pull request #663 from abouteiller/bugfix/cpu-features

Add: mca control for cpu load balancing (and don't report Gflops figures for cpus we can't determine)
ICLDisco · Sep 12, 2024 · 07f7238 · 07f7238
2 parents 73c3165 + f27a485
commit 07f7238
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 69 deletions.
diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2023 The University of Tennessee and The University
+ * Copyright (c) 2013-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
@@ -53,6 +53,11 @@ static mca_base_component_t **device_components = NULL;
  */
 static int parsec_device_load_balance_skew = 20;
 static float load_balance_skew;
+/**
+ * When non-zero, load balancing may schedule tasks that have GPU incarnations on CPU cores.
+ * 0 means that tasks execute on CPU **only if they cannot execute on GPUs**
+ */
+static int parsec_device_load_balance_allow_cpu = 0;
 
 /**
  * @brief Estimates how many nanoseconds this_task will run on devid
@@ -125,9 +130,9 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             }
         }
         valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
-        /* Evaluate may have picked a device, abide by it */
-        if( NULL != this_task->selected_device ) {
-            assert( this_task->selected_device->type & valid_types );
+        if( NULL != this_task->selected_device ) { /* When Evaluate picked a device, abide by it */
+            assert( (1<<this_task->selected_device->device_index) & tp->devices_index_mask /* only valid devices! */ );
+            assert( this_task->selected_device->type & valid_types /* only valid device types! */ );
             PARSEC_DEBUG_VERBOSE(30, parsec_device_output, "%s: Task %s evaluate set selected_device %d:%s",
                                  __func__, tmp, this_task->selected_device->device_index, this_task->selected_device->name);
             goto device_selected;
@@ -205,7 +210,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
     assert( NULL == this_task->selected_device );
     { /* lets consider the time_estimates to select the best device */
         int best_index = -1;
-        int64_t eta, best_eta = INT64_MAX; /* dev->device_load + time_estimate(this_task, dev); this commented out because we don't count cpu loads */
+        int64_t eta, best_eta = INT64_MAX;
 
         /* If we have a preferred device (from READ flows), start with it, but still consider
          * other options to have some load balance */
@@ -219,14 +224,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
 
         /* Consider how adding the current task would change load balancing
          * between devices */
+        if(!parsec_device_load_balance_allow_cpu)
+            valid_types &= ~PARSEC_DEV_CPU; /* automatic CPU / GPU load balancing disabled, remove the CPU type */
+        valid_types &= ~PARSEC_DEV_RECURSIVE; /* Recursive device time estimates are computed on the associated CPU device */
         for( int dev_index = 0; dev_index < parsec_mca_device_enabled(); dev_index++ ) {
             /* Skip the device if it is disabled for the taskpool */
             if(!(tp->devices_index_mask & (1 << dev_index))) continue;
             dev = parsec_mca_device_get(dev_index);
             /* Skip the device if no incarnations for its type */
             if(!(dev->type & valid_types)) continue;
-            /* Skip recursive devices: time estimates are computed on the associated CPU device */
-            if(dev->type == PARSEC_DEV_RECURSIVE) continue;
 
             eta = dev->device_load + time_estimate(this_task, dev);
             if( best_eta > eta ) {
@@ -313,10 +319,16 @@ int parsec_mca_device_init(void)
     (void)parsec_mca_param_reg_int_name("device", "load_balance_skew",
                                         "Allow load balancing to skew by x%% to favor data reuse",
                                         false, false, parsec_device_load_balance_skew, NULL);
+    (void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu",
+                                        "Allow load balancing tasks with GPU incarnations to CPU cores",
+                                        false, false, parsec_device_load_balance_allow_cpu, NULL);
     if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_skew")) ) {
         parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_skew);
     }
     load_balance_skew = 1.f/(parsec_device_load_balance_skew/100.f+1.f);
+    if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_allow_cpu")) ) {
+        parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_allow_cpu);
+    }
     if( 0 < (rc = parsec_mca_param_find("device", NULL, "verbose")) ) {
         parsec_mca_param_lookup_int(rc, &parsec_device_verbose);
     }
@@ -790,17 +802,17 @@ int parsec_mca_device_registration_completed(parsec_context_t* context)
 #include <sys/sysctl.h>
 #endif
 
+#include "parsec/parsec_hwloc.h"
+
 static int cpu_weights(parsec_device_module_t* device, int nstreams)
 {
-    /* This is default value when it cannot be computed */
-    /* Crude estimate that holds for Nehalem era Xeon processors */
-    float freq = 2.5f;
-    float fp_ipc = 8.f;
-    float dp_ipc = 4.f;
+    float freq = 0.f;
+    float fp_ipc = 0.f;
+    float dp_ipc = 0.f;
     char cpu_model[256]="Unkown";
-    char *cpu_flags = NULL;
 
 #if defined(__linux__)
+    char *cpu_flags = NULL;
     FILE* procinfo = fopen("/proc/cpuinfo", "r");
     if( NULL == procinfo ) {
         parsec_warning("CPU Features cannot be autodetected on this machine: %s", strerror(errno));
@@ -809,66 +821,28 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
     cpu_flags = calloc(4096, sizeof(char));
     char str[4096];
     while( NULL != fgets(str, 4096, procinfo) ) {
+#if defined(__x86_64__) || defined(__i386__)
         /* Intel/AMD */
         sscanf(str, "model name : %255[^\n]%*c", cpu_model);
         if( 0 != sscanf(str, "cpu MHz : %f", &freq) )
             freq *= 1e-3;
         if( 0 != sscanf(str, "flags : %4095[^\n]%*c", cpu_flags) )
             break; /* done reading for an x86 type CPU */
+#elif defined(__PPC64__)
         /* IBM: Power */
         sscanf(str, "cpu : %255[^\n]%*c", cpu_model);
         if( 0 != sscanf(str, "clock : %fMHz", &freq) ) {
             freq *= 1e-3;
             break; /* done reading for a Power type CPU */
         }
+#endif
     }
     fclose(procinfo);
-#elif defined(__APPLE__)
-    size_t len = sizeof(cpu_model);
-    int rc = sysctlbyname("machdep.cpu.brand_string", cpu_model, &len, NULL, 0);
-    if( rc ) {
-        parsec_warning("CPU Features cannot be autodetected on this machine (Detected OSX): %s", strerror(errno));
-        goto notfound;
-    }
-    len = 0;
-    rc = sysctlbyname("machdep.cpu.features", NULL, &len, NULL, 0);
-    cpu_flags = malloc(len);
-    rc = sysctlbyname("machdep.cpu.features", cpu_flags, &len, NULL, 0);
-    if( rc ) {
-        parsec_warning("CPU Features cannot be autodetected on this machine (Detected OSX): %s", strerror(errno));
-        goto notfound;
-    }
-#else
-    goto notfound;
-#endif
-    /* prefer base frequency from model name when available (avoids power
-     * saving modes and dynamic frequency scaling issues) */
+    /* prefer base frequency from model name when available (e.g., Intel)
+     * this avoids reading dynamic frequency during power saving or boost modes */
     sscanf(cpu_model, "%*[^@] @ %fGHz", &freq);
 
-    fp_ipc = 8;
-    dp_ipc = 4;
 #if defined(__x86_64__) || defined(__i386__)
-#if defined(PARSEC_HAVE_BUILTIN_CPU)
-    __builtin_cpu_init();
-#if defined(__AVX__)
-    if(__builtin_cpu_supports("avx")) {
-        fp_ipc = 16;
-        dp_ipc = 8;
-    }
-#endif  /* defined(__AVX__) */
-#if defined(__AVX2__)
-    if(__builtin_cpu_supports("avx2")) {
-        fp_ipc = 32;
-        dp_ipc = 16;
-    }
-#endif  /* defined(__AVX2__) */
-#if defined(__AVX512F__)
-    if(__builtin_cpu_supports("avx512f")) {
-        fp_ipc = 64;
-        dp_ipc = 32;
-    }
-#endif  /* defined(__AVX512F__) */
-#else
     if( strstr(cpu_flags, " avx512f") ) {
         fp_ipc = 64;
         dp_ipc = 32;
@@ -881,36 +855,94 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
         fp_ipc = 16;
         dp_ipc = 8;
     }
-#endif
-#endif  /* defined(__x86_64__) || defined(__i386__) */
+    else {
+        fp_ipc = 8;
+        dp_ipc = 4;
+    }
+#elif defined(__PPC64__)
+    fp_ipc = 16;
+    dp_ipc = 8;
+#endif  /* defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) */
     free(cpu_flags);
 
+#elif defined(__APPLE__)
+    size_t len = sizeof(cpu_model);
+    size_t val = 0;
+    int rc = sysctlbyname("machdep.cpu.brand_string", cpu_model, &len, NULL, 0);
+    if( rc ) {
+        parsec_warning("CPU Name cannot be autodetected on this machine (OSX): %s", strerror(errno));
+        goto notfound;
+    }
+    /* is it an arm64 mac? */
+    rc = sysctlbyname("hw.optional.arm64", &val, &len, NULL, 0);
+    if( 0 == rc && val ) {
+        /* vector length */
+        fp_ipc = 16;
+        dp_ipc = 8;
+        // TODO: figure out a way to obtain cpu freq on arm64 macos
+        freq = 0.0;
+    }
+    else { /* intel mac */
+        /* vector length */
+        int i;
+        char *keys[4] = {
+          "hw.optional.avx512f",
+          "hw.optional.avx2_0",
+          "hw.optional.avx1_0",
+          NULL
+        };
+        for(i = 0; keys[i] != NULL; i++) {
+            rc = sysctlbyname(keys[i], &val, &len, NULL, 0);
+            if( 0 == rc && val )
+                break;
+        }
+        fp_ipc = 8*(8>>i);
+        dp_ipc = 4*(8>>i);
+        /* frequency */
+        len = sizeof(size_t);
+        rc = sysctlbyname("hw.cpufrequency", &val, &len, NULL, 0);
+        if( rc ) {
+            parsec_warning("CPU Frequency cannot be autodetected on this machine (OSX): %s", strerror(errno));
+            goto notfound;
+        }
+        freq = val * 1e-9f;
+    }
+#endif
+
+notfound:
     {
       int show_caps = 0;
       int show_caps_index = parsec_mca_param_find("device", NULL, "show_capabilities");
       if(0 < show_caps_index) {
           parsec_mca_param_lookup_int(show_caps_index, &show_caps);
       }
+      /* this may show unknown/0.0 if the cpu capabilities couldn't be determined */
       if( show_caps ) {
+          int ncores = parsec_hwloc_nb_real_cores();
           parsec_inform("CPU Device: %s\n"
-                        "\tParsec Streams     : %d\n"
+                        "\tParsec EUs / Cores : %d / %d\n"
                         "\tFrequency (GHz)    : %.2f\n"
                         "\tPeak Tflop/s       : fp64: %-8.3f fp32: %-8.3f",
                         cpu_model,
-                        nstreams,
-                        freq, nstreams*freq*dp_ipc*1e-3, nstreams*freq*fp_ipc*1e-3);
+                        nstreams, ncores,
+                        freq, (nstreams > ncores? ncores: nstreams)*freq*dp_ipc*1e-3, (nstreams > ncores? ncores: nstreams)*freq*fp_ipc*1e-3);
        }
     }
- notfound:
 
     /* compute capacity is per-core, not per-device, so as to account for the
      * prevalent model where we use sequential, single threaded tasks on CPU devices.
      * Advanced users can use the time_estimate property to override if using
      * multi-core parallel tasks. */
-    device->gflops_fp16 = fp_ipc * freq; /* No processor have half precision for now */
-    device->gflops_tf32 = fp_ipc * freq; /* No processor support tensor operations for now */
     device->gflops_fp32 = fp_ipc * freq;
     device->gflops_fp64 = dp_ipc * freq;
+    /* If a value could not be determined (still 0), set it to 1 to avoid dividing
+     * by 0 when computing time estimates, and mark the gflops values as guesses */
+    if(device->gflops_fp32 == 0)
+      device->gflops_fp32 = device->gflops_guess = 1;
+    if(device->gflops_fp64 == 0)
+      device->gflops_fp64 = device->gflops_guess = 1;
+    /* CPUs emulate these using normal fp */
+    device->gflops_fp16 = device->gflops_tf32 = device->gflops_fp32;
 
     return PARSEC_SUCCESS;
 }
@@ -969,7 +1001,7 @@ int parsec_mca_device_attach(parsec_context_t* context)
         nb_total_comp_threads += context->virtual_processes[p]->nb_cores;
     }
 
- #if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)
+#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)
     /* Add the predefined devices: one device for the CPUs */
     {
         parsec_device_cpus = (parsec_device_module_t*)calloc(1, sizeof(parsec_device_module_t));
@@ -981,7 +1013,7 @@ int parsec_mca_device_attach(parsec_context_t* context)
         parsec_device_cpus->taskpool_register = device_taskpool_register_static;
         parsec_mca_device_add(context, parsec_device_cpus);
     }
- #endif  /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */
+#endif  /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */
 
 #if defined(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT)
     /* and one for the recursive kernels */

diff --git a/parsec/scheduling.c b/parsec/scheduling.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2023 The University of Tennessee and The University
+ * Copyright (c) 2009-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  */
@@ -136,8 +136,8 @@ int __parsec_execute( parsec_execution_stream_t* es,
     rc = parsec_select_best_device(task);
     if( PARSEC_ERROR == rc ) return PARSEC_HOOK_RETURN_ERROR;
     if( PARSEC_DEV_IS_GPU(task->selected_device->type) ) {
-        /* counting load on CPU is useless because it would move from 0->1->0 during the span of execute
-         * TODO: select_best_device during __parsec_schedule and adjust loads at that time to enable cpu/gpu load balancing. */
+        /* counting load on CPU is useless because it would move from 0->1->0 during the span of execute.
+         * When parsec_select_best_device runs, the calling core is available to run a task, so directly using time_estimate with a 0 base load is accurate. */
         parsec_atomic_fetch_add_int64(&task->selected_device->device_load, task->load);
     }