Fix computation of available memory on gpu (avoid truncation and conversions) and add the amount of reserved mem to the device caps output
abouteiller committed May 16, 2024
1 parent 74d9dcd commit 702e648
Showing 3 changed files with 45 additions and 59 deletions.
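
The device_gpu.c part of the change keeps the reservation arithmetic in size_t end to end: the old code computed the element count through a double multiply and an (int) cast and stored it in a uint32_t, while the new code works on a size_t byte count that is rounded up to whole elements before any division. The standalone sketch below contrasts the two styles; the values and variable contents are made up for illustration and are not PaRSEC code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Made-up sizes for illustration only. */
    size_t initial_free_mem = (size_t)40 << 30;  /* pretend 40 GiB of free device memory */
    size_t eltsize          = (size_t)3 << 20;   /* pretend 3 MiB tiles */

    /* Old style: the byte budget detours through double and an (int) cast,
     * and the element count lives in a 32-bit integer. */
    uint32_t old_count = (uint32_t)(int)(.9 * initial_free_mem / eltsize);

    /* New style: stay in size_t and round the byte budget up to a whole
     * number of elements, so the later division is exact. */
    size_t alloc_size = (90 * initial_free_mem) / 100;
    alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize);
    size_t new_count = alloc_size / eltsize;

    printf("old: %u tiles   new: %zu tiles (%zu bytes)\n",
           (unsigned)old_count, new_count, alloc_size);
    return 0;
}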
26 changes: 9 additions & 17 deletions parsec/mca/device/cuda/device_cuda_module.c
@@ -575,23 +575,15 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
}

if( show_caps ) {
parsec_inform("GPU Device %-8s: %s [capability %d.%d] %s\n"
"\tLocation (PCI Bus/Device/Domain): %x:%x.%x\n"
"\tSM : %d\n"
"\tFrequency (GHz) : %f\n"
"\tpeak Tflop/s : %4.2f fp64,\t%4.2f fp32,\t%4.2f tf32,\t%4.2f fp16\n"
"\tPeak Mem Bw (GB/s) : %.2f [Clock Rate (Ghz) %.2f | Bus Width (bits) %d]\n"
"\tconcurrency : %s\n"
"\tcomputeMode : %d\n",
device->name, szName, cuda_device->major, cuda_device->minor,
device->gflops_guess? "(GUESSED Peak Tflop/s; load imbalance may REDUCE PERFORMANCE)": "",
prop.pciBusID, prop.pciDeviceID, prop.pciDomainID,
streaming_multiprocessor,
freqHz*1e-9f,
fp64*1e-3, fp32*1e-3, tf32*1e-3, fp16*1e-3,
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth,
(concurrency == 1)? "yes": "no",
computemode);
parsec_inform("GPU Device %-8s: %s %.0fGB [pci %x:%x.%x]\n"
"\tFrequency (GHz) : %.2f\t[SM: %d | Capabilities: %d.%d | Concurency %s | ComputeMode %d]\n"
"\tPeak Tflop/s %-5s : fp64: %-8.3f fp32: %-8.3f fp16: %-8.3f tf32: %-8.3f\n"
"\tPeak Mem Bw (GB/s) : %.2f\t[Clock Rate (Ghz) %.2f | Bus Width (bits) %d]\tReserved Pool (GB): %.1f\n",
device->name, szName, prop.totalGlobalMem/1024.f/1024.f/1024.f, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID,
freqHz*1e-9f, streaming_multiprocessor, cuda_device->major, cuda_device->minor,
(concurrency == 1)? "yes": "no", computemode,
device->gflops_guess? "GUESS": "", fp64*1e-3, fp32*1e-3, fp16*1e-3, tf32*1e-3,
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth, gpu_device->mem_block_size*gpu_device->mem_nb_blocks/1024.f/1024.f/1024.f);
}

*module = device;
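For reference, the bandwidth figure in the new caps line is the usual DDR estimate, 2 × memory clock × bus width in bytes, with cudaDeviceProp reporting memoryClockRate in kHz and memoryBusWidth in bits, and the new "Reserved Pool (GB)" column is simply the zone-allocator footprint mem_block_size * mem_nb_blocks. A standalone sketch with illustrative numbers (roughly a GTX 1060-class board; none of the values come from the commit):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    /* Illustrative values, roughly a GTX 1060-class board. */
    int    memoryClockRate = 4004000;          /* kHz, as reported by CUDA */
    int    memoryBusWidth  = 192;              /* bits */
    size_t mem_block_size  = (size_t)2 << 20;  /* made-up 2 MiB zone blocks */
    size_t mem_nb_blocks   = 2500;             /* made-up block count */

    /* Same formula as the caps printout: DDR factor 2, bits -> bytes, kHz -> GB/s. */
    double peak_bw_gbs = 2.0 * memoryClockRate * (memoryBusWidth / 8) / 1.0e6;

    /* Reserved Pool (GB): total bytes handed to the zone allocator. */
    double reserved_gb = (double)(mem_block_size * mem_nb_blocks) / 1024. / 1024. / 1024.;

    printf("Peak Mem Bw (GB/s): %.2f   Reserved Pool (GB): %.1f\n",
           peak_bw_gbs, reserved_gb);   /* prints 192.19 and 4.9 */
    return 0;
}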
7 changes: 4 additions & 3 deletions parsec/mca/device/device.c
@@ -278,14 +278,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
#endif
return PARSEC_SUCCESS;

no_valid_device:
no_valid_device: {
#if !defined(PARSEC_DEBUG_NOISIER)
char tmp[MAX_TASK_STRLEN];
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task);
#endif
parsec_warning("Task %s ran out of valid incarnations. No device selected.",
tmp);
return PARSEC_ERROR;
}
}

PARSEC_OBJ_CLASS_INSTANCE(parsec_device_module_t, parsec_object_t,
@@ -885,8 +886,8 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
if( show_caps ) {
parsec_inform("CPU Device: %s\n"
"\tParsec Streams : %d\n"
"\tFrequency (GHz) : %2.2f\n"
"\tPeak Tflop/s : %2.4f fp64,\t%2.4f fp32",
"\tFrequency (GHz) : %.2f\n"
"\tPeak Tflop/s : fp64: %-8.3f fp32: %-8.3f",
cpu_model,
nstreams,
freq, nstreams*freq*dp_ipc*1e-3, nstreams*freq*fp_ipc*1e-3);
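The CPU peak numbers printed here come from the same kind of back-of-the-envelope formula: streams × frequency (GHz) × flops per cycle, scaled from Gflop/s to Tflop/s by 1e-3. A quick standalone check with invented figures, assuming dp_ipc and fp_ipc are double- and single-precision flops per cycle per stream (which is how the expression reads; the names below mirror the source but the values do not):

#include <stdio.h>

int main(void)
{
    /* Invented example: 64 streams at 2.0 GHz, two 256-bit FMA units per
     * core => 16 fp64 / 32 fp32 flops per cycle. */
    int    nstreams = 64;
    double freq     = 2.0;   /* GHz */
    double dp_ipc   = 16.0;  /* fp64 flops per cycle */
    double fp_ipc   = 32.0;  /* fp32 flops per cycle */

    /* Same expression as the caps printout: Gflop/s * 1e-3 -> Tflop/s. */
    printf("Peak Tflop/s : fp64: %-8.3f fp32: %-8.3f\n",
           nstreams * freq * dp_ipc * 1e-3,    /* 2.048 */
           nstreams * freq * fp_ipc * 1e-3);   /* 4.096 */
    return 0;
}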
71 changes: 32 additions & 39 deletions parsec/mca/device/device_gpu.c
@@ -597,9 +597,9 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
int rc;
(void)eltsize;

size_t how_much_we_allocate;
size_t alloc_size;
size_t total_mem, initial_free_mem;
uint32_t mem_elem_per_gpu = 0;
size_t mem_elem_per_gpu = 0;

rc = gpu_device->set_device(gpu_device);
if(PARSEC_SUCCESS != rc)
@@ -616,25 +616,27 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
gpu_device->super.device_index, gpu_device->super.name);
return PARSEC_ERROR;
} else {
how_much_we_allocate = number_blocks * eltsize;
alloc_size = number_blocks * eltsize;
}
} else {
/** number_blocks == -1 means memory_percentage is used */
how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
}
if( how_much_we_allocate > initial_free_mem ) {
/** Handle the case of jokers who require more than 100% of memory,
* and eleventh case of computer scientists who don't know how
* to divide a number by another
*/
parsec_warning("GPU[%d:%s] Requested %zd bytes on GPU device, but only %zd bytes are available -- reducing allocation to max available",
gpu_device->super.device_index, gpu_device->super.name, how_much_we_allocate, initial_free_mem);
how_much_we_allocate = initial_free_mem;
}
if( how_much_we_allocate < eltsize ) {
/** Handle another kind of jokers entirely, and cases of
* not enough memory on the device
*/
/* number_blocks == -1 means memory_percentage is used */
alloc_size = (memory_percentage * initial_free_mem) / 100;
/* round-up in eltsize */
alloc_size = eltsize * ((alloc_size + eltsize - 1 ) / eltsize);
}
if( alloc_size >= initial_free_mem ) {
/* Mapping more than 100% of GPU memory is obviously wrong.
* Mapping exactly 100% of the GPU memory ends up producing errors such as "__global__ function call is not configured".
* Mapping 95% works with low-end GPUs like the 1060; how much to leave available for the GPU runtime is unclear. */
parsec_warning("GPU[%d:%s] Requested %zd bytes on GPU device, but only %zd bytes are available -- reducing allocation to 95%% of max available",
gpu_device->super.device_index, gpu_device->super.name, alloc_size, initial_free_mem);
alloc_size = (95 * initial_free_mem) / 100;
/* round-up in eltsize */
alloc_size = eltsize * ((alloc_size + eltsize - 1 ) / eltsize);
}
if( alloc_size < eltsize ) {
/* Handle another kind of jokers entirely, and cases of
* not enough memory on the device */
parsec_warning("GPU[%d:%s] Cannot allocate at least one element",
gpu_device->super.device_index, gpu_device->super.name);
return PARSEC_ERROR;
@@ -647,15 +649,15 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
* during the computations
*/
while( (free_mem > eltsize )
&& ((total_mem - free_mem) < how_much_we_allocate) ) {
&& ((total_mem - free_mem) < alloc_size) ) {
parsec_gpu_data_copy_t* gpu_elem;
void *device_ptr;

rc = gpu_device->memory_allocate(gpu_device, eltsize, &device_ptr);
if(PARSEC_SUCCESS != rc) {
size_t _free_mem, _total_mem;
gpu_device->memory_info(gpu_device, &_free_mem, &_total_mem );
parsec_inform("GPU[%d:%s] Per context: free mem %zu total mem %zu (allocated tiles %u)",
parsec_inform("GPU[%d:%s] Per context: free mem %zu total mem %zu (allocated tiles %zu)",
gpu_device->super.device_index, gpu_device->super.name,_free_mem, _total_mem, mem_elem_per_gpu);
break;
}
@@ -679,43 +681,34 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
}
else {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u tiles on the GPU memory",
"GPU[%d:%s] Allocate %zu tiles on the GPU memory",
gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu );
}
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u tiles on the GPU memory", gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu);
"GPU[%d:%s] Allocate %zu tiles on the GPU memory", gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu);
#else
if( NULL == gpu_device->memory ) {
void* base_ptr;
/* We allocate all the memory on the GPU and we use our memory management. */
/* This computation leads to allocating more than available if we asked for more than GPU memory */
mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
size_t total_size = (size_t)mem_elem_per_gpu * eltsize;

if (total_size > initial_free_mem) {
/* Mapping more than 100% of GPU memory is obviously wrong */
/* Mapping exactly 100% of the GPU memory ends up producing errors about __global__ function call is not configured */
/* Mapping 95% works with low-end GPUs like 1060, how much to let available for gpu runtime, I don't know how to calculate */
total_size = (size_t)((int)(.9*initial_free_mem / eltsize)) * eltsize;
mem_elem_per_gpu = total_size / eltsize;
}
rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr);

rc = gpu_device->memory_allocate(gpu_device, alloc_size, &base_ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("GPU[%d:%s] Allocating %zu bytes of memory on the GPU device failed",
gpu_device->super.device_index, gpu_device->super.name, total_size);
gpu_device->super.device_index, gpu_device->super.name, alloc_size);
gpu_device->memory = NULL;
return PARSEC_ERROR;
}

assert(alloc_size % eltsize == 0); /* we rounded up earlier... */
mem_elem_per_gpu = alloc_size / eltsize;
gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );

if( gpu_device->memory == NULL ) {
parsec_warning("GPU[%d:%s] Cannot allocate memory on GPU %s. Skip it!",
gpu_device->super.device_index, gpu_device->super.name, gpu_device->super.name);
return PARSEC_ERROR;
}

PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u segments of size %d on the GPU memory",
"GPU[%d:%s] Allocate %zu segments of size %zu on the GPU memory",
gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu, eltsize );
}
#endif
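Taken together, the new reservation logic sizes the pool as either an explicit block count or a percentage of the currently free memory, clamps an over-request to 95% of what is free, and rounds the byte budget up to a whole number of eltsize elements, which is why the later alloc_size / eltsize division (and the assert before zone_malloc_init) is exact. A condensed standalone sketch of just that policy, with made-up inputs in main (an illustration, not the PaRSEC function):

#include <stddef.h>
#include <stdio.h>

/* Sketch of the sizing policy only: returns the number of eltsize blocks
 * to reserve, or 0 if not even one element fits. */
static size_t sketch_reserve_blocks(size_t initial_free_mem, size_t eltsize,
                                    int number_blocks, int memory_percentage)
{
    size_t alloc_size;

    if (number_blocks != -1) {
        alloc_size = (size_t)number_blocks * eltsize;
    } else {
        /* number_blocks == -1 means memory_percentage is used. */
        alloc_size = ((size_t)memory_percentage * initial_free_mem) / 100;
        alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize); /* round up */
    }
    if (alloc_size >= initial_free_mem) {
        /* Over-requested: fall back to 95% of what is actually free. */
        alloc_size = (95 * initial_free_mem) / 100;
        alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize);
    }
    if (alloc_size < eltsize)
        return 0;
    return alloc_size / eltsize;  /* exact: alloc_size is a multiple of eltsize */
}

int main(void)
{
    /* Made-up inputs: 8 GiB free, 4 MiB tiles, user asks for 120% of memory. */
    size_t blocks = sketch_reserve_blocks((size_t)8 << 30, (size_t)4 << 20, -1, 120);
    printf("%zu blocks reserved\n", blocks);  /* clamped to 95% of the free 8 GiB */
    return 0;
}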
