Fix computation of available memory on gpu (avoid truncation and conversions) and add the amount of reserved mem to the device caps output
abouteiller committed May 16, 2024
1 parent 74d9dcd commit 702e648
Showing 3 changed files with 45 additions and 59 deletions.
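
The device_gpu.c part of the change keeps the reservation arithmetic in size_t end to end: the old code computed the element count through a double multiply and an (int) cast and stored it in a uint32_t, while the new code works on a size_t byte count that is rounded up to whole elements before any division. The standalone sketch below contrasts the two styles; the values and variable contents are made up for illustration and are not PaRSEC code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Made-up sizes for illustration only. */
    size_t initial_free_mem = (size_t)40 << 30;  /* pretend 40 GiB of free device memory */
    size_t eltsize          = (size_t)3 << 20;   /* pretend 3 MiB tiles */

    /* Old style: the byte budget detours through double and an (int) cast,
     * and the element count lives in a 32-bit integer. */
    uint32_t old_count = (uint32_t)(int)(.9 * initial_free_mem / eltsize);

    /* New style: stay in size_t and round the byte budget up to a whole
     * number of elements, so the later division is exact. */
    size_t alloc_size = (90 * initial_free_mem) / 100;
    alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize);
    size_t new_count = alloc_size / eltsize;

    printf("old: %u tiles   new: %zu tiles (%zu bytes)\n",
           (unsigned)old_count, new_count, alloc_size);
    return 0;
}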
26 changes: 9 additions & 17 deletions parsec/mca/device/cuda/device_cuda_module.c
@@ -575,23 +575,15 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
}

if( show_caps ) {
parsec_inform("GPU Device %-8s: %s [capability %d.%d] %s\n"
"\tLocation (PCI Bus/Device/Domain): %x:%x.%x\n"
"\tSM : %d\n"
"\tFrequency (GHz) : %f\n"
"\tpeak Tflop/s : %4.2f fp64,\t%4.2f fp32,\t%4.2f tf32,\t%4.2f fp16\n"
"\tPeak Mem Bw (GB/s) : %.2f [Clock Rate (Ghz) %.2f | Bus Width (bits) %d]\n"
"\tconcurrency : %s\n"
"\tcomputeMode : %d\n",
device->name, szName, cuda_device->major, cuda_device->minor,
device->gflops_guess? "(GUESSED Peak Tflop/s; load imbalance may REDUCE PERFORMANCE)": "",
prop.pciBusID, prop.pciDeviceID, prop.pciDomainID,
streaming_multiprocessor,
freqHz*1e-9f,
fp64*1e-3, fp32*1e-3, tf32*1e-3, fp16*1e-3,
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth,
(concurrency == 1)? "yes": "no",
computemode);
parsec_inform("GPU Device %-8s: %s %.0fGB [pci %x:%x.%x]\n"
"\tFrequency (GHz) : %.2f\t[SM: %d | Capabilities: %d.%d | Concurency %s | ComputeMode %d]\n"
"\tPeak Tflop/s %-5s : fp64: %-8.3f fp32: %-8.3f fp16: %-8.3f tf32: %-8.3f\n"
"\tPeak Mem Bw (GB/s) : %.2f\t[Clock Rate (Ghz) %.2f | Bus Width (bits) %d]\tReserved Pool (GB): %.1f\n",
device->name, szName, prop.totalGlobalMem/1024.f/1024.f/1024.f, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID,
freqHz*1e-9f, streaming_multiprocessor, cuda_device->major, cuda_device->minor,
(concurrency == 1)? "yes": "no", computemode,
device->gflops_guess? "GUESS": "", fp64*1e-3, fp32*1e-3, fp16*1e-3, tf32*1e-3,
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth, gpu_device->mem_block_size*gpu_device->mem_nb_blocks/1024.f/1024.f/1024.f);
}

*module = device;
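For reference, the bandwidth figure in the new caps line is the usual DDR estimate, 2 × memory clock × bus width in bytes, with cudaDeviceProp reporting memoryClockRate in kHz and memoryBusWidth in bits, and the new "Reserved Pool (GB)" column is simply the zone-allocator footprint mem_block_size * mem_nb_blocks. A standalone sketch with illustrative numbers (roughly a GTX 1060-class board; none of the values come from the commit):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    /* Illustrative values, roughly a GTX 1060-class board. */
    int    memoryClockRate = 4004000;          /* kHz, as reported by CUDA */
    int    memoryBusWidth  = 192;              /* bits */
    size_t mem_block_size  = (size_t)2 << 20;  /* made-up 2 MiB zone blocks */
    size_t mem_nb_blocks   = 2500;             /* made-up block count */

    /* Same formula as the caps printout: DDR factor 2, bits -> bytes, kHz -> GB/s. */
    double peak_bw_gbs = 2.0 * memoryClockRate * (memoryBusWidth / 8) / 1.0e6;

    /* Reserved Pool (GB): total bytes handed to the zone allocator. */
    double reserved_gb = (double)(mem_block_size * mem_nb_blocks) / 1024. / 1024. / 1024.;

    printf("Peak Mem Bw (GB/s): %.2f   Reserved Pool (GB): %.1f\n",
           peak_bw_gbs, reserved_gb);   /* prints 192.19 and 4.9 */
    return 0;
}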
7 changes: 4 additions & 3 deletions parsec/mca/device/device.c
@@ -278,14 +278,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
#endif
return PARSEC_SUCCESS;

no_valid_device:
no_valid_device: {
#if !defined(PARSEC_DEBUG_NOISIER)
char tmp[MAX_TASK_STRLEN];
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task);
#endif
parsec_warning("Task %s ran out of valid incarnations. No device selected.",
tmp);
return PARSEC_ERROR;
}
}

PARSEC_OBJ_CLASS_INSTANCE(parsec_device_module_t, parsec_object_t,
@@ -885,8 +886,8 @@ static int cpu_weights(parsec_device_module_t* device, int nstreams)
if( show_caps ) {
parsec_inform("CPU Device: %s\n"
"\tParsec Streams : %d\n"
"\tFrequency (GHz) : %2.2f\n"
"\tPeak Tflop/s : %2.4f fp64,\t%2.4f fp32",
"\tFrequency (GHz) : %.2f\n"
"\tPeak Tflop/s : fp64: %-8.3f fp32: %-8.3f",
cpu_model,
nstreams,
freq, nstreams*freq*dp_ipc*1e-3, nstreams*freq*fp_ipc*1e-3);
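The CPU peak numbers printed here come from the same kind of back-of-the-envelope formula: streams × frequency (GHz) × flops per cycle, scaled from Gflop/s to Tflop/s by 1e-3. A quick standalone check with invented figures, assuming dp_ipc and fp_ipc are double- and single-precision flops per cycle per stream (which is how the expression reads; the names below mirror the source but the values do not):

#include <stdio.h>

int main(void)
{
    /* Invented example: 64 streams at 2.0 GHz, two 256-bit FMA units per
     * core => 16 fp64 / 32 fp32 flops per cycle. */
    int    nstreams = 64;
    double freq     = 2.0;   /* GHz */
    double dp_ipc   = 16.0;  /* fp64 flops per cycle */
    double fp_ipc   = 32.0;  /* fp32 flops per cycle */

    /* Same expression as the caps printout: Gflop/s * 1e-3 -> Tflop/s. */
    printf("Peak Tflop/s : fp64: %-8.3f fp32: %-8.3f\n",
           nstreams * freq * dp_ipc * 1e-3,    /* 2.048 */
           nstreams * freq * fp_ipc * 1e-3);   /* 4.096 */
    return 0;
}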
71 changes: 32 additions & 39 deletions parsec/mca/device/device_gpu.c
@@ -597,9 +597,9 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
int rc;
(void)eltsize;

size_t how_much_we_allocate;
size_t alloc_size;
size_t total_mem, initial_free_mem;
uint32_t mem_elem_per_gpu = 0;
size_t mem_elem_per_gpu = 0;

rc = gpu_device->set_device(gpu_device);
if(PARSEC_SUCCESS != rc)
@@ -616,25 +616,27 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
gpu_device->super.device_index, gpu_device->super.name);
return PARSEC_ERROR;
} else {
how_much_we_allocate = number_blocks * eltsize;
alloc_size = number_blocks * eltsize;
}
} else {
/** number_blocks == -1 means memory_percentage is used */
how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
}
if( how_much_we_allocate > initial_free_mem ) {
/** Handle the case of jokers who require more than 100% of memory,
* and eleventh case of computer scientists who don't know how
* to divide a number by another
*/
parsec_warning("GPU[%d:%s] Requested %zd bytes on GPU device, but only %zd bytes are available -- reducing allocation to max available",
gpu_device->super.device_index, gpu_device->super.name, how_much_we_allocate, initial_free_mem);
how_much_we_allocate = initial_free_mem;
}
if( how_much_we_allocate < eltsize ) {
/** Handle another kind of jokers entirely, and cases of
* not enough memory on the device
*/
/* number_blocks == -1 means memory_percentage is used */
alloc_size = (memory_percentage * initial_free_mem) / 100;
/* round-up in eltsize */
alloc_size = eltsize * ((alloc_size + eltsize - 1 ) / eltsize);
}
if( alloc_size >= initial_free_mem ) {
/* Mapping more than 100% of GPU memory is obviously wrong.
* Mapping exactly 100% of the GPU memory ends up producing errors such as "__global__ function call is not configured".
* Mapping 95% works with low-end GPUs like the 1060; how much to leave available for the GPU runtime is unclear. */
parsec_warning("GPU[%d:%s] Requested %zd bytes on GPU device, but only %zd bytes are available -- reducing allocation to 95%% of max available",
gpu_device->super.device_index, gpu_device->super.name, alloc_size, initial_free_mem);
alloc_size = (95 * initial_free_mem) / 100;
/* round-up in eltsize */
alloc_size = eltsize * ((alloc_size + eltsize - 1 ) / eltsize);
}
if( alloc_size < eltsize ) {
/* Handle another kind of jokers entirely, and cases of
* not enough memory on the device */
parsec_warning("GPU[%d:%s] Cannot allocate at least one element",
gpu_device->super.device_index, gpu_device->super.name);
return PARSEC_ERROR;
@@ -647,15 +649,15 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
* during the computations
*/
while( (free_mem > eltsize )
&& ((total_mem - free_mem) < how_much_we_allocate) ) {
&& ((total_mem - free_mem) < alloc_size) ) {
parsec_gpu_data_copy_t* gpu_elem;
void *device_ptr;

rc = gpu_device->memory_allocate(gpu_device, eltsize, &device_ptr);
if(PARSEC_SUCCESS != rc) {
size_t _free_mem, _total_mem;
gpu_device->memory_info(gpu_device, &_free_mem, &_total_mem );
parsec_inform("GPU[%d:%s] Per context: free mem %zu total mem %zu (allocated tiles %u)",
parsec_inform("GPU[%d:%s] Per context: free mem %zu total mem %zu (allocated tiles %zu)",
gpu_device->super.device_index, gpu_device->super.name,_free_mem, _total_mem, mem_elem_per_gpu);
break;
}
@@ -679,43 +681,34 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
}
else {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u tiles on the GPU memory",
"GPU[%d:%s] Allocate %zu tiles on the GPU memory",
gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu );
}
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u tiles on the GPU memory", gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu);
"GPU[%d:%s] Allocate %zu tiles on the GPU memory", gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu);
#else
if( NULL == gpu_device->memory ) {
void* base_ptr;
/* We allocate all the memory on the GPU and we use our memory management. */
/* This computation leads to allocating more than available if we asked for more than GPU memory */
mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
size_t total_size = (size_t)mem_elem_per_gpu * eltsize;

if (total_size > initial_free_mem) {
/* Mapping more than 100% of GPU memory is obviously wrong */
/* Mapping exactly 100% of the GPU memory ends up producing errors about __global__ function call is not configured */
/* Mapping 95% works with low-end GPUs like 1060, how much to let available for gpu runtime, I don't know how to calculate */
total_size = (size_t)((int)(.9*initial_free_mem / eltsize)) * eltsize;
mem_elem_per_gpu = total_size / eltsize;
}
rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr);

rc = gpu_device->memory_allocate(gpu_device, alloc_size, &base_ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("GPU[%d:%s] Allocating %zu bytes of memory on the GPU device failed",
gpu_device->super.device_index, gpu_device->super.name, total_size);
gpu_device->super.device_index, gpu_device->super.name, alloc_size);
gpu_device->memory = NULL;
return PARSEC_ERROR;
}

assert(alloc_size % eltsize == 0); /* we rounded up earlier... */
mem_elem_per_gpu = alloc_size / eltsize;
gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );

if( gpu_device->memory == NULL ) {
parsec_warning("GPU[%d:%s] Cannot allocate memory on GPU %s. Skip it!",
gpu_device->super.device_index, gpu_device->super.name, gpu_device->super.name);
return PARSEC_ERROR;
}

PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Allocate %u segments of size %d on the GPU memory",
"GPU[%d:%s] Allocate %zu segments of size %zu on the GPU memory",
gpu_device->super.device_index, gpu_device->super.name, mem_elem_per_gpu, eltsize );
}
#endif
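Taken together, the new reservation logic sizes the pool as either an explicit block count or a percentage of the currently free memory, clamps an over-request to 95% of what is free, and rounds the byte budget up to a whole number of eltsize elements, which is why the later alloc_size / eltsize division (and the assert before zone_malloc_init) is exact. A condensed standalone sketch of just that policy, with made-up inputs in main (an illustration, not the PaRSEC function):

#include <stddef.h>
#include <stdio.h>

/* Sketch of the sizing policy only: returns the number of eltsize blocks
 * to reserve, or 0 if not even one element fits. */
static size_t sketch_reserve_blocks(size_t initial_free_mem, size_t eltsize,
                                    int number_blocks, int memory_percentage)
{
    size_t alloc_size;

    if (number_blocks != -1) {
        alloc_size = (size_t)number_blocks * eltsize;
    } else {
        /* number_blocks == -1 means memory_percentage is used. */
        alloc_size = ((size_t)memory_percentage * initial_free_mem) / 100;
        alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize); /* round up */
    }
    if (alloc_size >= initial_free_mem) {
        /* Over-requested: fall back to 95% of what is actually free. */
        alloc_size = (95 * initial_free_mem) / 100;
        alloc_size = eltsize * ((alloc_size + eltsize - 1) / eltsize);
    }
    if (alloc_size < eltsize)
        return 0;
    return alloc_size / eltsize;  /* exact: alloc_size is a multiple of eltsize */
}

int main(void)
{
    /* Made-up inputs: 8 GiB free, 4 MiB tiles, user asks for 120% of memory. */
    size_t blocks = sketch_reserve_blocks((size_t)8 << 30, (size_t)4 << 20, -1, 120);
    printf("%zu blocks reserved\n", blocks);  /* clamped to 95% of the free 8 GiB */
    return 0;
}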
