Skip to content

Commit

Permalink
Merge pull request #666 from therault/evicitons-in-gpu-device-statistics
Browse files Browse the repository at this point in the history
Add the number of evicted data copies to the per-device statistics.
  • Loading branch information
bosilca authored Sep 9, 2024
2 parents a5f49ab + 4602cc2 commit dcba0c0
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 12 deletions.
1 change: 1 addition & 0 deletions parsec/mca/device/cuda/device_cuda_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
device->data_out_to_host = 0;
device->required_data_in = 0;
device->required_data_out = 0;
device->nb_evictions = 0;

device->attach = parsec_device_attach;
device->detach = parsec_device_detach;
Expand Down
32 changes: 20 additions & 12 deletions parsec/mca/device/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,10 @@ void parsec_compute_best_unit( uint64_t length, float* updated_value, char** bes

void parsec_devices_save_statistics(uint64_t **pstats) {
if(NULL == *pstats) {
*pstats = (uint64_t*)calloc(sizeof(uint64_t), parsec_nb_devices * 6 /* see below for the number of arrays */);
*pstats = (uint64_t*)calloc(sizeof(uint64_t), parsec_nb_devices * 7 /* see below for the number of arrays */);
}
else {
memset(*pstats, 0, parsec_nb_devices * sizeof(uint64_t) * 6);
memset(*pstats, 0, parsec_nb_devices * sizeof(uint64_t) * 7);
}
uint64_t *stats = *pstats;
uint64_t *executed_tasks = stats;
Expand All @@ -420,12 +420,14 @@ void parsec_devices_save_statistics(uint64_t **pstats) {
uint64_t *req_in = stats + 3*parsec_nb_devices;
uint64_t *req_out = stats + 4*parsec_nb_devices;
uint64_t *transfer_d2d = stats + 5*parsec_nb_devices;
uint64_t *nb_evictions = stats + 6*parsec_nb_devices;

for(uint32_t i = 0; i < parsec_nb_devices; i++) {
parsec_device_module_t *device = parsec_devices[i];
if(NULL == device) continue;
assert( i == device->device_index );
executed_tasks[i] = device->executed_tasks;
nb_evictions[i] = device->nb_evictions;
transfer_in[i] = device->data_in_from_device[0]; /* cpu-core device */
transfer_out[i] = device->data_out_to_host;
req_in[i] = device->required_data_in;
Expand All @@ -446,6 +448,7 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
uint64_t *end_stats = NULL;
uint64_t total_tasks = 0, total_data_in = 0, total_data_out = 0;
uint64_t total_required_in = 0, total_required_out = 0, total_d2d = 0;
uint64_t total_evicted = 0;
float gtotal = 0.0;
float best_data_in, best_data_out, best_d2d;
float best_required_in, best_required_out;
Expand All @@ -457,7 +460,7 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
/* initialize the arrays */
parsec_devices_save_statistics(&end_stats);
if(NULL != start_stats) {
for(i = 0; i < parsec_nb_devices * 6; i++) {
for(i = 0; i < parsec_nb_devices * 7; i++) {
assert(end_stats[i] >= start_stats[i]);
end_stats[i] -= start_stats[i];
}
Expand All @@ -468,6 +471,7 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
uint64_t *required_in = end_stats + 3*parsec_nb_devices;
uint64_t *required_out = end_stats + 4*parsec_nb_devices;
uint64_t *transferred_d2d = end_stats + 5*parsec_nb_devices;
uint64_t *nb_evictions = end_stats + 6*parsec_nb_devices;

/* Compute total statistics */
for(i = 0; i < parsec_nb_devices; i++) {
Expand All @@ -479,17 +483,18 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
total_required_in += required_in[i];
total_required_out += required_out[i];
total_d2d += transferred_d2d[i];
total_evicted += nb_evictions[i];
}

/* Print statistics */
gtotal = (float)total_tasks;
double percent_in, percent_out, percent_d2d;

printf("+----------------------------------------------------------------------------------------------------------------------------+\n");
printf("| | | Data In | Data Out |\n");
printf("|Rank %3d | # KERNEL | %% | Required | Transfered H2D(%%) | Transfered D2D(%%) | Required | Transfered(%%) |\n",
printf("+-----------------------------------------------------------------------------------------------------------------------------------------------+\n");
printf("| | | Data In | Data Out | |\n");
printf("|Rank %3d | # KERNEL | %% | Required | Transfered H2D(%%) | Transfered D2D(%%) | Required | Transfered(%%) | Evictions |\n",
(NULL == parsec_context ? parsec_debug_rank : parsec_context->my_rank));
printf("|---------|-----------|--------|------------|-----------------------|-----------------------|------------|-------------------|\n");
printf("|---------|-----------|--------|------------|-----------------------|-----------------------|------------|-------------------|------------------|\n");
for( i = 0; i < parsec_nb_devices; i++ ) {
if( NULL == (device = parsec_devices[i]) ) continue;

Expand All @@ -503,15 +508,16 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
percent_d2d = (0 == required_in[i])? nan(""): (((double)transferred_d2d[i]) / (double)required_in[i] ) * 100.0;
percent_out = (0 == required_out[i])? nan(""): (((double)transferred_out[i]) / (double)required_out[i] ) * 100.0;

printf("| Dev %2d |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) | %s\n",
printf("| Dev %2d |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) | %10"PRIu64" | %s\n",
device->device_index, executed_tasks[i], (executed_tasks[i]/gtotal)*100.00,
best_required_in, required_in_unit, best_data_in, data_in_unit, percent_in,
best_d2d, d2d_unit, percent_d2d,
best_required_out, required_out_unit, best_data_out, data_out_unit, percent_out,
nb_evictions[i],
device->name );
}

printf("|---------|-----------|--------|------------|-----------------------|-----------------------|------------|-------------------|\n");
printf("|---------|-----------|--------|------------|-----------------------|-----------------------|------------|-------------------|------------------|\n");

parsec_compute_best_unit( total_required_in, &best_required_in, &required_in_unit );
parsec_compute_best_unit( total_required_out, &best_required_out, &required_out_unit );
Expand All @@ -523,12 +529,13 @@ void parsec_devices_print_statistics(parsec_context_t *parsec_context, uint64_t
percent_d2d = (0 == total_required_in)? nan(""): (((double)total_d2d) / (double)total_required_in) * 100.0;
percent_out = (0 == total_required_out)? nan(""): (((double)total_data_out) / (double)total_required_out) * 100.0;

printf("|All Devs |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) |\n",
printf("|All Devs |%10"PRIu64" | %6.2f | %8.2f%2s | %8.2f%2s(%5.2f) | %8.2f%2s(%5.2f) | %8.2f%2s | %8.2f%2s(%5.2f) | %10"PRIu64" |\n",
total_tasks, (total_tasks/gtotal)*100.00,
best_required_in, required_in_unit, best_data_in, data_in_unit, percent_in,
best_d2d, d2d_unit, percent_d2d,
best_required_out, required_out_unit, best_data_out, data_out_unit, percent_out);
printf("+----------------------------------------------------------------------------------------------------------------------------+\n");
best_required_out, required_out_unit, best_data_out, data_out_unit, percent_out,
total_evicted);
printf("+-----------------------------------------------------------------------------------------------------------------------------------------------+\n");

parsec_devices_free_statistics(&end_stats);
}
Expand All @@ -545,6 +552,7 @@ void parsec_mca_device_reset_statistics(parsec_context_t *parsec_context) {
device->data_out_to_host = 0;
device->required_data_in = 0;
device->required_data_out = 0;
device->nb_evictions = 0;
}
}

Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ struct parsec_device_module_s {
uint64_t required_data_out;
uint64_t executed_tasks;
uint64_t nb_data_faults;
uint64_t nb_evictions;
/* We provide the compute capacity of the device in GFlop/s so that conversion to #nanosec in load estimates is straightforward */
/* These compute capacities can be useful for users when providing their own
* time_estimate functions: the user can divide the number of flops for the
Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/device_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
gpu_device->super.device_index, gpu_device->super.name, task_name, this_task->task_class->name, i, lru_gpu_elem);
oldmaster = NULL;
}
gpu_device->super.nb_evictions++;
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
/* Let's free this space, and try again to malloc some space */
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/level_zero/device_level_zero_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ int parsec_level_zero_module_init( int dev_id, parsec_device_level_zero_driver_t
device->data_out_to_host = 0;
device->required_data_in = 0;
device->required_data_out = 0;
device->nb_evictions = 0;

device->attach = parsec_device_attach;
device->detach = parsec_device_detach;
Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/template/device_template_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ parsec_device_template_module_init( int deviceid, parsec_device_module_t** modul
device->super.transferred_data_out = 0;
device->super.required_data_in = 0;
device->super.required_data_out = 0;
device->super.nb_evictions = 0;

device->super.attach = (parsec_device_attach_f)parsec_device_template_attach;
device->super.detach = (parsec_device_detach_f)parsec_device_template_detach;
Expand Down

0 comments on commit dcba0c0

Please sign in to comment.