Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the lack of direct GPU to GPU communications in multi-device runs. #642

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions parsec/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ void parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data,
"DEV[%d]: end transfer ownership of data %p to copy %p in mode %d",
device, data, copy, access_mode);
assert( NULL != copy );
assert(copy->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER /* this must be set by the caller */);
if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
}
Expand Down Expand Up @@ -400,6 +401,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
data->owner_device = -1;
}
if( PARSEC_DATA_COHERENCY_EXCLUSIVE == data->device_copies[i]->coherency_state ) {
assert(data->device_copies[i]->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
data->device_copies[i]->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
}
}
Expand All @@ -410,6 +412,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
for( i = 0; i < parsec_nb_devices; i++ ) {
if( NULL == data->device_copies[i] ) continue;
if( PARSEC_DATA_COHERENCY_INVALID == data->device_copies[i]->coherency_state ) continue;
assert(data->device_copies[i]->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
data->device_copies[i]->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
}
}
Expand All @@ -426,6 +429,9 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
}

assert( -1 != valid_copy );
/* transfer is required, mark the destination copy invalid until
* end_transfer_ownership removes the UNDER_TRANSFER flag. */
copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
return valid_copy;
}

Expand Down
9 changes: 8 additions & 1 deletion parsec/data_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,14 @@ struct parsec_data_copy_s {
void *device_private; /**< The pointer to the device-specific data.
* Overlay data distributions assume that arithmetic
* can be done on these pointers. */
parsec_data_status_t data_transfer_status; /** three status */
parsec_data_status_t data_transfer_status; /**< Have we scheduled a communication to update this data yet?
* Possible values are NOT_TRANSFER, UNDER_TRANSFER, TRANSFER_COMPLETE.
* NB: this closely follows, but is not equivalent to,
* the coherency_state INVALID. A data copy that is 'under transfer'
* is always INVALID. However, a data copy that is INVALID could be
* so for many reasons, not necessarily because a transfer is ongoing.
* We use this transfer_status to guard scheduling multiple transfers
* on the same data. */
parsec_datatype_t dtt; /**< the appropriate type for the network engine to send an element */
};

Expand Down
37 changes: 3 additions & 34 deletions parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ static int device_cuda_component_query(mca_base_module_2_0_0_t **module, int *pr
static int device_cuda_component_register(void);

/* mca params */
int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled;
int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled, parsec_cuda_nvlink_mask;
int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
char* parsec_cuda_lib_path = NULL;

static int cuda_mask, cuda_nvlink_mask;
static int cuda_mask;
static int parsec_cuda_sort_pending;

#if defined(PARSEC_PROF_TRACE)
Expand Down Expand Up @@ -122,37 +122,6 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
parsec_device_cuda_component.modules[j] = NULL;
}

parsec_device_cuda_module_t *source_gpu, *target_gpu;
cudaError_t cudastatus;

for( i = 0; i < parsec_device_cuda_enabled && NULL != (source_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[i]); i++ ) {
int canAccessPeer;
source_gpu->super.peer_access_mask = 0;

if( ! ( (1<<i) & cuda_nvlink_mask ) )
continue; /* The user disabled NVLINK for that GPU */

cudastatus = cudaSetDevice( source_gpu->cuda_index );
PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaSetDevice", cudastatus,
{continue;} );

for( j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) {
if( i == j ) continue;

/* Communication mask */
cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index );
PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaDeviceCanAccessPeer", cudastatus,
{continue;} );
if( 1 == canAccessPeer ) {
cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 );
PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess", cudastatus,
{continue;} );
source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 <<
target_gpu->super.super.device_index));
}
}
}

parsec_device_enable_debug();

/* module type should be: const mca_base_module_t ** */
Expand All @@ -173,7 +142,7 @@ static int device_cuda_component_register(void)
false, false, 0xffffffff, &cuda_mask);
(void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
"What devices are allowed to use NVLINK if available (default all)",
false, false, 0xffffffff, &cuda_nvlink_mask);
false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
(void)parsec_mca_param_reg_int_name("device_cuda", "verbose",
"Set the verbosity level of the CUDA device (negative value: use debug verbosity), higher is less verbose)\n",
false, false, -1, &parsec_gpu_verbosity);
Expand Down
2 changes: 1 addition & 1 deletion parsec/mca/device/cuda/device_cuda_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
BEGIN_C_DECLS

/* From MCA parameters */
extern int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled;
extern int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled, parsec_cuda_nvlink_mask;
extern int parsec_cuda_max_streams;
extern int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
extern char* parsec_cuda_lib_path;
Expand Down
36 changes: 36 additions & 0 deletions parsec/mca/device/cuda/device_cuda_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,41 @@ static int parsec_cuda_device_lookup_cudamp_floprate(const struct cudaDeviceProp
return PARSEC_SUCCESS;
}

/**
 * @brief Compute the peer-access mask of a CUDA device once all devices
 *        have been attached.
 *
 * Resets the peer_access_mask of the device, then, unless the bit of this
 * device is cleared in parsec_cuda_nvlink_mask, queries every other module
 * registered in parsec_device_cuda_component.modules (NULL-terminated array)
 * and enables peer access toward each one CUDA reports as reachable. Each
 * reachable target contributes the bit (1 << its device_index) to the mask.
 *
 * A failure on a specific target only skips that target; the remaining
 * targets are still probed.
 *
 * @param [INOUT] device: the CUDA device module to complete; it is used as a
 *                        parsec_device_cuda_module_t.
 * @return PARSEC_SUCCESS, including when NVLINK was disabled for this device,
 *         or PARSEC_ERR_DEVICE if the CUDA device could not be selected.
 */
static int parsec_cuda_all_devices_attached(parsec_device_module_t *device)
{
    parsec_device_cuda_module_t *source_gpu = (parsec_device_cuda_module_t*)device;
    parsec_device_cuda_module_t *target_gpu;
    cudaError_t cudastatus;
    int i = device->device_index;

    source_gpu->super.peer_access_mask = 0;

    /* Unsigned arithmetic: with the default mask 0xffffffff a signed
     * (1 << 31) would be undefined behavior. */
    if( ! ( (1U << i) & (unsigned int)parsec_cuda_nvlink_mask ) )
        return PARSEC_SUCCESS;  /* The user disabled NVLINK for that GPU */

    /* cudaDeviceEnablePeerAccess applies to the current device, so select
     * the source GPU before looking at the targets. */
    cudastatus = cudaSetDevice( source_gpu->cuda_index );
    PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaSetDevice", cudastatus,
                             {return PARSEC_ERR_DEVICE;} );

    for( int j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) {
        int canAccessPeer = 0;

        if( target_gpu == source_gpu ) continue;

        /* Communication mask */
        cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index );
        PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaDeviceCanAccessPeer", cudastatus,
                                 {continue;} );
        if( 1 == canAccessPeer ) {
            /* NOTE(review): an "already enabled" status is treated like any
             * other error here and leaves the target out of the mask --
             * confirm this is intended. */
            cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 );
            PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaDeviceEnablePeerAccess", cudastatus,
                                     {continue;} );
            source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask |
                                                           (int16_t)(1 << target_gpu->super.super.device_index));
        }
    }
    return PARSEC_SUCCESS;
}

static int
parsec_cuda_memory_register(parsec_device_module_t* device, parsec_data_collection_t* desc,
void* ptr, size_t length)
Expand Down Expand Up @@ -522,6 +557,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )

device->memory_register = parsec_cuda_memory_register;
device->memory_unregister = parsec_cuda_memory_unregister;
device->all_devices_attached = parsec_cuda_all_devices_attached;
gpu_device->set_device = parsec_cuda_set_device;
gpu_device->memcpy_async = parsec_cuda_memcpy_async;
gpu_device->event_record = parsec_cuda_event_record;
Expand Down
5 changes: 4 additions & 1 deletion parsec/mca/device/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)

if(parsec_mca_device_are_freezed)
return PARSEC_ERR_NOT_SUPPORTED;
parsec_mca_device_are_freezed = 1;

for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
parsec_device_module_t* device = parsec_devices[i];
Expand Down Expand Up @@ -688,9 +689,11 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64;
parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s",
i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? "GUESSED": "");
if(NULL != device->all_devices_attached) {
device->all_devices_attached(device);
}
}

parsec_mca_device_are_freezed = 1;
return PARSEC_SUCCESS;
}

Expand Down
11 changes: 11 additions & 0 deletions parsec/mca/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,16 @@ typedef int (*parsec_device_sort_pending_list_function_f)(parsec_device_module_
*/
typedef parsec_hook_return_t (*parsec_device_kernel_scheduler_function_t)( parsec_device_module_t *module, parsec_execution_stream_t *es, void *task);

/**
 * @brief Callback to complete the initialization of a device after all
 *        other devices have done their initialization/attachment.
 *        Typically used to compute the interconnect matrix between devices.
 *
 * This callback is optional: a device module may leave the corresponding
 * field NULL, in which case no completion step is run for that device.
 *
 * @param [INOUT] module: the module whose initialization must be completed
 * @return PARSEC_SUCCESS or an error code
 */
typedef int (*parsec_device_all_devices_attached_f)(parsec_device_module_t *module);

struct parsec_device_module_s {
parsec_object_t super;
const parsec_device_base_component_t *component;
Expand All @@ -144,6 +154,7 @@ struct parsec_device_module_s {
parsec_device_find_function_f find_function;
parsec_device_sort_pending_list_function_f sort_pending_list;
parsec_device_kernel_scheduler_function_t kernel_scheduler;
parsec_device_all_devices_attached_f all_devices_attached;

parsec_info_object_array_t infos; /**< Per-device info objects are stored here */
struct parsec_context_s* context; /**< The PaRSEC context this device belongs too */
Expand Down
Loading
Loading