ICLDisco · abouteiller · Apr 1, 2024 · Mar 8, 2024 · Mar 11, 2024 · Mar 11, 2024
@@ -307,6 +307,7 @@ void parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data,
                          "DEV[%d]: end transfer ownership of data %p to copy %p in mode %d",
                          device, data, copy, access_mode);
     assert( NULL != copy );
+    assert(copy->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER /* this must be set by the caller */);
     if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
         copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
     }
@@ -400,6 +401,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
                  data->owner_device = -1;
             }
             if( PARSEC_DATA_COHERENCY_EXCLUSIVE == data->device_copies[i]->coherency_state ) {
+                assert(data->device_copies[i]->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
                 data->device_copies[i]->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
             }
         }
@@ -410,6 +412,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
         for( i = 0; i < parsec_nb_devices; i++ ) {
             if( NULL == data->device_copies[i] ) continue;
             if( PARSEC_DATA_COHERENCY_INVALID == data->device_copies[i]->coherency_state ) continue;
+            assert(data->device_copies[i]->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
             data->device_copies[i]->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
         }
     }
@@ -426,6 +429,9 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
     }
 
     assert( -1 != valid_copy );
+    /* transfer is required, mark the destination copy invalid until
+     * end_transfer_ownership removes the UNDER_TRANSFER flag. */
+    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
     return valid_copy;
 }
 

@@ -68,7 +68,14 @@ struct parsec_data_copy_s {
     void                     *device_private;        /**< The pointer to the device-specific data.
                                                       *   Overlay data distributions assume that arithmetic
                                                       *   can be done on these pointers. */
-    parsec_data_status_t      data_transfer_status;  /** three status */
+    parsec_data_status_t      data_transfer_status;  /**< Have we scheduled a communication to update this data yet?
+                                                      *   Possible values are NOT_TRANSFER, UNDER_TRANSFER, TRANSFER_COMPLETE.
+                                                      *   NB: this closely follows, but is not equivalent to,
+                                                      *   the coherency_state INVALID. A data copy that is 'under transfer'
+                                                      *   is always INVALID. However, a data copy that is INVALID could be
+                                                      *   so for many reasons, not necessarily because a transfer is ongoing.
+                                                      *   We use this transfer_status to guard scheduling multiple transfers
+                                                      *   on the same data. */
     parsec_datatype_t         dtt;                   /**< the appropriate type for the network engine to send an element */
 };
 

@@ -35,12 +35,12 @@ static int device_cuda_component_query(mca_base_module_2_0_0_t **module, int *pr
 static int device_cuda_component_register(void);
 
 /* mca params */
-int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled;
+int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled, parsec_cuda_nvlink_mask;
 int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
 int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
 char* parsec_cuda_lib_path = NULL;
 
-static int cuda_mask, cuda_nvlink_mask;
+static int cuda_mask;
 static int parsec_cuda_sort_pending;
 
 #if defined(PARSEC_PROF_TRACE)
@@ -122,37 +122,6 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
         parsec_device_cuda_component.modules[j] = NULL;
     }
 
-    parsec_device_cuda_module_t *source_gpu, *target_gpu;
-    cudaError_t cudastatus;
-
-    for( i = 0; i < parsec_device_cuda_enabled && NULL != (source_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[i]); i++ ) {
-        int canAccessPeer;
-        source_gpu->super.peer_access_mask = 0;
-
-        if( ! ( (1<<i) & cuda_nvlink_mask ) )
-            continue; /* The user disabled NVLINK for that GPU */
-
-        cudastatus = cudaSetDevice( source_gpu->cuda_index );
-        PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaSetDevice", cudastatus,
-                                 {continue;} );
-
-        for( j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) {
-            if( i == j ) continue;
-
-            /* Communication mask */
-            cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index );
-            PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cudaDeviceCanAccessPeer", cudastatus,
-                                     {continue;} );
-            if( 1 == canAccessPeer ) {
-                cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 );
-                PARSEC_CUDA_CHECK_ERROR( "(parsec_device_cuda_component_query) cuCtxEnablePeerAccess", cudastatus,
-                                         {continue;} );
-                source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask | (int16_t)(1 <<
-                        target_gpu->super.super.device_index));
-            }
-        }
-    }
-
     parsec_device_enable_debug();
 
     /* module type should be: const mca_base_module_t ** */
@@ -173,7 +142,7 @@ static int device_cuda_component_register(void)
                                         false, false, 0xffffffff, &cuda_mask);
      (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
                                         "What devices are allowed to use NVLINK if available (default all)",
-                                        false, false, 0xffffffff, &cuda_nvlink_mask);
+                                        false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
     (void)parsec_mca_param_reg_int_name("device_cuda", "verbose",
                                         "Set the verbosity level of the CUDA device (negative value: use debug verbosity), higher is less verbose)\n",
                                         false, false, -1, &parsec_gpu_verbosity);

@@ -14,7 +14,7 @@
 BEGIN_C_DECLS
 
 /* From MCA parameters */
-extern int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled;
+extern int parsec_device_cuda_enabled_index, parsec_device_cuda_enabled, parsec_cuda_nvlink_mask;
 extern int parsec_cuda_max_streams;
 extern int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
 extern char* parsec_cuda_lib_path;

@@ -141,6 +141,41 @@ static int parsec_cuda_device_lookup_cudamp_floprate(const struct cudaDeviceProp
     return PARSEC_SUCCESS;
 }
 
+/**
+ * @brief Device callback completing the initialization of a CUDA device:
+ *   rebuilds the peer_access_mask of the device by probing, and enabling,
+ *   peer access toward every other module of the CUDA component.
+ *
+ * @param [INOUT]device: the CUDA device module to complete. Its
+ *   peer_access_mask is reset to 0, then one bit is set per peer (at the
+ *   position given by the peer's device_index) for which peer access is
+ *   enabled.
+ * @return PARSEC_SUCCESS, including when NVLINK is masked out for this
+ *   device or when some peers could not be probed/enabled (those peers are
+ *   simply left out of the mask); PARSEC_ERR_DEVICE when cudaSetDevice
+ *   fails for this device.
+ */
+static int parsec_cuda_all_devices_attached(parsec_device_module_t *device)
+{
+    parsec_device_cuda_module_t *source_gpu = (parsec_device_cuda_module_t*)device;
+    parsec_device_cuda_module_t *target_gpu;
+    cudaError_t cudastatus;
+    /* NOTE(review): the nvlink mask bit is selected with the PaRSEC
+     * device_index of this module, not with its cuda_index -- confirm this
+     * is the numbering expected by users of the nvlink_mask MCA parameter. */
+    int i = device->device_index;
+
+    source_gpu->super.peer_access_mask = 0;
+
+    if( ! ( (1<<i) & parsec_cuda_nvlink_mask ) )
+        return PARSEC_SUCCESS; /* The user disabled NVLINK for that GPU */
+
+    cudastatus = cudaSetDevice( source_gpu->cuda_index );
+    PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaSetDevice", cudastatus,
+                            {return PARSEC_ERR_DEVICE;} );
+
+    for( int j = 0; NULL != (target_gpu = (parsec_device_cuda_module_t*)parsec_device_cuda_component.modules[j]); j++ ) {
+        /* Reset for every peer so a failed query can never be read as a
+         * stale (or uninitialized) positive answer. */
+        int canAccessPeer = 0;
+
+        if( target_gpu == source_gpu ) continue;
+
+        /* Communication mask */
+        cudastatus = cudaDeviceCanAccessPeer( &canAccessPeer, source_gpu->cuda_index, target_gpu->cuda_index );
+        PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaDeviceCanAccessPeer", cudastatus,
+                                 {continue;} );
+        /* NOTE(review): if PARSEC_CUDA_CHECK_ERROR wraps its CODE argument
+         * in a do/while(0), the 'continue' above only leaves the macro.
+         * The explicit tests below skip the peer whatever the macro's shape. */
+        if( cudaSuccess != cudastatus ) continue;
+        if( 1 != canAccessPeer ) continue;
+
+        cudastatus = cudaDeviceEnablePeerAccess( target_gpu->cuda_index, 0 );
+        if( cudaErrorPeerAccessAlreadyEnabled == cudastatus ) {
+            /* Peer access toward this target was enabled earlier: this is
+             * the state we want, so clear the runtime's last error and
+             * treat the call as a success. */
+            (void)cudaGetLastError();
+            cudastatus = cudaSuccess;
+        }
+        PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_all_devices_attached) cudaDeviceEnablePeerAccess", cudastatus,
+                                 {continue;} );
+        if( cudaSuccess != cudastatus ) continue;
+
+        /* Record the peer at the bit position of its PaRSEC device index */
+        source_gpu->super.peer_access_mask = (int16_t)(source_gpu->super.peer_access_mask |
+            (int16_t)(1 << target_gpu->super.super.device_index));
+    }
+    return PARSEC_SUCCESS;
+}
+
 static int
 parsec_cuda_memory_register(parsec_device_module_t* device, parsec_data_collection_t* desc,
                             void* ptr, size_t length)
@@ -522,6 +557,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
 
     device->memory_register          = parsec_cuda_memory_register;
     device->memory_unregister        = parsec_cuda_memory_unregister;
+    device->all_devices_attached     = parsec_cuda_all_devices_attached;
     gpu_device->set_device       = parsec_cuda_set_device;
     gpu_device->memcpy_async     = parsec_cuda_memcpy_async;
     gpu_device->event_record     = parsec_cuda_event_record;

@@ -660,6 +660,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
 
     if(parsec_mca_device_are_freezed)
         return PARSEC_ERR_NOT_SUPPORTED;
+    parsec_mca_device_are_freezed = 1;
 
     for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
         parsec_device_module_t* device = parsec_devices[i];
@@ -688,9 +689,11 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
         device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64;
         parsec_debug_verbose(6, parsec_device_output, "  Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s",
                              i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? "GUESSED": "");
+        if(NULL != device->all_devices_attached) {
+            device->all_devices_attached(device);
+        }
     }
 
-    parsec_mca_device_are_freezed = 1;
     return PARSEC_SUCCESS;
 }
 

@@ -129,6 +129,16 @@ typedef int  (*parsec_device_sort_pending_list_function_f)(parsec_device_module_
  */
 typedef parsec_hook_return_t (*parsec_device_kernel_scheduler_function_t)( parsec_device_module_t *module, parsec_execution_stream_t *es, void *task);
 
+/**
+ * @brief Callback to complete initialization of a device after all
+ *   other devices have done their initialization/attachment
+ *   Typically used to compute the interconnect matrix between devices
+ *
+ *   The callback is optional: parsec_mca_device_registration_complete()
+ *   invokes it, once per device, only when the all_devices_attached field
+ *   of the device module is not NULL, and currently does not inspect the
+ *   value it returns.
+ *
+ * @param [INOUT]module: the module to complete the initialization
+ * @return PARSEC_SUCCESS or an error code
+ */
+typedef int (*parsec_device_all_devices_attached_f)(parsec_device_module_t *module);
+
 struct parsec_device_module_s {
     parsec_object_t                        super;
     const parsec_device_base_component_t  *component;
@@ -144,6 +154,7 @@ struct parsec_device_module_s {
     parsec_device_find_function_f          find_function;
     parsec_device_sort_pending_list_function_f sort_pending_list;
     parsec_device_kernel_scheduler_function_t  kernel_scheduler;
+    parsec_device_all_devices_attached_f   all_devices_attached;
 
     parsec_info_object_array_t             infos; /**< Per-device info objects are stored here */
     struct parsec_context_s* context;  /**< The PaRSEC context this device belongs too */