diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index 4dc516e05..6b44ed7b9 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
 int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
 char* parsec_cuda_lib_path = NULL;
 
-static int cuda_mask;
+static int parsec_device_cuda_mask = 0xFF;
+static int parsec_device_cuda_avail = 0;
 static int parsec_cuda_sort_pending;
 
 #if defined(PARSEC_PROF_TRACE)
@@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
     else
         parsec_device_cuda_component.modules = NULL;
 
-    for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) {
+    for( i = j = 0; (i < parsec_device_cuda_avail) && (j < parsec_device_cuda_enabled); i++ ) {
 
         /* Allow fine grain selection of the GPU's */
-        if( !((1 << i) & cuda_mask) ) continue;
+        if( !((1 << i) & parsec_device_cuda_mask) ) continue;
 
         rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]);
         if( PARSEC_SUCCESS != rc ) {
@@ -138,11 +139,24 @@ static int device_cuda_component_register(void)
                                         "The number of CUDA device to enable for the next PaRSEC context (-1 for all available)",
                                         false, false, -1, &parsec_device_cuda_enabled);
     (void)parsec_mca_param_reg_int_name("device_cuda", "mask",
-                                        "The bitwise mask of CUDA devices to be enabled (default all)",
-                                        false, false, 0xffffffff, &cuda_mask);
-    (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
+                                        "The bitwise mask of CUDA devices to be enabled (default all). Leave it untouched to be superseded by CUDA_VISIBLE_DEVICES.",
+                                        false, false, 0xffffffff, &parsec_device_cuda_mask);
+    (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
                                         "What devices are allowed to use NVLINK if available (default all)",
                                         false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
+    if( 0xffffffff == parsec_cuda_nvlink_mask ) {
+        char* visible_devs = getenv("CUDA_VISIBLE_DEVICES");
+        if( NULL != visible_devs ) {
+            parsec_cuda_nvlink_mask = 0;
+            while( NULL != visible_devs ) {
+                int idx = atoi(visible_devs);
+                parsec_cuda_nvlink_mask |= (1 << idx);
+                visible_devs = strchr(visible_devs, ",");
+                if( NULL != visible_devs ) visible_devs++;  /* skip the delimiter */
+            }
+        }
+    }
+
     (void)parsec_mca_param_reg_int_name("device_cuda", "verbose",
                                         "Set the verbosity level of the CUDA device (negative value: use debug verbosity), higher is less verbose)\n",
                                         false, false, -1, &parsec_gpu_verbosity);
@@ -185,15 +199,14 @@ static int device_cuda_component_register(void)
 static int device_cuda_component_open(void)
 {
     cudaError_t cudastatus;
-    int ndevices;
 
     if( 0 == parsec_device_cuda_enabled ) {
         return MCA_ERROR;  /* Nothing to do around here */
     }
 
-    cudastatus = cudaGetDeviceCount( &ndevices );
+    cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail);
    if( cudaErrorNoDevice == (cudaError_t) cudastatus ) {
-        ndevices = 0;
+        parsec_device_cuda_avail = 0;
         /* This is normal on machines with no GPUs, let it flow
          * to do the normal checks vis-a-vis the number of requested
          * devices and issue a warning only when not fulfilling
@@ -208,31 +221,22 @@ static int device_cuda_component_open(void)
             }
         } );
     }
 
 
-    if( ndevices > parsec_device_cuda_enabled ) {
-        if( 0 < parsec_device_cuda_enabled ) {
-            ndevices = parsec_device_cuda_enabled;
-        }
-    } else if (ndevices < parsec_device_cuda_enabled ) {
+    /* Update the number of GPU for the upper layer */
+    if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) {
         if( 0 < parsec_device_cuda_enabled ) {
-            if( 0 == ndevices ) {
+            if( 0 == parsec_device_cuda_avail ) {
                 parsec_warning("User requested %d CUDA devices, but none are available on %s."
                                " CUDA support will be therefore disabled.",
                                parsec_device_cuda_enabled, parsec_hostname);
             } else {
                 parsec_warning("User requested %d CUDA devices, but only %d are available on %s.",
-                               parsec_device_cuda_enabled, ndevices, parsec_hostname);
+                               parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname);
             }
-            parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices);
         }
+        parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail);
     }
 
-    /* Update the number of GPU for the upper layer */
-    parsec_device_cuda_enabled = ndevices;
-    if( 0 == ndevices ) {
-        return MCA_ERROR;
-    }
-
-    return MCA_SUCCESS;
+    return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS;
 }
 
 /**
diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c
index 31176f4a8..5f060bdfc 100644
--- a/tests/runtime/cuda/stress_main.c
+++ b/tests/runtime/cuda/stress_main.c
@@ -2,6 +2,7 @@
 #include "parsec/data_distribution.h"
 #include "parsec/data_dist/matrix/matrix.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
+#include "parsec/utils/mca_param.h"
 #include "stress.h"
 #include "stress_wrapper.h"
 
@@ -15,7 +16,7 @@ int main(int argc, char *argv[])
     parsec_context_t *parsec = NULL;
     parsec_taskpool_t *tp;
     int size = 1;
-    int rank = 0;
+    int rank = 0, nb_gpus = 1;
 
 #if defined(DISTRIBUTED)
     {
@@ -24,6 +25,27 @@ int main(int argc, char *argv[])
     }
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+    {
+        MPI_Comm local_comm;
+        int local_rank, local_size;
+        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                            MPI_INFO_NULL, &local_comm);
+        MPI_Comm_rank(local_comm, &local_rank);
+        MPI_Comm_size(local_comm, &local_size);
+        MPI_Comm_free(&local_comm);
+        int gpu_mask = 0;
+        for (int i = 0; i < nb_gpus; i++)
+        {
+            gpu_mask |= ((1 << local_rank) << i);
+        }
+        char *value;
+        asprintf(&value, "%d", gpu_mask);
+        parsec_setenv_mca_param("device_cuda_mask", value, &environ);
+        free(value);
+        value = NULL;
+    }
+#endif  /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)*/
 #endif  /* DISTRIBUTED */
 
     parsec = parsec_init(-1, &argc, &argv);
diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c
index 4bfd43b5d..f42279534 100644
--- a/tests/runtime/cuda/testing_get_best_device.c
+++ b/tests/runtime/cuda/testing_get_best_device.c
@@ -45,17 +45,10 @@ int main(int argc, char *argv[])
     char **pargv;
 
     /* Default */
-    int m = 0;
-    int N = 8;
-    int NB = 4;
-    int P = 1;
-    int KP = 1;
-    int KQ = 1;
-    int cores = -1;
-    int nb_gpus = 0;
-    int info = 0;
-
-    while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:h")) != -1) {
+    int m = 0, N = 8, NB = 4, P = 1, KP = 1, KQ = 1;
+    int cores = -1, nb_gpus = 0, nb_avail_gpu = 0, info = 0, gpu_mask = 0xFF;
+
+    while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:G:h")) != -1) {
         switch (ch) {
         case 'm': m = atoi(optarg); break;
         case 'N': N = atoi(optarg); break;
@@ -65,6 +58,7 @@
         case 'P': P = atoi(optarg); break;
         case 'c': cores = atoi(optarg); break;
         case 'g': nb_gpus = atoi(optarg); break;
+        case 'G': gpu_mask = atoi(optarg); break;
         case '?': case 'h': default:
             fprintf(stderr,
                     "-m : initialize MPI_THREAD_MULTIPLE (default: 0/no)\n"
@@ -75,6 +69,7 @@
                     "-P : rows (P) in the PxQ process grid (default: 1)\n"
                     "-c : number of cores used (default: -1)\n"
                     "-g : number of GPUs used (default: 0)\n"
+                    "-G : mask of the GPUs to be used (default: 0xff)\n"
                     "-h : print this help message\n"
                     "\n");
             exit(1);
@@ -102,16 +97,20 @@ int main(int argc, char *argv[])
             break;
         }
     }
-
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
     extern char **environ;
     char *value;
     if( nb_gpus < 1 && 0 == rank ) {
-        fprintf(stderr, "Warning: if run on GPUs, please set --gpus=value bigger than 0\n");
+        fprintf(stderr, "Warning: if run on GPUs, please set -g value bigger than 0\n");
     }
     asprintf(&value, "%d", nb_gpus);
     parsec_setenv_mca_param( "device_cuda_enabled", value, &environ );
-    free(value);
+    free(value); value = NULL;
+    if( 0xFF != gpu_mask ) {
+        asprintf(&value, "%d", gpu_mask);
+        parsec_setenv_mca_param("device_cuda_mask", value, &environ);
+        free(value); value = NULL;
+    }
 #endif
 
     /* Initialize PaRSEC */
@@ -134,7 +133,7 @@ int main(int argc, char *argv[])
         }
         cores = nb_total_comp_threads;
     }
-
+    nb_avail_gpu = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
     /* initializing matrix structure */
     parsec_matrix_block_cyclic_t dcA;
     parsec_matrix_block_cyclic_init(&dcA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE,
@@ -153,9 +152,9 @@ int main(int argc, char *argv[])
     /* Main routines */
     SYNC_TIME_START();
     info = parsec_get_best_device_check(parsec, (parsec_tiled_matrix_t *)&dcA);
-    SYNC_TIME_PRINT(rank, ("Get_best_device" "\tN= %d NB= %d "
+    SYNC_TIME_PRINT(rank, ("Get_best_device\tN= %d NB= %d "
                            "PxQ= %d %d KPxKQ= %d %d cores= %d nb_gpus= %d\n",
-                           N, NB, P, nodes/P, KP, KQ, cores, parsec_nb_devices-2));
+                           N, NB, P, nodes / P, KP, KQ, cores, nb_avail_gpu));
 
     /* Check result */
     if( 0 == rank && info != 0 ) {