Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow selection of a particular GPU (via the mask). #670

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 28 additions & 24 deletions parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
char* parsec_cuda_lib_path = NULL;

static int cuda_mask;
static int parsec_device_cuda_mask = 0xFF;
static int parsec_device_cuda_avail = 0;
static int parsec_cuda_sort_pending;

#if defined(PARSEC_PROF_TRACE)
Expand Down Expand Up @@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
else
parsec_device_cuda_component.modules = NULL;

for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) {
for( i = j = 0; (i < parsec_device_cuda_avail) && (j < parsec_device_cuda_enabled); i++ ) {

/* Allow fine grain selection of the GPU's */
if( !((1 << i) & cuda_mask) ) continue;
if( !((1 << i) & parsec_device_cuda_mask) ) continue;

rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]);
if( PARSEC_SUCCESS != rc ) {
Expand Down Expand Up @@ -138,11 +139,24 @@ static int device_cuda_component_register(void)
"The number of CUDA device to enable for the next PaRSEC context (-1 for all available)",
false, false, -1, &parsec_device_cuda_enabled);
(void)parsec_mca_param_reg_int_name("device_cuda", "mask",
"The bitwise mask of CUDA devices to be enabled (default all)",
false, false, 0xffffffff, &cuda_mask);
(void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
"The bitwise mask of CUDA devices to be enabled (default all). Leave it untouched to be superseeded by CUDA_VISIBLE_DEVICES.",
false, false, 0xffffffff, &parsec_device_cuda_mask);
(void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
"What devices are allowed to use NVLINK if available (default all)",
false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
/* When the NVLINK mask was left at its default (all bits set), rebuild it
 * from CUDA_VISIBLE_DEVICES: every comma-separated entry is read as a decimal
 * device index and the matching bit of the mask is set.
 * NOTE(review): the help text of the device_cuda "mask" parameter states that
 * it is the one superseded by CUDA_VISIBLE_DEVICES, yet this code updates the
 * NVLINK mask -- confirm which variable is intended. */
if( 0xffffffff == parsec_cuda_nvlink_mask ) {
    char* visible_devs = getenv("CUDA_VISIBLE_DEVICES");
    if( NULL != visible_devs ) {
        parsec_cuda_nvlink_mask = 0;
        while( NULL != visible_devs ) {
            /* NOTE(review): atoi() returns 0 for a non-numeric entry (empty
             * value, UUID-style identifier), which selects device 0 --
             * confirm this is acceptable. */
            int idx = atoi(visible_devs);
            /* The mask is 32 bits wide (see the 0xffffffff default): ignore
             * indices outside of it, as shifting by a negative amount or by
             * the type width or more is undefined behavior. */
            if( (idx >= 0) && (idx < 32) ) {
                parsec_cuda_nvlink_mask |= (1U << idx);
            }
            /* strchr expects the delimiter as a character, not as a string */
            visible_devs = strchr(visible_devs, ',');
            if( NULL != visible_devs ) visible_devs++; /* skip the delimiter */
        }
    }
}

(void)parsec_mca_param_reg_int_name("device_cuda", "verbose",
"Set the verbosity level of the CUDA device (negative value: use debug verbosity), higher is less verbose)\n",
false, false, -1, &parsec_gpu_verbosity);
Expand Down Expand Up @@ -185,15 +199,14 @@ static int device_cuda_component_register(void)
static int device_cuda_component_open(void)
{
cudaError_t cudastatus;
int ndevices;

if( 0 == parsec_device_cuda_enabled ) {
return MCA_ERROR; /* Nothing to do around here */
}

cudastatus = cudaGetDeviceCount( &ndevices );
cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail);
if( cudaErrorNoDevice == (cudaError_t) cudastatus ) {
ndevices = 0;
parsec_device_cuda_avail = 0;
/* This is normal on machines with no GPUs, let it flow
* to do the normal checks vis-a-vis the number of requested
* devices and issue a warning only when not fulfilling
Expand All @@ -208,31 +221,22 @@ static int device_cuda_component_open(void)
} );
}

if( ndevices > parsec_device_cuda_enabled ) {
if( 0 < parsec_device_cuda_enabled ) {
ndevices = parsec_device_cuda_enabled;
}
} else if (ndevices < parsec_device_cuda_enabled ) {
/* Update the number of GPU for the upper layer */
if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) {
if( 0 < parsec_device_cuda_enabled ) {
if( 0 == ndevices ) {
if( 0 == parsec_device_cuda_avail ) {
parsec_warning("User requested %d CUDA devices, but none are available on %s."
" CUDA support will be therefore disabled.",
parsec_device_cuda_enabled, parsec_hostname);
} else {
parsec_warning("User requested %d CUDA devices, but only %d are available on %s.",
parsec_device_cuda_enabled, ndevices, parsec_hostname);
parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname);
}
parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices);
}
parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail);
}

/* Update the number of GPU for the upper layer */
parsec_device_cuda_enabled = ndevices;
if( 0 == ndevices ) {
return MCA_ERROR;
}

return MCA_SUCCESS;
return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS;
}

/**
Expand Down
24 changes: 23 additions & 1 deletion tests/runtime/cuda/stress_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "parsec/data_distribution.h"
#include "parsec/data_dist/matrix/matrix.h"
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
#include "parsec/utils/mca_param.h"

#include "stress.h"
#include "stress_wrapper.h"
Expand All @@ -15,7 +16,7 @@ int main(int argc, char *argv[])
parsec_context_t *parsec = NULL;
parsec_taskpool_t *tp;
int size = 1;
int rank = 0;
int rank = 0, nb_gpus = 1;

#if defined(DISTRIBUTED)
{
Expand All @@ -24,6 +25,27 @@ int main(int argc, char *argv[])
}
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
    {
        /* Export the device_cuda_mask MCA parameter before parsec_init() so
         * that each process sharing a node gets its own window of nb_gpus
         * consecutive GPU bits, starting at the bit matching its rank in the
         * node-local (shared memory) communicator. */
        extern char **environ;
        MPI_Comm local_comm;
        int local_rank;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                            MPI_INFO_NULL, &local_comm);
        MPI_Comm_rank(local_comm, &local_rank);
        MPI_Comm_free(&local_comm);
        int gpu_mask = 0;
        for (int i = 0; i < nb_gpus; i++) {
            int bit = local_rank + i;
            /* Shifting a signed 1 into or past the sign bit is undefined:
             * bits that do not fit in the int mask are left unset. */
            if (bit < 31) {
                gpu_mask |= (1 << bit);
            }
        }
        char *value = NULL;
        /* On failure asprintf leaves the buffer undefined: neither use nor
         * free it in that case. */
        if (asprintf(&value, "%d", gpu_mask) >= 0) {
            parsec_setenv_mca_param("device_cuda_mask", value, &environ);
            free(value);
            value = NULL;
        }
    }
#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)*/
#endif /* DISTRIBUTED */

parsec = parsec_init(-1, &argc, &argv);
Expand Down
33 changes: 16 additions & 17 deletions tests/runtime/cuda/testing_get_best_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,10 @@ int main(int argc, char *argv[])
char **pargv;

/* Default */
int m = 0;
int N = 8;
int NB = 4;
int P = 1;
int KP = 1;
int KQ = 1;
int cores = -1;
int nb_gpus = 0;
int info = 0;

while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:h")) != -1) {
int m = 0, N = 8, NB = 4, P = 1, KP = 1, KQ = 1;
int cores = -1, nb_gpus = 0, nb_avail_gpu = 0, info = 0, gpu_mask = 0xFF;

while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:G:h")) != -1) {
switch (ch) {
case 'm': m = atoi(optarg); break;
case 'N': N = atoi(optarg); break;
Expand All @@ -65,6 +58,7 @@ int main(int argc, char *argv[])
case 'P': P = atoi(optarg); break;
case 'c': cores = atoi(optarg); break;
case 'g': nb_gpus = atoi(optarg); break;
case 'G': gpu_mask = atoi(optarg); break;
case '?': case 'h': default:
fprintf(stderr,
"-m : initialize MPI_THREAD_MULTIPLE (default: 0/no)\n"
Expand All @@ -75,6 +69,7 @@ int main(int argc, char *argv[])
"-P : rows (P) in the PxQ process grid (default: 1)\n"
"-c : number of cores used (default: -1)\n"
"-g : number of GPUs used (default: 0)\n"
"-G : mask of the GPUs to be used (default: 0xff)"
"-h : print this help message\n"
"\n");
exit(1);
Expand Down Expand Up @@ -102,16 +97,20 @@ int main(int argc, char *argv[])
break;
}
}

#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
extern char **environ;
char *value;
if( nb_gpus < 1 && 0 == rank ) {
fprintf(stderr, "Warning: if run on GPUs, please set --gpus=value bigger than 0\n");
fprintf(stderr, "Warning: if run on GPUs, please set -g value bigger than 0\n");
}
asprintf(&value, "%d", nb_gpus);
parsec_setenv_mca_param( "device_cuda_enabled", value, &environ );
free(value);
free(value); value = NULL;
if( 0xFF != gpu_mask ) {
asprintf(&value, "%d", gpu_mask);
parsec_setenv_mca_param("device_cuda_mask", value, &environ);
free(value); value = NULL;
}
#endif

/* Initialize PaRSEC */
Expand All @@ -134,7 +133,7 @@ int main(int argc, char *argv[])
}
cores = nb_total_comp_threads;
}

nb_avail_gpu = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
/* initializing matrix structure */
parsec_matrix_block_cyclic_t dcA;
parsec_matrix_block_cyclic_init(&dcA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE,
Expand All @@ -153,9 +152,9 @@ int main(int argc, char *argv[])
/* Main routines */
SYNC_TIME_START();
info = parsec_get_best_device_check(parsec, (parsec_tiled_matrix_t *)&dcA);
SYNC_TIME_PRINT(rank, ("Get_best_device" "\tN= %d NB= %d "
SYNC_TIME_PRINT(rank, ("Get_best_device\tN= %d NB= %d "
"PxQ= %d %d KPxKQ= %d %d cores= %d nb_gpus= %d\n",
N, NB, P, nodes/P, KP, KQ, cores, parsec_nb_devices-2));
N, NB, P, nodes / P, KP, KQ, cores, nb_avail_gpu));

/* Check result */
if( 0 == rank && info != 0 ) {
Expand Down
Loading