diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index 4dc516e05..6b44ed7b9 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -40,7 +40,8 @@ int parsec_cuda_max_streams = PARSEC_GPU_MAX_STREAMS;
 int parsec_cuda_memory_block_size, parsec_cuda_memory_percentage, parsec_cuda_memory_number_of_blocks;
 char* parsec_cuda_lib_path = NULL;
 
-static int cuda_mask;
+static int parsec_device_cuda_mask = 0xFF;
+static int parsec_device_cuda_avail = 0;
 static int parsec_cuda_sort_pending;
 
 #if defined(PARSEC_PROF_TRACE)
@@ -104,10 +105,10 @@ static int device_cuda_component_query(mca_base_module_t **module, int *priority
     else
         parsec_device_cuda_component.modules = NULL;
 
-    for( i = j = 0; i < parsec_device_cuda_enabled; i++ ) {
+    for( i = j = 0; (i < parsec_device_cuda_avail) && (j < parsec_device_cuda_enabled); i++ ) {
 
         /* Allow fine grain selection of the GPU's */
-        if( !((1 << i) & cuda_mask) ) continue;
+        if( !((1 << i) & parsec_device_cuda_mask) ) continue;
 
         rc = parsec_cuda_module_init(i, &parsec_device_cuda_component.modules[j]);
         if( PARSEC_SUCCESS != rc ) {
@@ -138,11 +139,24 @@ static int device_cuda_component_register(void)
                                         "The number of CUDA device to enable for the next PaRSEC context (-1 for all available)",
                                         false, false, -1, &parsec_device_cuda_enabled);
     (void)parsec_mca_param_reg_int_name("device_cuda", "mask",
-                                        "The bitwise mask of CUDA devices to be enabled (default all)",
-                                        false, false, 0xffffffff, &cuda_mask);
-    (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
+                                        "The bitwise mask of CUDA devices to be enabled (default all). Leave it untouched to be superseded by CUDA_VISIBLE_DEVICES.",
+                                        false, false, 0xffffffff, &parsec_device_cuda_mask);
+    (void)parsec_mca_param_reg_int_name("device_cuda", "nvlink_mask",
                                         "What devices are allowed to use NVLINK if available (default all)",
                                         false, false, 0xffffffff, &parsec_cuda_nvlink_mask);
+    if( 0xffffffff == parsec_cuda_nvlink_mask ) {
+        char* visible_devs = getenv("CUDA_VISIBLE_DEVICES");
+        if( NULL != visible_devs ) {
+            parsec_cuda_nvlink_mask = 0;
+            while( NULL != visible_devs ) {
+                int idx = atoi(visible_devs);
+                parsec_cuda_nvlink_mask |= (1 << idx);
+                visible_devs = strchr(visible_devs, ",");
+                if( NULL != visible_devs ) visible_devs++;  /* skip the delimiter */
+            }
+        }
+    }
+
     (void)parsec_mca_param_reg_int_name("device_cuda", "verbose",
                                         "Set the verbosity level of the CUDA device (negative value: use debug verbosity), higher is less verbose)\n",
                                         false, false, -1, &parsec_gpu_verbosity);
@@ -185,15 +199,14 @@ static int device_cuda_component_register(void)
 static int device_cuda_component_open(void)
 {
     cudaError_t cudastatus;
-    int ndevices;
 
     if( 0 == parsec_device_cuda_enabled ) {
         return MCA_ERROR;  /* Nothing to do around here */
     }
 
-    cudastatus = cudaGetDeviceCount( &ndevices );
+    cudastatus = cudaGetDeviceCount(&parsec_device_cuda_avail);
    if( cudaErrorNoDevice == (cudaError_t) cudastatus ) {
-        ndevices = 0;
+        parsec_device_cuda_avail = 0;
         /* This is normal on machines with no GPUs, let it flow
          * to do the normal checks vis-a-vis the number of requested
          * devices and issue a warning only when not fulfilling
@@ -208,31 +221,22 @@ static int device_cuda_component_open(void)
             }
         } );
     }
 
 
-    if( ndevices > parsec_device_cuda_enabled ) {
-        if( 0 < parsec_device_cuda_enabled ) {
-            ndevices = parsec_device_cuda_enabled;
-        }
-    } else if (ndevices < parsec_device_cuda_enabled ) {
+    /* Update the number of GPU for the upper layer */
+    if (parsec_device_cuda_avail < parsec_device_cuda_enabled ) {
         if( 0 < parsec_device_cuda_enabled ) {
-            if( 0 == ndevices ) {
+            if( 0 == parsec_device_cuda_avail ) {
                 parsec_warning("User requested %d CUDA devices, but none are available on %s."
                                " CUDA support will be therefore disabled.",
                                parsec_device_cuda_enabled, parsec_hostname);
             } else {
                 parsec_warning("User requested %d CUDA devices, but only %d are available on %s.",
-                               parsec_device_cuda_enabled, ndevices, parsec_hostname);
+                               parsec_device_cuda_enabled, parsec_device_cuda_avail, parsec_hostname);
             }
-            parsec_mca_param_set_int(parsec_device_cuda_enabled_index, ndevices);
         }
+        parsec_mca_param_set_int(parsec_device_cuda_enabled_index, parsec_device_cuda_avail);
     }
 
-    /* Update the number of GPU for the upper layer */
-    parsec_device_cuda_enabled = ndevices;
-    if( 0 == ndevices ) {
-        return MCA_ERROR;
-    }
-
-    return MCA_SUCCESS;
+    return (0 == parsec_device_cuda_avail) ? MCA_ERROR : MCA_SUCCESS;
 }
 
 /**
diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c
index 31176f4a8..5f060bdfc 100644
--- a/tests/runtime/cuda/stress_main.c
+++ b/tests/runtime/cuda/stress_main.c
@@ -2,6 +2,7 @@
 #include "parsec/data_distribution.h"
 #include "parsec/data_dist/matrix/matrix.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
+#include "parsec/utils/mca_param.h"
 #include "stress.h"
 #include "stress_wrapper.h"
 
@@ -15,7 +16,7 @@ int main(int argc, char *argv[])
     parsec_context_t *parsec = NULL;
     parsec_taskpool_t *tp;
     int size = 1;
-    int rank = 0;
+    int rank = 0, nb_gpus = 1;
 
 #if defined(DISTRIBUTED)
     {
@@ -24,6 +25,27 @@ int main(int argc, char *argv[])
     }
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+    {
+        MPI_Comm local_comm;
+        int local_rank, local_size;
+        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                            MPI_INFO_NULL, &local_comm);
+        MPI_Comm_rank(local_comm, &local_rank);
+        MPI_Comm_size(local_comm, &local_size);
+        MPI_Comm_free(&local_comm);
+        int gpu_mask = 0;
+        for (int i = 0; i < nb_gpus; i++)
+        {
+            gpu_mask |= ((1 << local_rank) << i);
+        }
+        char *value;
+        asprintf(&value, "%d", gpu_mask);
+        parsec_setenv_mca_param("device_cuda_mask", value, &environ);
+        free(value);
+        value = NULL;
+    }
+#endif  /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)*/
 #endif  /* DISTRIBUTED */
 
     parsec = parsec_init(-1, &argc, &argv);
diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c
index 4bfd43b5d..f42279534 100644
--- a/tests/runtime/cuda/testing_get_best_device.c
+++ b/tests/runtime/cuda/testing_get_best_device.c
@@ -45,17 +45,10 @@ int main(int argc, char *argv[])
     char **pargv;
 
     /* Default */
-    int m = 0;
-    int N = 8;
-    int NB = 4;
-    int P = 1;
-    int KP = 1;
-    int KQ = 1;
-    int cores = -1;
-    int nb_gpus = 0;
-    int info = 0;
-
-    while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:h")) != -1) {
+    int m = 0, N = 8, NB = 4, P = 1, KP = 1, KQ = 1;
+    int cores = -1, nb_gpus = 0, nb_avail_gpu = 0, info = 0, gpu_mask = 0xFF;
+
+    while ((ch = getopt(argc, argv, "m:N:t:s:S:P:c:g:G:h")) != -1) {
         switch (ch) {
         case 'm': m = atoi(optarg); break;
         case 'N': N = atoi(optarg); break;
@@ -65,6 +58,7 @@
         case 'P': P = atoi(optarg); break;
         case 'c': cores = atoi(optarg); break;
         case 'g': nb_gpus = atoi(optarg); break;
+        case 'G': gpu_mask = atoi(optarg); break;
         case '?': case 'h': default:
             fprintf(stderr,
                     "-m : initialize MPI_THREAD_MULTIPLE (default: 0/no)\n"
@@ -75,6 +69,7 @@
                     "-P : rows (P) in the PxQ process grid (default: 1)\n"
                     "-c : number of cores used (default: -1)\n"
                     "-g : number of GPUs used (default: 0)\n"
+                    "-G : mask of the GPUs to be used (default: 0xff)\n"
                     "-h : print this help message\n"
                     "\n");
             exit(1);
@@ -102,16 +97,20 @@ int main(int argc, char *argv[])
             break;
         }
     }
-
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
     extern char **environ;
     char *value;
     if( nb_gpus < 1 && 0 == rank ) {
-        fprintf(stderr, "Warning: if run on GPUs, please set --gpus=value bigger than 0\n");
+        fprintf(stderr, "Warning: if run on GPUs, please set -g value bigger than 0\n");
     }
     asprintf(&value, "%d", nb_gpus);
     parsec_setenv_mca_param( "device_cuda_enabled", value, &environ );
-    free(value);
+    free(value); value = NULL;
+    if( 0xFF != gpu_mask ) {
+        asprintf(&value, "%d", gpu_mask);
+        parsec_setenv_mca_param("device_cuda_mask", value, &environ);
+        free(value); value = NULL;
+    }
 #endif
 
     /* Initialize PaRSEC */
@@ -134,7 +133,7 @@ int main(int argc, char *argv[])
         }
         cores = nb_total_comp_threads;
     }
-
+    nb_avail_gpu = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
     /* initializing matrix structure */
     parsec_matrix_block_cyclic_t dcA;
     parsec_matrix_block_cyclic_init(&dcA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE,
@@ -153,9 +152,9 @@ int main(int argc, char *argv[])
     /* Main routines */
     SYNC_TIME_START();
     info = parsec_get_best_device_check(parsec, (parsec_tiled_matrix_t *)&dcA);
-    SYNC_TIME_PRINT(rank, ("Get_best_device" "\tN= %d NB= %d "
+    SYNC_TIME_PRINT(rank, ("Get_best_device\tN= %d NB= %d "
                            "PxQ= %d %d KPxKQ= %d %d cores= %d nb_gpus= %d\n",
-                           N, NB, P, nodes/P, KP, KQ, cores, parsec_nb_devices-2));
+                           N, NB, P, nodes / P, KP, KQ, cores, nb_avail_gpu));
 
     /* Check result */
     if( 0 == rank && info != 0 ) {