diff --git a/cudampilib/cudampilib.c b/cudampilib/cudampilib.c
index c79c12e..deb4ddf 100644
--- a/cudampilib/cudampilib.c
+++ b/cudampilib/cudampilib.c
@@ -72,6 +72,7 @@ float __cudampi__globalpowerlimit;
 int powermeasurecounter[__CUDAMPI_MAX_THREAD_COUNT] = {0};
 
 int __cudampi__batch_size;
+int __cudampi__cpu_enabled;
 
 extern struct __cudampi__arguments_type __cudampi__arguments;
 static char doc[] = "Cudampi program";
@@ -507,7 +508,10 @@ void __cudampi__initializeMPI(int argc, char **argv) {
   }
 
   __cudampi__batch_size = __cudampi__arguments.batch_size;
+  __cudampi__cpu_enabled = __cudampi__arguments.cpu_enabled;
   MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&__cudampi__cpu_enabled, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
   MPI_Allgather(&__cudampi__localGpuDeviceCount, 1, MPI_INT, __cudampi__GPUcountspernode, 1, MPI_INT, MPI_COMM_WORLD);
 
@@ -516,13 +520,6 @@ void __cudampi__initializeMPI(int argc, char **argv) {
 
   MPI_Allgather(&__cudampi__localFreeThreadCount, 1, MPI_INT, __cudampi__freeThreadsPerNode, 1, MPI_INT, MPI_COMM_WORLD);
 
-  if (!__cudampi__arguments.cpu_enabled){
-    for (int i=0; i < __cudampi__MPIproccount; i++){
-      __cudampi__freeThreadsPerNode[i] = 0;
-    }
-  }
-
-
   // check if there is a configuration file
   FILE *filep = fopen("__cudampi.conf", "r");
 
diff --git a/cudampilib/cudampislave.c b/cudampilib/cudampislave.c
index 3154402..6053615 100644
--- a/cudampilib/cudampislave.c
+++ b/cudampilib/cudampislave.c
@@ -45,6 +45,7 @@ int __cudampi__localGpuDeviceCount = 1;
 
 int __cudampi__localFreeThreadCount = 0;
 int __cudampi__batch_size;
+int __cudampi__cpu_enabled;
 
 unsigned long cpuStreamsValid[CPU_STREAMS_SUPPORTED];
 
@@ -483,17 +484,23 @@ int main(int argc, char **argv) {
     exit(-1); // we could exit in a nicer way! TBD
   }
 
-  if (cudaSuccess != __cudampi__getCpuFreeThreads(&__cudampi__localFreeThreadCount)) {
-    log_message(LOG_ERROR, "Error invoking __cudampi__getCpuFreeThreads()");
-    exit(-1);
+  MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&__cudampi__cpu_enabled, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+  if (__cudampi__cpu_enabled){
+    if (cudaSuccess != __cudampi__getCpuFreeThreads(&__cudampi__localFreeThreadCount)) {
+      log_message(LOG_ERROR, "Error invoking __cudampi__getCpuFreeThreads()");
+      exit(-1);
+    }
+  }
+  else {
+    __cudampi__localFreeThreadCount = 0;
   }
 
   MPI_Allgather(&__cudampi__localGpuDeviceCount, 1, MPI_INT, __cudampi__GPUcountspernode, 1, MPI_INT, MPI_COMM_WORLD);
 
   MPI_Allgather(&__cudampi__localFreeThreadCount, 1, MPI_INT, __cudampi__freeThreadsPerNode, 1, MPI_INT, MPI_COMM_WORLD);
 
-  MPI_Bcast(&__cudampi__batch_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
-
   MPI_Bcast(&__cudampi_totaldevicecount, 1, MPI_INT, 0, MPI_COMM_WORLD);
 
   __cudampi_targetMPIrankfordevice = (int *)malloc(__cudampi_totaldevicecount * sizeof(int));