From f2ca1e996945d522acfa9cc674e62661461cc64b Mon Sep 17 00:00:00 2001
From: Peter Heywood
Date: Tue, 21 Nov 2023 17:33:08 +0000
Subject: [PATCH] WIP: MPI Ensembles can use multiple ranks per node, up to the GPU count, evenly (ish) distributing GPUs

Only one rank per node sends its device string back for telemetry; the other
ranks send back an empty string (the assembleGPUsString method still expects
a message from each rank in the world).

Currently (i.e. still WIP bits):

+ Using more ranks than GPUs per node will error with a generic exception.
  These extra runners could potentially do 0 work instead, by never requesting
  jobs / stating that they are finished immediately.
+ If users specify devices via the config / CLI, this will be applied to all
  MPI ranks per node. This may or may not be desirable.

Using a specialised communicator might be a better way to do this (build a
communicator of the MPI ranks participating in the ensemble, and use that in
place of MPI_COMM_WORLD). All ranks would need to return a message stating
whether they are participating or not (0/1), which would then be used as a
colour in a call to MPI_Comm_split to create the new communicator(s).
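A rough sketch of that split-communicator idea (illustrative only; the
participating flag, local_device_count and ensemble_comm are placeholder
names, not existing members):

    // 1 if this rank was assigned ensemble work (e.g. it owns at least one GPU), else 0 (placeholder logic)
    const int participating = local_device_count > 0 ? 1 : 0;
    // Ranks passing the same colour end up in the same new communicator;
    // world_rank is used as the key so the original rank ordering is preserved.
    MPI_Comm ensemble_comm = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, participating, world_rank, &ensemble_comm);
    // Ensemble communication would then use ensemble_comm in place of MPI_COMM_WORLD.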
---
 .../flamegpu/simulation/detail/MPIEnsemble.h  |  2 +-
 src/flamegpu/simulation/CUDAEnsemble.cu       | 31 +++++++++++++++++++
 src/flamegpu/simulation/detail/MPIEnsemble.cu | 10 +++---
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/include/flamegpu/simulation/detail/MPIEnsemble.h b/include/flamegpu/simulation/detail/MPIEnsemble.h
index 3c3a46a45..1855d7903 100644
--- a/include/flamegpu/simulation/detail/MPIEnsemble.h
+++ b/include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -86,7 +86,7 @@ class MPIEnsemble {
      */
     void worldBarrier();
     /**
-     * If world_rank!=0, send the local GPU string to world_rank==0 and return empty string
+     * If world_rank!=0 and group_rank == 0, send the local GPU string to world_rank==0 and return empty string
      * If world_rank==0, receive GPU strings and assemble the full remote GPU string to be returned
      */
     std::string assembleGPUsString();
diff --git a/src/flamegpu/simulation/CUDAEnsemble.cu b/src/flamegpu/simulation/CUDAEnsemble.cu
index dcbb48ab4..ecece080c 100644
--- a/src/flamegpu/simulation/CUDAEnsemble.cu
+++ b/src/flamegpu/simulation/CUDAEnsemble.cu
@@ -153,10 +153,41 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
     std::set<int> devices;
     if (config.devices.size()) {
         devices = config.devices;
+        // @todo - handle users providing devices to use when MPI is also enabled. Not sure how we want to do this.
     } else {
+        // If no devices were specified by the user, use all visible devices, but load balance if MPI is in use.
+#ifdef FLAMEGPU_ENABLE_MPI
+        // If using MPI with a single rank per node, use all devices
+        if (mpi->group_size == 1) {
+            for (int i = 0; i < device_count; ++i) {
+                devices.emplace(i);
+            }
+        // If using more than one rank per node, but <= the number of devices, balance as evenly as possible
+        } else if (mpi->group_size > 1 && mpi->group_size <= device_count) {
+            // Find the balanced number of GPUs per rank, and how many ranks need one extra
+            int gpusPerRank = device_count / mpi->group_size;
+            int unallocated = device_count - (gpusPerRank * mpi->group_size);
+            // Compute the indices of the first and last GPU assigned to the current rank, based on how many lower ranks have one extra
+            int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
+            int lowerRanksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
+            int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerRanksWithPlus0 * gpusPerRank);
+            int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
+            // Assign the devices for this rank
+            for (int i = first; i < last; i++) {
+                devices.emplace(i);
+            }
+        // Otherwise, there are more MPI ranks on the current node than GPUs
+        } else {
+            // @todo - assign GPUs to the first device_count ranks on this node.
+            // @todo - other ranks don't get a GPU, which later on will mean they just need to say they are done?
+            // @todo - throw a better exception
+            THROW exception::InvalidCUDAdevice("@todo");
+        }
+#else  // FLAMEGPU_ENABLE_MPI
         for (int i = 0; i < device_count; ++i) {
             devices.emplace(i);
         }
+#endif  // FLAMEGPU_ENABLE_MPI
     }
     // Check that each device is capable, and init cuda context
     for (auto d = devices.begin(); d != devices.end(); ++d) {
diff --git a/src/flamegpu/simulation/detail/MPIEnsemble.cu b/src/flamegpu/simulation/detail/MPIEnsemble.cu
index 6fb26f391..7adb7d87b 100644
--- a/src/flamegpu/simulation/detail/MPIEnsemble.cu
+++ b/src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -138,7 +138,7 @@ void MPIEnsemble::worldBarrier() {
 }
 std::string MPIEnsemble::assembleGPUsString() {
     std::string remote_device_names;
-    // All ranks should notify rank 0 of their GPU devices
+    // One rank per node should notify rank 0 of its GPU devices. Other ranks will send an empty message.
     if (world_rank == 0) {
         int bufflen = 256;  // Length of name string in cudaDeviceProp
         char *buff = static_cast<char*>(malloc(bufflen));
@@ -166,12 +166,14 @@ std::string MPIEnsemble::assembleGPUsString() {
                 EnvelopeTag::TelemetryDevices,  // int tag
                 MPI_COMM_WORLD,                 // MPI_Comm communicator
                 &status);                       // MPI_Status*
-            remote_device_names.append(", ");
-            remote_device_names.append(buff);
+            if (strlen(buff) > 0) {
+                remote_device_names.append(", ");
+                remote_device_names.append(buff);
+            }
         }
         free(buff);
     } else {
-        const std::string d_string = compute_capability::getDeviceNames(config.devices);
+        const std::string d_string = group_rank == 0 ? compute_capability::getDeviceNames(config.devices) : "";
         // Send GPU count
         MPI_Send(
             d_string.c_str(),       // void* data