Commit

wip
ptheywood committed Nov 27, 2023
1 parent 7847445 commit 982c248
Showing 4 changed files with 95 additions and 35 deletions.
39 changes: 36 additions & 3 deletions include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -41,11 +41,11 @@ class MPIEnsemble {
/**
* The rank within the MPI shared memory communicator (i.e. within the node)
*/
const int group_rank;
const int local_rank;
/**
* The size of the MPI shared memory communicator (i.e. within the node)
*/
const int group_size;
const int local_size;
/**
* The total number of runs to be executed (only used for printing error warnings)
*/
@@ -86,7 +86,7 @@ class MPIEnsemble {
*/
void worldBarrier();
/**
* If world_rank!=0 and group_rank == 0, send the local GPU string to world_rank==0 and return empty string
* If world_rank!=0 and local_rank == 0, send the local GPU string to world_rank==0 and return empty string
* If world_rank==0, receive GPU strings and assemble the full remote GPU string to be returned
*/
std::string assembleGPUsString();
@@ -96,6 +96,20 @@
void retrieveLocalErrorDetail(std::mutex &log_export_queue_mutex,
std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail,
std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, int i);
/**
* Create the split MPI Communicator, based on whether the rank is participating in ensemble execution or not, as determined by the local rank and the number of local GPUs.
* @param localGPUCount the number of GPU devices for the current rank
* @return success of this method
*/
bool createParticipatingCommunicator(int localGPUCount);
/**
* Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks
*/
int getParticipatingCommSize() { return this->participating_size; }
/**
* Accessor method for the rank within the MPI communicator containing "participating" (or non-participating) ranks
*/
int getParticipatingCommRank() { return this->participating_rank; }

private:
/**
@@ -126,6 +140,25 @@ class MPIEnsemble {
* MPI representation of AbstractSimRunner::ErrorDetail type
*/
const MPI_Datatype MPI_ERROR_DETAIL;
/**
* Flag indicating if the current MPI rank is a participating rank (i.e. it has at least one GPU it can use).
* This is not a const public member, as it can only be computed once the world and local ranks/sizes and the local GPU count are known.
* This is used as the colour when forming the participating communicator
*/
bool rank_is_participating;
/**
* An MPI communicator, split by whether the MPI rank is participating in simulation or not.
* Non-participating ranks will have a communicator which only contains non-participating ranks, but they will never use the communicator
*/
MPI_Comm comm_participating;
/**
* The size of the MPI communicator containing "participating" (or non-participating) ranks
*/
int participating_size;
/**
* The rank within the MPI communicator containing "participating" (or non-participating) ranks
*/
int participating_rank;
};
} // namespace detail
} // namespace flamegpu
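
The participating-communicator members added above rely on MPI_Comm_split's colour/key semantics: ranks passing the same colour end up in the same sub-communicator, ordered within it by the key. A minimal standalone sketch of that mechanism, independent of the FLAMEGPU2 classes; the even-rank participation rule used here is purely illustrative, not the rule this commit implements:

```cpp
// Illustration of the colour/key split behind comm_participating above.
// The participation rule (even world ranks "own a GPU") is assumed for this sketch only.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0, world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // The colour stands in for rank_is_participating; the key keeps world ordering within each group.
    const int colour = (world_rank % 2 == 0) ? 1 : 0;
    MPI_Comm comm_participating = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, colour, world_rank, &comm_participating);
    int participating_rank = -1, participating_size = 0;
    MPI_Comm_rank(comm_participating, &participating_rank);
    MPI_Comm_size(comm_participating, &participating_size);
    std::printf("world %d/%d -> colour %d, split %d/%d\n",
                world_rank, world_size, colour, participating_rank, participating_size);
    MPI_Comm_free(&comm_participating);
    MPI_Finalize();
    return 0;
}
```
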
1 change: 1 addition & 0 deletions include/flamegpu/simulation/detail/MPISimRunner.h
@@ -60,6 +60,7 @@ class MPISimRunner : public AbstractSimRunner {
std::shared_ptr<const StepLoggingConfig> _step_log_config,
std::shared_ptr<const LoggingConfig> _exit_log_config,
int _device_id,
int _device_count_on_rank,
unsigned int _runner_id,
flamegpu::Verbosity _verbosity,
std::map<unsigned int, RunLog> &run_logs,
52 changes: 27 additions & 25 deletions src/flamegpu/simulation/CUDAEnsemble.cu
@@ -157,32 +157,27 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
} else {
// If no devices were specified by the user, use all visible devices but load balance if MPI is in use.
#ifdef FLAMEGPU_ENABLE_MPI
// If using MPI with a single rank per node, use all devices
if (mpi->group_size == 1) {
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
} else if (mpi->group_size > 1 && mpi->group_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
if (mpi != nullptr && mpi->group_size > 1 && mpi->group_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
}
// Otherwise, if using one rank per node or when there are more mpi ranks on the current node than GPUs, use all devices.
} else {
// If using MPI with a single rank per node, use all devices
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
}
// Otherwise, when there are more mpi ranks on the current node than GPUs
} else {
// @todo - assign GPUs for the first device_count ranks on this node.
// @todo - other ranks don't get a gpu, which later on will mean they just need to say they are done?
// @todo - throw a better exception
THROW exception::InvalidCUDAdevice("@todo");
}
#else // FLAMEGPU_ENABLE_MPI
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
@@ -208,6 +203,13 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has nto been built with an appropraite compute capability for any devices, unable to continue\n");
}

// Once the number of devices per rank is known, we can create the actual communicator to be used during MPI execution
#ifdef FLAMEGPU_ENABLE_MPI
if (mpi != nullptr) {

}
#endif

const unsigned int TOTAL_RUNNERS = static_cast<unsigned int>(devices.size()) * config.concurrent_runs;

// Log Time (We can't use CUDA events here, due to device resets)
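
The device-balancing branch above gives each rank on a node a contiguous block of GPU indices, with the first `unallocated` ranks receiving one extra device. A standalone check of that arithmetic, using illustrative values rather than anything queried from CUDA or MPI:

```cpp
// 5 GPUs shared by 3 ranks on one node: ranks 0 and 1 should get two GPUs each, rank 2 one.
#include <algorithm>
#include <cstdio>

int main() {
    const int device_count = 5;  // illustrative GPU count for the node
    const int group_size = 3;    // illustrative number of MPI ranks on the node
    for (int group_rank = 0; group_rank < group_size; ++group_rank) {
        const int gpusPerRank = device_count / group_size;                  // 1
        const int unallocated = device_count - (gpusPerRank * group_size);  // 2
        const int lowerRanksWithPlus1 = group_rank < unallocated ? group_rank : unallocated;
        const int lowerRanksWithPlus0 = std::max(0, group_rank - unallocated);
        const int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerRanksWithPlus0 * gpusPerRank);
        const int last = group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
        std::printf("rank %d -> devices [%d, %d)\n", group_rank, first, last);
    }
    return 0;
}
// Prints: rank 0 -> devices [0, 2), rank 1 -> devices [2, 4), rank 2 -> devices [4, 5)
```
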
38 changes: 31 additions & 7 deletions src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -13,7 +13,11 @@ MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsi
, group_rank(queryMPISharedGroupRank())
, group_size(queryMPISharedGroupSize())
, total_runs(_total_runs)
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype()) { }
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype())
, rank_is_participating(false)
, comm_participating(MPI_COMM_NULL)
, participating_size(0)
, participating_rank(-1) { }

int MPIEnsemble::receiveErrors(std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail) {
int errCount = 0;
@@ -202,20 +206,20 @@ int MPIEnsemble::queryMPIWorldSize() {

int MPIEnsemble::queryMPISharedGroupRank() {
initMPI();
int group_rank = -1;
int local_rank = -1;
MPI_Comm group;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
MPI_Comm_rank(group, &group_rank);
return group_rank;
MPI_Comm_rank(group, &local_rank);
return local_rank;
}

int MPIEnsemble::queryMPISharedGroupSize() {
initMPI();
int group_size = -1;
int local_size = -1;
MPI_Comm group;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
MPI_Comm_size(group, &group_size);
return group_size;
MPI_Comm_size(group, &local_size);
return local_size;
}

void MPIEnsemble::initMPI() {
@@ -282,6 +286,26 @@ std::vector<detail::AbstractSimRunner::ErrorDetail> &err_detail_local, const int
}
}

bool MPIEnsemble::createParticipatingCommunicator(const int localGPUCount) {
// If the communicator has not yet been created, create it and get the rank and size.
if (this->comm_participating == MPI_COMM_NULL) {
// Determine if this rank is participating or not, i.e. the colour of the rank
this->rank_is_participating = true;  // @todo - determine this from the local rank and localGPUCount
// Split the world communicator. If the split fails, abort (which makes the return value redundant). // @todo tweak the error handling.
if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) {
fprintf(stderr, "Error creating communicator\n");
MPI_Abort(MPI_COMM_WORLD, 1);
return false;
}
// Get the size of the split participating communicator
MPI_Comm_size(this->comm_participating, &this->participating_size);
// Get the local rank within the split communicator
MPI_Comm_rank(this->comm_participating, &this->participating_rank);
}
return true;
}


} // namespace detail
} // namespace flamegpu
#endif
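
The `if (mpi != nullptr)` block added to CUDAEnsemble::simulate above is still empty in this commit. A hedged guess at the body it is presumably building towards, based on the new method and its documentation; the exact call site and argument are assumptions, not part of this change:

```cpp
// Sketch only: plausible completion of the empty MPI block in CUDAEnsemble::simulate,
// once this rank's device set is known. Not part of this commit.
#ifdef FLAMEGPU_ENABLE_MPI
    if (mpi != nullptr) {
        // Split the world communicator so that later collectives only involve
        // ranks which will actually execute simulations.
        mpi->createParticipatingCommunicator(static_cast<int>(devices.size()));
    }
#endif
```
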
