Commit

wip
ptheywood committed Nov 27, 2023
1 parent 982c248 commit c3b23ea
Showing 4 changed files with 56 additions and 25 deletions.
8 changes: 6 additions & 2 deletions include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -98,10 +98,14 @@ class MPIEnsemble {
std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, int i);
/**
* Create the split MPI Communicator based on whether the rank is participating in ensemble execution or not, as determined by the group rank and number of local GPUs.
* @param localGPUCount the number of GPU devices for the current rank
* @param isParticipating If this rank is participating (i.e. it has a local device assigned)
* @return success of this method
*/
bool createParticipatingCommunicator(int localGPUCount);
bool createParticipatingCommunicator(bool isParticipating);
/**
* Accessor method for whether the rank is participating or not (i.e. the colour of the communicator split)
*/
int getRankIsParticipating() { return this->rank_is_participating; }
/**
* Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks
*/
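For orientation only (this snippet is not part of the commit): a minimal, hypothetical call-site sketch showing how the two members above are intended to fit together. The mpi pointer and devices set are assumptions standing in for the CUDAEnsemble internals, so this fragment is illustrative rather than compilable on its own.

// Hypothetical call site, not FLAMEGPU source: assume `mpi` points at a
// constructed detail::MPIEnsemble and `devices` holds the CUDA devices
// assigned to this rank.
const bool isParticipating = !devices.empty();  // participate only with >= 1 local GPU
mpi->createParticipatingCommunicator(isParticipating);
if (mpi->getRankIsParticipating()) {
    // this rank joined the "participating" communicator and will run simulations
} else {
    // this rank sits out and must not block on participating-only collectives
}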
1 change: 0 additions & 1 deletion include/flamegpu/simulation/detail/MPISimRunner.h
@@ -60,7 +60,6 @@ class MPISimRunner : public AbstractSimRunner {
std::shared_ptr<const StepLoggingConfig> _step_log_config,
std::shared_ptr<const LoggingConfig> _exit_log_config,
int _device_id,
int _device_count_on_rank,
unsigned int _runner_id,
flamegpu::Verbosity _verbosity,
std::map<unsigned int, RunLog> &run_logs,
58 changes: 43 additions & 15 deletions src/flamegpu/simulation/CUDAEnsemble.cu
@@ -157,25 +157,29 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
} else {
// If no devices were specified by the user, use all visible devices but load balance if MPI is in use.
#ifdef FLAMEGPU_ENABLE_MPI
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
if (mpi != nullptr && mpi->group_size > 1 && mpi->group_size <= device_count) {
// If not using MPI, or a single rank is in use on this node, use all gpus.
if (mpi == nullptr || mpi->local_size == 1) {
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
// If using MPI, with more than one rank on this node, but fewer ranks than gpus, attempt to load balance
} else if (mpi->local_size > 1 && mpi->local_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
int gpusPerRank = device_count / mpi->local_size;
int unallocated = device_count - (gpusPerRank * mpi->local_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int lowerRanksWithPlus1 = mpi->local_rank < unallocated ? mpi->local_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->local_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
int last = mpi->local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
}
// Otherwise, if using one rank per node or when there are more mpi ranks on the current node than GPUs, use all devices.
// Otherwise, if more ranks are on this node than gpus, use 1 gpu per rank until there are no gpus left.
} else {
// If using MPI with a single rank per node, use all devices
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
if (mpi->local_rank < device_count) {
devices.emplace(mpi->local_rank);
}
}
#else // FLAMEGPU_ENABLE_MPI
@@ -185,6 +189,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
#endif // FLAMEGPU_ENABLE_MPI
}
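To make the balanced-assignment arithmetic above easier to check, here is a standalone sketch (not part of the diff) that reproduces the same index calculation for one hypothetical node; the device_count and local_size values are assumptions chosen for illustration.

// Standalone illustration of the gpusPerRank / unallocated / first / last
// arithmetic used above, for an assumed node with 4 GPUs and 3 local ranks.
#include <algorithm>
#include <cstdio>

int main() {
    const int device_count = 4;  // assumed GPUs visible on this node
    const int local_size = 3;    // assumed MPI ranks on this node
    const int gpusPerRank = device_count / local_size;
    const int unallocated = device_count - (gpusPerRank * local_size);
    for (int local_rank = 0; local_rank < local_size; ++local_rank) {
        const int lowerRanksWithPlus1 = std::min(local_rank, unallocated);
        const int lowerRanksWithPlus0 = std::max(0, local_rank - unallocated);
        const int first = lowerRanksWithPlus1 * (gpusPerRank + 1) + lowerRanksWithPlus0 * gpusPerRank;
        const int last = local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
        std::printf("local rank %d -> devices [%d, %d)\n", local_rank, first, last);
    }
    return 0;
}

With these assumed values the three ranks claim devices {0,1}, {2} and {3}, so every GPU is used exactly once and the remainder goes to the lowest ranks.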
// Check that each device is capable, and init cuda context
size_t devices_size_before = devices.size();

Check failure on line 192 in src/flamegpu/simulation/CUDAEnsemble.cu

GitHub Actions / build (CUDA 11.0.3, 11.8.0 and 12.3.0; windows-2019/2022, multiple configurations): 'devices_size_before': local variable is initialized but not referenced [D:\a\FLAMEGPU2\FLAMEGPU2\build\FLAMEGPU\flamegpu.vcxproj]; the warning is treated as an error.
for (auto d = devices.begin(); d != devices.end(); ++d) {
if (!detail::compute_capability::checkComputeCapability(*d)) {
fprintf(stderr, "FLAMEGPU2 has not been built with an appropriate compute capability for device %d, this device will not be used.\n", *d);
@@ -198,15 +203,38 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
// Return to device 0 (or check original device first?)
gpuErrchk(cudaSetDevice(0));

// If there are no devices left, we need to error as the work cannot be executed.
// If there are no devices left (and we are not using mpi), we need to error as the work cannot be executed.
#ifdef FLAMEGPU_ENABLE_MPI
if (devices.size() == 0 && mpi == nullptr) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has not been built with an appropraite compute capability for any devices, unable to continue\n");
}
#else // FLAMEGPU_ENABLE_MPI
if (devices.size() == 0) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has nto been built with an appropraite compute capability for any devices, unable to continue\n");
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has not been built with an appropraite compute capability for any devices, unable to continue\n");
}
#endif // FLAMEGPU_ENABLE_MPI

// Once the number of devices per rank is known, we can create the actual communicator to be used during MPI
#ifdef FLAMEGPU_ENABLE_MPI
if (mpi != nullptr) {

// This rank is participating if it has at least one device assigned to it.

Check failure on line 220 in src/flamegpu/simulation/CUDAEnsemble.cu

GitHub Actions / cpplint (11.8, ubuntu-20.04): Line ends in whitespace. Consider deleting these extra spaces.
mpi->createParticipatingCommunicator(devices.size() > 0);
// @todo - if world rank 0 is not participating, i.e. no gpu, then this is all going to break, so instead participating rank 0 will need to be the "main" mpi rank?

// @debug - print the world, local and participating ranks/sizes.
printf("World %d/%d, Local %d/%d, gpus %zu, participating %d %d/%d\n", mpi->world_rank, mpi->world_size, mpi->local_rank, mpi->local_size, devices.size(), mpi->getRankIsParticipating(), mpi->getParticipatingCommRank(), mpi->getParticipatingCommSize());

// @todo - if the participating communicator size is 0, mpi needs to error out, there were no usable gpus.
// @todo - use the new communicator; at the moment, this will hang if anyone is not participating.

if (mpi->world_rank == 0 && mpi->getRankIsParticipating() == 0) {
// @todo - error if world rank 0 doesn't have a gpu. kill everything.

// If the world size is not the participating size, issue a warning.
if (mpi->world_size != mpi->getParticipatingCommSize() && config.verbosity >= Verbosity::Default) {
fprintf(stderr, "Warning: MPI Ensemble launched with %d MPI ranks, but only %d ranks have GPUs assigned. Additional %d ranks will not execute any simulations.\n", mpi->world_size, mpi->getParticipatingCommSize(), mpi->getParticipatingCommSize() - mpi->world_size);
}
}
}
#endif
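For reference, a standalone sketch (not FLAMEGPU code) of the participation split performed in the block above: each rank derives a colour from whether it has a device, MPI_COMM_WORLD is split on that colour, and rank 0 warns when some ranks are left without GPUs. The two-rank cutoff used to fake device availability is purely an assumption for illustration.

// Standalone MPI sketch: split MPI_COMM_WORLD by a "participating" colour.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0, world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // Assumption: pretend only the first two ranks have a usable GPU.
    const int is_participating = world_rank < 2 ? 1 : 0;
    // Colour = participation flag; key = world rank preserves the original ordering.
    MPI_Comm comm_participating = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, is_participating, world_rank, &comm_participating);
    int participating_rank = -1, participating_size = 0;
    MPI_Comm_rank(comm_participating, &participating_rank);
    MPI_Comm_size(comm_participating, &participating_size);
    if (world_rank == 0 && world_size != participating_size) {
        std::fprintf(stderr, "Warning: %d of %d ranks have no GPU assigned and will not execute simulations.\n",
                     world_size - participating_size, world_size);
    }
    MPI_Comm_free(&comm_participating);
    MPI_Finalize();
    return 0;
}

Note that ranks with colour 0 also receive a (non-participating) communicator from the split, which is why the accessor documentation above talks about "participating (or non-participating)" ranks.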

@@ -269,7 +297,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
int flag;
int mpi_runners_fin = 1; // Start at 1 because we have always already finished
// Wait for all runs to have been assigned, and all MPI runners to have been notified of fin
while (next_run < plans.size() || mpi_runners_fin < mpi->world_size) {
while (next_run < plans.size() || mpi_runners_fin < mpi->getParticipatingCommSize()) {
// Check for errors
const int t_err_count = mpi->receiveErrors(err_detail);
err_count += t_err_count;
14 changes: 7 additions & 7 deletions src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -10,11 +10,11 @@ MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsi
: config(_config)
, world_rank(queryMPIWorldRank())
, world_size(queryMPIWorldSize())
, group_rank(queryMPISharedGroupRank())
, group_size(queryMPISharedGroupSize())
, local_rank(queryMPISharedGroupRank())
, local_size(queryMPISharedGroupSize())
, total_runs(_total_runs)
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype())
, rank_is_participating(false);
, rank_is_participating(false)
, comm_participating(MPI_COMM_NULL)
, participating_size(0)
, participating_rank(-1) { }
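The local_rank and local_size members above come from queryMPISharedGroupRank / queryMPISharedGroupSize. The usual MPI idiom for deriving a per-node rank and size is a shared-memory split of the world communicator, sketched below as a standalone program; whether the query helpers are implemented exactly this way is an assumption.

// Standalone MPI sketch: derive a per-node ("local") rank and size.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    // Group ranks that share a node (shared-memory domain) into one communicator each.
    MPI_Comm node_comm = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, &node_comm);
    int local_rank = 0, local_size = 0;
    MPI_Comm_rank(node_comm, &local_rank);
    MPI_Comm_size(node_comm, &local_size);
    std::printf("world rank %d is local rank %d of %d on this node\n", world_rank, local_rank, local_size);
    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}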
@@ -177,7 +177,7 @@ std::string MPIEnsemble::assembleGPUsString() {
}
free(buff);
} else {
const std::string d_string = group_rank == 0 ? compute_capability::getDeviceNames(config.devices) : "";
const std::string d_string = local_rank == 0 ? compute_capability::getDeviceNames(config.devices) : "";
// Send GPU count
MPI_Send(
d_string.c_str(), // void* data
@@ -286,11 +286,11 @@ std::vector<detail::AbstractSimRunner::ErrorDetail> &err_detail_local, const int
}
}

bool MPIEnsemble::createParticipatingCommunicator(const int localGPUCount) {
bool MPIEnsemble::createParticipatingCommunicator(const bool isParticipating) {
// If the communicator has not yet been created, create it and get the rank and size.
if (this->comm_participating != MPI_COMM_NULL) {
if (this->comm_participating == MPI_COMM_NULL) {
// determine if this rank is participating or not, i.e. the colour of the rank
this->rank_is_participating = true; // 'todo
this->rank_is_participating = isParticipating;
// Split the world communicator; if the split fails, abort (which makes the boolean return value somewhat redundant). // @todo tweak the error handling.
if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) {
fprintf(stderr, "Error creating communicator\n");
