diff --git a/include/flamegpu/simulation/detail/MPIEnsemble.h b/include/flamegpu/simulation/detail/MPIEnsemble.h
index 1855d7903..75370c936 100644
--- a/include/flamegpu/simulation/detail/MPIEnsemble.h
+++ b/include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -41,11 +41,11 @@ class MPIEnsemble {
     /**
      * The rank within the MPI shared memory communicator (i.e. the within the node)
      */
-    const int group_rank;
+    const int local_rank;
     /**
      * The size of the MPI shared memory communicator (i.e. the within the node)
      */
-    const int group_size;
+    const int local_size;
     /**
      * The total number of runs to be executed (only used for printing error warnings)
      */
@@ -86,7 +86,7 @@ class MPIEnsemble {
      */
     void worldBarrier();
     /**
-     * If world_rank!=0 and group_rank == 0, send the local GPU string to world_rank==0 and return empty string
+     * If world_rank!=0 and local_rank == 0, send the local GPU string to world_rank==0 and return empty string
      * If world_rank==0, receive GPU strings and assemble the full remote GPU string to be returned
      */
     std::string assembleGPUsString();
@@ -96,6 +96,20 @@ class MPIEnsemble {
      */
     void retrieveLocalErrorDetail(std::mutex &log_export_queue_mutex, std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail, std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, int i);
+    /**
+     * Create the split MPI communicator, based on whether the rank is participating in ensemble execution or not, as determined from the local rank and the number of local GPUs.
+     * @param localGPUCount the number of GPU devices for the current rank
+     * @return success of this method
+     */
+    bool createParticipatingCommunicator(int localGPUCount);
+    /**
+     * Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks
+     */
+    int getParticipatingCommSize() { return this->participating_size; }
+    /**
+     * Accessor method for the rank within the MPI communicator containing "participating" (or non-participating) ranks
+     */
+    int getParticipatingCommRank() { return this->participating_rank; }
 
  private:
     /**
@@ -126,6 +140,25 @@ class MPIEnsemble {
      * MPI representation of AbstractSimRunner::ErrorDetail type
      */
     const MPI_Datatype MPI_ERROR_DETAIL;
+    /**
+     * Flag indicating if the current MPI rank is a participating rank (i.e. it has at least one GPU it can use).
+     * This is not a const public member, as it can only be computed after the world and local ranks/sizes and the local GPU count are known.
+     * This is used to form the colour in the participating communicator split.
+     */
+    bool rank_is_participating;
+    /**
+     * An MPI communicator, split by whether the MPI rank is participating in simulation or not.
+     * Non-participating ranks will have a communicator which only contains other non-participating ranks, but they will never use it.
+     */
+    MPI_Comm comm_participating;
+    /**
+     * The size of the MPI communicator containing "participating" (or non-participating) ranks
+     */
+    int participating_size;
+    /**
+     * The rank within the MPI communicator containing "participating" (or non-participating) ranks
+     */
+    int participating_rank;
 };
 }  // namespace detail
 }  // namespace flamegpu
diff --git a/include/flamegpu/simulation/detail/MPISimRunner.h b/include/flamegpu/simulation/detail/MPISimRunner.h
index 753b6805e..faa4ae9a9 100644
--- a/include/flamegpu/simulation/detail/MPISimRunner.h
+++ b/include/flamegpu/simulation/detail/MPISimRunner.h
@@ -60,6 +60,7 @@ class MPISimRunner : public AbstractSimRunner {
         std::shared_ptr<const StepLoggingConfig> _step_log_config,
         std::shared_ptr<const LoggingConfig> _exit_log_config,
         int _device_id,
+        int _device_count_on_rank,
         unsigned int _runner_id,
         flamegpu::Verbosity _verbosity,
         std::map<unsigned int, RunLog> &run_logs,
diff --git a/src/flamegpu/simulation/CUDAEnsemble.cu b/src/flamegpu/simulation/CUDAEnsemble.cu
index d2b07f9cf..afdae7d54 100644
--- a/src/flamegpu/simulation/CUDAEnsemble.cu
+++ b/src/flamegpu/simulation/CUDAEnsemble.cu
@@ -157,32 +157,27 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
     } else {
         // If no devices were specified by the user, use all visible devices but load balance if MPI is in use.
 #ifdef FLAMEGPU_ENABLE_MPI
-        // If using MPI with a single rank per node, use all devices
-        if (mpi->group_size == 1) {
-            for (int i = 0; i < device_count; ++i) {
-                devices.emplace(i);
-            }
-        // If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
-        } else if (mpi->group_size > 1 && mpi->group_size <= device_count) {
-            // find the balanced number of gpus per rank, and how many will need +1
-            int gpusPerRank = device_count / mpi->group_size;
-            int unallocated = device_count - (gpusPerRank * mpi->group_size);
-            // Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
-            int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
-            int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
-            int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
-            int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
-            // Assign the devices for this rank
-            for (int i = first; i < last; i++) {
-                devices.emplace(i);
+        // If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
+        if (mpi != nullptr && mpi->local_size > 1 && mpi->local_size <= device_count) {
+            // find the balanced number of gpus per rank, and how many will need +1
+            int gpusPerRank = device_count / mpi->local_size;
+            int unallocated = device_count - (gpusPerRank * mpi->local_size);
+            // Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
+            int lowerRanksWithPlus1 = mpi->local_rank < unallocated ? mpi->local_rank : unallocated;
+            int lowerranksWithPlus0 = std::max(0, mpi->local_rank - unallocated);
+            int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
+            int last = mpi->local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
+            // Assign the devices for this rank
+            for (int i = first; i < last; i++) {
+                devices.emplace(i);
+            }
+        // Otherwise, if using one rank per node or when there are more MPI ranks on the current node than GPUs, use all devices.
+        } else {
+            // If using MPI with a single rank per node, use all devices
+            for (int i = 0; i < device_count; ++i) {
+                devices.emplace(i);
+            }
         }
-        // Otherwise, when there are more mpi ranks on the current node than GPUs
-        } else {
-            // @todo - assign GPUs for the first device_count ranks on this node.
-            // @todo - other ranks don't get a gpu, which later on will mean they just need to say they are done?
-            // @todo - throw a better exception
-            THROW exception::InvalidCUDAdevice("@todo");
-        }
 #else  // FLAMEGPU_ENABLE_MPI
         for (int i = 0; i < device_count; ++i) {
             devices.emplace(i);
@@ -208,6 +203,13 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
         THROW exception::InvalidCUDAdevice("FLAMEGPU2 has nto been built with an appropraite compute capability for any devices, unable to continue\n");
     }
 
+    // Once the number of devices per rank is known, we can create the actual communicator to be used during MPI execution
+#ifdef FLAMEGPU_ENABLE_MPI
+    if (mpi != nullptr) {
+
+    }
+#endif
+
     const unsigned int TOTAL_RUNNERS = static_cast<unsigned int>(devices.size()) * config.concurrent_runs;
 
     // Log Time (We can't use CUDA events here, due to device resets)
diff --git a/src/flamegpu/simulation/detail/MPIEnsemble.cu b/src/flamegpu/simulation/detail/MPIEnsemble.cu
index 394ecedcc..72c259789 100644
--- a/src/flamegpu/simulation/detail/MPIEnsemble.cu
+++ b/src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -13,7 +13,11 @@ MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsi
     , group_rank(queryMPISharedGroupRank())
     , group_size(queryMPISharedGroupSize())
     , total_runs(_total_runs)
-    , MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype()) { }
+    , MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype())
+    , rank_is_participating(false)
+    , comm_participating(MPI_COMM_NULL)
+    , participating_size(0)
+    , participating_rank(-1) { }
 
 int MPIEnsemble::receiveErrors(std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail) {
     int errCount = 0;
@@ -202,20 +206,20 @@ int MPIEnsemble::queryMPIWorldSize() {
 
 int MPIEnsemble::queryMPISharedGroupRank() {
     initMPI();
-    int group_rank = -1;
+    int local_rank = -1;
     MPI_Comm group;
     MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
-    MPI_Comm_rank(group, &group_rank);
-    return group_rank;
+    MPI_Comm_rank(group, &local_rank);
+    return local_rank;
 }
 
 int MPIEnsemble::queryMPISharedGroupSize() {
     initMPI();
-    int group_size = -1;
+    int local_size = -1;
     MPI_Comm group;
     MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
-    MPI_Comm_size(group, &group_size);
-    return group_size;
+    MPI_Comm_size(group, &local_size);
+    return local_size;
 }
 
 void MPIEnsemble::initMPI() {
@@ -282,6 +286,26 @@ std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, const int
     }
 }
 
+bool MPIEnsemble::createParticipatingCommunicator(const int localGPUCount) {
+    // If the communicator has not yet been created, create it and get the rank and size.
+    if (this->comm_participating == MPI_COMM_NULL) {
+        // Determine if this rank is participating or not, i.e. the colour of the rank
+        this->rank_is_participating = true;  // @todo determine from the local rank and localGPUCount
+        // Split the world communicator; if the split fails, abort (which makes the bool return value somewhat redundant). @todo tweak the error handling.
+        if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) {
+            fprintf(stderr, "Error creating communicator\n");
+            MPI_Abort(MPI_COMM_WORLD, 1);
+            return false;
+        }
+        // Get the size of the split participating communicator
+        MPI_Comm_size(this->comm_participating, &this->participating_size);
+        // Get the local rank within the split communicator
+        MPI_Comm_rank(this->comm_participating, &this->participating_rank);
+    }
+    return true;
+}
+
+
 }  // namespace detail
 }  // namespace flamegpu
 #endif
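
For reference, the device-assignment arithmetic in the CUDAEnsemble.cu hunk above (gpusPerRank, unallocated, first, last) is a standard block distribution: the first device_count % local_size ranks on a node receive one extra GPU. The standalone sketch below restates that arithmetic outside of the patch; the names deviceRangeForRank, localRank, localSize and deviceCount are illustrative only and do not exist in FLAMEGPU.

#include <algorithm>
#include <cstdio>
#include <utility>

// Illustrative re-statement of the balancing arithmetic from the CUDAEnsemble.cu hunk:
// ranks with index < (deviceCount % localSize) receive one extra GPU.
// Returns the half-open range [first, last) of device indices for localRank.
std::pair<int, int> deviceRangeForRank(int localRank, int localSize, int deviceCount) {
    const int gpusPerRank = deviceCount / localSize;
    const int unallocated = deviceCount - (gpusPerRank * localSize);
    const int lowerRanksWithPlus1 = std::min(localRank, unallocated);
    const int lowerRanksWithPlus0 = std::max(0, localRank - unallocated);
    const int first = lowerRanksWithPlus1 * (gpusPerRank + 1) + lowerRanksWithPlus0 * gpusPerRank;
    const int last = localRank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
    return {first, last};
}

int main() {
    // Example: 3 ranks sharing a node with 8 GPUs -> ranges [0, 3), [3, 6), [6, 8).
    const int localSize = 3, deviceCount = 8;
    for (int rank = 0; rank < localSize; ++rank) {
        const auto [first, last] = deviceRangeForRank(rank, localSize, deviceCount);
        std::printf("rank %d -> devices [%d, %d)\n", rank, first, last);
    }
    return 0;
}

Running this prints the ranges [0, 3), [3, 6) and [6, 8) for three ranks sharing eight GPUs, which matches what the patched branch would assign when local_size <= device_count.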
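Similarly, createParticipatingCommunicator in MPIEnsemble.cu is built around MPI_Comm_split, using the participation flag as the colour and the world rank as the key. The self-contained sketch below illustrates that pattern; it is not FLAMEGPU code, the participation rule (local_rank < local_gpu_count) is an assumption for demonstration only (the patch still leaves that decision as an @todo), and local_gpu_count is a hard-coded placeholder.

#include <mpi.h>
#include <cstdio>

// Standalone sketch (not part of the patch) of the MPI_Comm_split pattern used by
// createParticipatingCommunicator. The participation rule below is an assumption for
// illustration only; in the patch it is still an @todo.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0, world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Rank within the node, via the shared-memory split (as in queryMPISharedGroupRank/Size).
    MPI_Comm node_comm = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
    int local_rank = -1;
    MPI_Comm_rank(node_comm, &local_rank);

    // Hypothetical participation rule: only ranks that would receive a GPU participate.
    const int local_gpu_count = 2;  // placeholder for the per-node device count
    const int is_participating = local_rank < local_gpu_count ? 1 : 0;

    // Colour = participation flag, key = world rank, so participating ranks keep their relative order.
    MPI_Comm participating_comm = MPI_COMM_NULL;
    if (MPI_Comm_split(MPI_COMM_WORLD, is_participating, world_rank, &participating_comm) != MPI_SUCCESS) {
        std::fprintf(stderr, "Error creating communicator\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    int participating_rank = -1, participating_size = 0;
    MPI_Comm_rank(participating_comm, &participating_rank);
    MPI_Comm_size(participating_comm, &participating_size);
    std::printf("world %d/%d -> %s communicator %d/%d\n", world_rank, world_size,
                is_participating ? "participating" : "non-participating",
                participating_rank, participating_size);

    MPI_Comm_free(&participating_comm);
    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}

Using the world rank as the split key keeps participating ranks in their original relative order, which appears to be the intent when the patch passes this->world_rank to MPI_Comm_split.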