Commit

wip
ptheywood committed Nov 27, 2023
1 parent 7847445 commit 982c248
Showing 4 changed files with 95 additions and 35 deletions.
39 changes: 36 additions & 3 deletions include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -41,11 +41,11 @@ class MPIEnsemble {
/**
* The rank within the MPI shared memory communicator (i.e. within the node)
*/
const int group_rank;
const int local_rank;
/**
* The size of the MPI shared memory communicator (i.e. within the node)
*/
const int group_size;
const int local_size;
/**
* The total number of runs to be executed (only used for printing error warnings)
*/
@@ -86,7 +86,7 @@ class MPIEnsemble {
*/
void worldBarrier();
/**
* If world_rank!=0 and group_rank == 0, send the local GPU string to world_rank==0 and return empty string
* If world_rank!=0 and local_rank == 0, send the local GPU string to world_rank==0 and return empty string
* If world_rank==0, receive GPU strings and assemble the full remote GPU string to be returned
*/
std::string assembleGPUsString();
@@ -96,6 +96,20 @@
void retrieveLocalErrorDetail(std::mutex &log_export_queue_mutex,
std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail,
std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, int i);
/**
* Create the split MPI Communicator, based on whether the rank is participating in ensemble execution or not, as determined by the local rank and the number of local GPUs.
* @param localGPUCount the number of GPU devices for the current rank
* @return success of this method
*/
bool createParticipatingCommunicator(int localGPUCount);
/**
* Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks
*/
int getParticipatingCommSize() { return this->participating_size; }
/**
* Accessor method for the rank within the MPI communicator containing "participating" (or non-participating) ranks
*/
int getParticipatingCommRank() { return this->participating_rank; }

private:
/**
@@ -126,6 +140,25 @@ class MPIEnsemble {
* MPI representation of AbstractSimRunner::ErrorDetail type
*/
const MPI_Datatype MPI_ERROR_DETAIL;
/**
* Flag indicating if the current MPI rank is a participating rank (i.e. it has at least one GPU it can use).
* This is not a const public member, as it can only be computed once the world and local ranks/sizes and the local GPU count are known.
* This is used as the colour when forming the participating communicator
*/
bool rank_is_participating;
/**
* An MPI communicator, split by whether the MPI rank is participating in simulation or not.
* Non-participating ranks will have a communicator which only contains non-participating ranks, but they will never use the communicator
*/
MPI_Comm comm_participating;
/**
* The size of the MPI communicator containing "participating" (or non-participating) ranks
*/
int participating_size;
/**
* The rank within the MPI communicator containing "participating" (or non-participating) ranks
*/
int participating_rank;
};
} // namespace detail
} // namespace flamegpu
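
The participating-communicator members added above rely on MPI_Comm_split's colour/key semantics: ranks passing the same colour end up in the same sub-communicator, ordered within it by the key. A minimal standalone sketch of that mechanism, independent of the FLAMEGPU2 classes; the even-rank participation rule used here is purely illustrative, not the rule this commit implements:

```cpp
// Illustration of the colour/key split behind comm_participating above.
// The participation rule (even world ranks "own a GPU") is assumed for this sketch only.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0, world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // The colour stands in for rank_is_participating; the key keeps world ordering within each group.
    const int colour = (world_rank % 2 == 0) ? 1 : 0;
    MPI_Comm comm_participating = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, colour, world_rank, &comm_participating);
    int participating_rank = -1, participating_size = 0;
    MPI_Comm_rank(comm_participating, &participating_rank);
    MPI_Comm_size(comm_participating, &participating_size);
    std::printf("world %d/%d -> colour %d, split %d/%d\n",
                world_rank, world_size, colour, participating_rank, participating_size);
    MPI_Comm_free(&comm_participating);
    MPI_Finalize();
    return 0;
}
```
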
1 change: 1 addition & 0 deletions include/flamegpu/simulation/detail/MPISimRunner.h
@@ -60,6 +60,7 @@ class MPISimRunner : public AbstractSimRunner {
std::shared_ptr<const StepLoggingConfig> _step_log_config,
std::shared_ptr<const LoggingConfig> _exit_log_config,
int _device_id,
int _device_count_on_rank,
unsigned int _runner_id,
flamegpu::Verbosity _verbosity,
std::map<unsigned int, RunLog> &run_logs,
52 changes: 27 additions & 25 deletions src/flamegpu/simulation/CUDAEnsemble.cu
@@ -157,32 +157,27 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
} else {
// If no devices were specified by the user, use all visible devices but load balance if MPI is in use.
#ifdef FLAMEGPU_ENABLE_MPI
// If using MPI with a single rank per node, use all devices
if (mpi->group_size == 1) {
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
} else if (mpi->group_size > 1 && mpi->group_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
if (mpi != nullptr && mpi->group_size > 1 && mpi->group_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
}
// Otherwise, if using one rank per node or when there are more mpi ranks on the current node than GPUs, use all devices.
} else {
// If using MPI with a single rank per node, use all devices
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
}
// Otherwise, when there are more mpi ranks on the current node than GPUs
} else {
// @todo - assign GPUs for the first device_count ranks on this node.
// @todo - other ranks don't get a gpu, which later on will mean they just need to say they are done?
// @todo - throw a better exception
THROW exception::InvalidCUDAdevice("@todo");
}
#else // FLAMEGPU_ENABLE_MPI
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
@@ -208,6 +203,13 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has nto been built with an appropraite compute capability for any devices, unable to continue\n");
}

// Once the number of devices per rank is known, we can create the actual communicator to be used during MPI execution
#ifdef FLAMEGPU_ENABLE_MPI
if (mpi != nullptr) {

}
#endif

const unsigned int TOTAL_RUNNERS = static_cast<unsigned int>(devices.size()) * config.concurrent_runs;

// Log Time (We can't use CUDA events here, due to device resets)
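
The device-balancing branch above gives each rank on a node a contiguous block of GPU indices, with the first `unallocated` ranks receiving one extra device. A standalone check of that arithmetic, using illustrative values rather than anything queried from CUDA or MPI:

```cpp
// 5 GPUs shared by 3 ranks on one node: ranks 0 and 1 should get two GPUs each, rank 2 one.
#include <algorithm>
#include <cstdio>

int main() {
    const int device_count = 5;  // illustrative GPU count for the node
    const int group_size = 3;    // illustrative number of MPI ranks on the node
    for (int group_rank = 0; group_rank < group_size; ++group_rank) {
        const int gpusPerRank = device_count / group_size;                  // 1
        const int unallocated = device_count - (gpusPerRank * group_size);  // 2
        const int lowerRanksWithPlus1 = group_rank < unallocated ? group_rank : unallocated;
        const int lowerRanksWithPlus0 = std::max(0, group_rank - unallocated);
        const int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerRanksWithPlus0 * gpusPerRank);
        const int last = group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
        std::printf("rank %d -> devices [%d, %d)\n", group_rank, first, last);
    }
    return 0;
}
// Prints: rank 0 -> devices [0, 2), rank 1 -> devices [2, 4), rank 2 -> devices [4, 5)
```
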
38 changes: 31 additions & 7 deletions src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -13,7 +13,11 @@ MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsi
, group_rank(queryMPISharedGroupRank())
, group_size(queryMPISharedGroupSize())
, total_runs(_total_runs)
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype()) { }
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype())
, rank_is_participating(false)
, comm_participating(MPI_COMM_NULL)
, participating_size(0)
, participating_rank(-1) { }

int MPIEnsemble::receiveErrors(std::multimap<int, AbstractSimRunner::ErrorDetail> &err_detail) {
int errCount = 0;
@@ -202,20 +206,20 @@ int MPIEnsemble::queryMPIWorldSize() {

int MPIEnsemble::queryMPISharedGroupRank() {
initMPI();
int group_rank = -1;
int local_rank = -1;
MPI_Comm group;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
MPI_Comm_rank(group, &group_rank);
return group_rank;
MPI_Comm_rank(group, &local_rank);
return local_rank;
}

int MPIEnsemble::queryMPISharedGroupSize() {
initMPI();
int group_size = -1;
int local_size = -1;
MPI_Comm group;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &group);
MPI_Comm_size(group, &group_size);
return group_size;
MPI_Comm_size(group, &local_size);
return local_size;
}

void MPIEnsemble::initMPI() {
@@ -282,6 +286,26 @@ std::vector<detail::AbstractSimRunner::ErrorDetail> &err_detail_local, const int
}
}

bool MPIEnsemble::createParticipatingCommunicator(const int localGPUCount) {
// If the communicator has not yet been created, create it and get the rank and size.
if (this->comm_participating == MPI_COMM_NULL) {
// Determine if this rank is participating or not, i.e. the colour of the rank
this->rank_is_participating = true;  // @todo - determine this from the local rank and localGPUCount
// Split the world communicator. If the split fails, abort (which makes the return value redundant). // @todo tweak the error handling.
if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) {
fprintf(stderr, "Error creating communicator\n");
MPI_Abort(MPI_COMM_WORLD, 1);
return false;
}
// Get the size of the split participating communicator
MPI_Comm_size(this->comm_participating, &this->participating_size);
// Get the local rank within the split communicator
MPI_Comm_rank(this->comm_participating, &this->participating_rank);
}
return true;
}


} // namespace detail
} // namespace flamegpu
#endif
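
The `if (mpi != nullptr)` block added to CUDAEnsemble::simulate above is still empty in this commit. A hedged guess at the body it is presumably building towards, based on the new method and its documentation; the exact call site and argument are assumptions, not part of this change:

```cpp
// Sketch only: plausible completion of the empty MPI block in CUDAEnsemble::simulate,
// once this rank's device set is known. Not part of this commit.
#ifdef FLAMEGPU_ENABLE_MPI
    if (mpi != nullptr) {
        // Split the world communicator so that later collectives only involve
        // ranks which will actually execute simulations.
        mpi->createParticipatingCommunicator(static_cast<int>(devices.size()));
    }
#endif
```
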
