Commit

wip
ptheywood committed Nov 27, 2023
1 parent 982c248 commit c3b23ea
Showing 4 changed files with 56 additions and 25 deletions.
8 changes: 6 additions & 2 deletions include/flamegpu/simulation/detail/MPIEnsemble.h
@@ -98,10 +98,14 @@ class MPIEnsemble {
std::vector<AbstractSimRunner::ErrorDetail> &err_detail_local, int i);
/**
* Create the split MPI Communicator based on whether the rank is participating in ensemble execution or not, as determined by the group rank and number of local GPUs.
* @param localGPUCount the number of GPU devices for the current rank
* @param isParticipating If this rank is participating (i.e. it has a local device assigned)
* @return success of this method
*/
bool createParticipatingCommunicator(int localGPUCount);
bool createParticipatingCommunicator(bool isParticipating);
/**
* Accessor method for whether the rank is participating or not (i.e. the colour of the communicator split)
*/
int getRankIsParticipating() { return this->rank_is_participating; }
/**
* Accessor method for the size of the MPI communicator containing "participating" (or non-participating) ranks
*/
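For orientation only (this snippet is not part of the commit): a minimal, hypothetical call-site sketch showing how the two members above are intended to fit together. The mpi pointer and devices set are assumptions standing in for the CUDAEnsemble internals, so this fragment is illustrative rather than compilable on its own.

// Hypothetical call site, not FLAMEGPU source: assume `mpi` points at a
// constructed detail::MPIEnsemble and `devices` holds the CUDA devices
// assigned to this rank.
const bool isParticipating = !devices.empty();  // participate only with >= 1 local GPU
mpi->createParticipatingCommunicator(isParticipating);
if (mpi->getRankIsParticipating()) {
    // this rank joined the "participating" communicator and will run simulations
} else {
    // this rank sits out and must not block on participating-only collectives
}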
1 change: 0 additions & 1 deletion include/flamegpu/simulation/detail/MPISimRunner.h
@@ -60,7 +60,6 @@ class MPISimRunner : public AbstractSimRunner {
std::shared_ptr<const StepLoggingConfig> _step_log_config,
std::shared_ptr<const LoggingConfig> _exit_log_config,
int _device_id,
int _device_count_on_rank,
unsigned int _runner_id,
flamegpu::Verbosity _verbosity,
std::map<unsigned int, RunLog> &run_logs,
58 changes: 43 additions & 15 deletions src/flamegpu/simulation/CUDAEnsemble.cu
@@ -157,25 +157,29 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
} else {
// If no devices were specified by the user, use all visible devices but load balance if MPI is in use.
#ifdef FLAMEGPU_ENABLE_MPI
// If using more than one rank per node, but <= the number of devices, evenly balance (as best as possible)
if (mpi != nullptr && mpi->group_size > 1 && mpi->group_size <= device_count) {
// If not using MPI, or a single rank is in use on this node, use all gpus.
if (mpi == nullptr || mpi->local_size == 1) {
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
}
// If using MPI, with more than one rank on this node, but fewer ranks than gpus, attempt to load balance
} else if (mpi->local_size > 1 && mpi->local_size <= device_count) {
// find the balanced number of gpus per rank, and how many will need +1
int gpusPerRank = device_count / mpi->group_size;
int unallocated = device_count - (gpusPerRank * mpi->group_size);
int gpusPerRank = device_count / mpi->local_size;
int unallocated = device_count - (gpusPerRank * mpi->local_size);
// Compute the indices of the first and last gpu to be assigned to the current rank, based on how many lower ranks will have +1
int lowerRanksWithPlus1 = mpi->group_rank < unallocated ? mpi->group_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->group_rank - unallocated);
int lowerRanksWithPlus1 = mpi->local_rank < unallocated ? mpi->local_rank : unallocated;
int lowerranksWithPlus0 = std::max(0, mpi->local_rank - unallocated);
int first = (lowerRanksWithPlus1 * (gpusPerRank + 1)) + (lowerranksWithPlus0 * gpusPerRank);
int last = mpi->group_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
int last = mpi->local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
// Assign the devices for this rank
for (int i = first; i < last; i++) {
devices.emplace(i);
}
// Otherwise, if using one rank per node or when there are more mpi ranks on the current node than GPUs, use all devices.
// Otherwise, if more ranks are on this node than gpus, use 1 gpu per rank until there are no gpus left.
} else {
// If using MPI with a single rank per node, use all devices
for (int i = 0; i < device_count; ++i) {
devices.emplace(i);
if (mpi->local_rank < device_count) {
devices.emplace(mpi->local_rank);
}
}
#else // FLAMEGPU_ENABLE_MPI
@@ -185,6 +189,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
#endif // FLAMEGPU_ENABLE_MPI
}
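To make the balanced-assignment arithmetic above easier to check, here is a standalone sketch (not part of the diff) that reproduces the same index calculation for one hypothetical node; the device_count and local_size values are assumptions chosen for illustration.

// Standalone illustration of the gpusPerRank / unallocated / first / last
// arithmetic used above, for an assumed node with 4 GPUs and 3 local ranks.
#include <algorithm>
#include <cstdio>

int main() {
    const int device_count = 4;  // assumed GPUs visible on this node
    const int local_size = 3;    // assumed MPI ranks on this node
    const int gpusPerRank = device_count / local_size;
    const int unallocated = device_count - (gpusPerRank * local_size);
    for (int local_rank = 0; local_rank < local_size; ++local_rank) {
        const int lowerRanksWithPlus1 = std::min(local_rank, unallocated);
        const int lowerRanksWithPlus0 = std::max(0, local_rank - unallocated);
        const int first = lowerRanksWithPlus1 * (gpusPerRank + 1) + lowerRanksWithPlus0 * gpusPerRank;
        const int last = local_rank < unallocated ? first + gpusPerRank + 1 : first + gpusPerRank;
        std::printf("local rank %d -> devices [%d, %d)\n", local_rank, first, last);
    }
    return 0;
}

With these assumed values the three ranks claim devices {0,1}, {2} and {3}, so every GPU is used exactly once and the remainder goes to the lowest ranks.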
// Check that each device is capable, and init cuda context
size_t devices_size_before = devices.size();

Check failure on line 192 in src/flamegpu/simulation/CUDAEnsemble.cu

GitHub Actions / build (CUDA 11.0.3, 11.8.0 and 12.3.0; windows-2019/2022, multiple configurations): 'devices_size_before': local variable is initialized but not referenced [D:\a\FLAMEGPU2\FLAMEGPU2\build\FLAMEGPU\flamegpu.vcxproj]; the warning is treated as an error.
for (auto d = devices.begin(); d != devices.end(); ++d) {
if (!detail::compute_capability::checkComputeCapability(*d)) {
fprintf(stderr, "FLAMEGPU2 has not been built with an appropriate compute capability for device %d, this device will not be used.\n", *d);
@@ -198,15 +203,38 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
// Return to device 0 (or check original device first?)
gpuErrchk(cudaSetDevice(0));

// If there are no devices left, we need to error as the work cannot be executed.
// If there are no devices left (and we are not using mpi), we need to error as the work cannot be executed.
#ifdef FLAMEGPU_ENABLE_MPI
if (devices.size() == 0 && mpi == nullptr) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has not been built with an appropraite compute capability for any devices, unable to continue\n");
}
#else // FLAMEGPU_ENABLE_MPI
if (devices.size() == 0) {
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has nto been built with an appropraite compute capability for any devices, unable to continue\n");
THROW exception::InvalidCUDAdevice("FLAMEGPU2 has not been built with an appropraite compute capability for any devices, unable to continue\n");
}
#endif // FLAMEGPU_ENABLE_MPI

// Once the number of devices per rank is known, we can create the actual communicator to be used during MPI
#ifdef FLAMEGPU_ENABLE_MPI
if (mpi != nullptr) {

// This rank is participating if it has at least one device assigned to it.

Check failure on line 220 in src/flamegpu/simulation/CUDAEnsemble.cu

GitHub Actions / cpplint (11.8, ubuntu-20.04): Line ends in whitespace. Consider deleting these extra spaces.
mpi->createParticipatingCommunicator(devices.size() > 0);
// @todo - if world rank 0 is not participating, i.e. no gpu, then this is all going to break, so instead participating rank 0 will need to be the "main" mpi rank?

// @debug - print the world, local and participating ranks/sizes.
printf("World %d/%d, Local %d/%d, gpus %zu, participating %d %d/%d\n", mpi->world_rank, mpi->world_size, mpi->local_rank, mpi->local_size, devices.size(), mpi->getRankIsParticipating(), mpi->getParticipatingCommRank(), mpi->getParticipatingCommSize());

// @todo - if the participating communicator size is 0, mpi needs to error out, there were no usable gpus.
// @todo - use the new communicator; at the moment, this will hang if anyone is not participating.

if (mpi->world_rank == 0 && mpi->getRankIsParticipating() == 0) {
// @todo - error if world rank 0 doesn't have a gpu. kill everything.

// If the world size is not the participating size, issue a warning.
if (mpi->world_size != mpi->getParticipatingCommSize() && config.verbosity >= Verbosity::Default) {
fprintf(stderr, "Warning: MPI Ensemble launched with %d MPI ranks, but only %d ranks have GPUs assigned. Additional %d ranks will not execute any simulations.\n", mpi->world_size, mpi->getParticipatingCommSize(), mpi->getParticipatingCommSize() - mpi->world_size);
}
}
}
#endif
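For reference, a standalone sketch (not FLAMEGPU code) of the participation split performed in the block above: each rank derives a colour from whether it has a device, MPI_COMM_WORLD is split on that colour, and rank 0 warns when some ranks are left without GPUs. The two-rank cutoff used to fake device availability is purely an assumption for illustration.

// Standalone MPI sketch: split MPI_COMM_WORLD by a "participating" colour.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0, world_size = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // Assumption: pretend only the first two ranks have a usable GPU.
    const int is_participating = world_rank < 2 ? 1 : 0;
    // Colour = participation flag; key = world rank preserves the original ordering.
    MPI_Comm comm_participating = MPI_COMM_NULL;
    MPI_Comm_split(MPI_COMM_WORLD, is_participating, world_rank, &comm_participating);
    int participating_rank = -1, participating_size = 0;
    MPI_Comm_rank(comm_participating, &participating_rank);
    MPI_Comm_size(comm_participating, &participating_size);
    if (world_rank == 0 && world_size != participating_size) {
        std::fprintf(stderr, "Warning: %d of %d ranks have no GPU assigned and will not execute simulations.\n",
                     world_size - participating_size, world_size);
    }
    MPI_Comm_free(&comm_participating);
    MPI_Finalize();
    return 0;
}

Note that ranks with colour 0 also receive a (non-participating) communicator from the split, which is why the accessor documentation above talks about "participating (or non-participating)" ranks.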

@@ -269,7 +297,7 @@ unsigned int CUDAEnsemble::simulate(const RunPlanVector& plans) {
int flag;
int mpi_runners_fin = 1; // Start at 1 because we have always already finished
// Wait for all runs to have been assigned, and all MPI runners to have been notified of fin
while (next_run < plans.size() || mpi_runners_fin < mpi->world_size) {
while (next_run < plans.size() || mpi_runners_fin < mpi->getParticipatingCommSize()) {
// Check for errors
const int t_err_count = mpi->receiveErrors(err_detail);
err_count += t_err_count;
14 changes: 7 additions & 7 deletions src/flamegpu/simulation/detail/MPIEnsemble.cu
@@ -10,11 +10,11 @@ MPIEnsemble::MPIEnsemble(const CUDAEnsemble::EnsembleConfig &_config, const unsi
: config(_config)
, world_rank(queryMPIWorldRank())
, world_size(queryMPIWorldSize())
, group_rank(queryMPISharedGroupRank())
, group_size(queryMPISharedGroupSize())
, local_rank(queryMPISharedGroupRank())
, local_size(queryMPISharedGroupSize())
, total_runs(_total_runs)
, MPI_ERROR_DETAIL(AbstractSimRunner::createErrorDetailMPIDatatype())
, rank_is_participating(false);
, rank_is_participating(false)
, comm_participating(MPI_COMM_NULL)
, participating_size(0)
, participating_rank(-1) { }
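The local_rank and local_size members above come from queryMPISharedGroupRank / queryMPISharedGroupSize. The usual MPI idiom for deriving a per-node rank and size is a shared-memory split of the world communicator, sketched below as a standalone program; whether the query helpers are implemented exactly this way is an assumption.

// Standalone MPI sketch: derive a per-node ("local") rank and size.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int world_rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    // Group ranks that share a node (shared-memory domain) into one communicator each.
    MPI_Comm node_comm = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, &node_comm);
    int local_rank = 0, local_size = 0;
    MPI_Comm_rank(node_comm, &local_rank);
    MPI_Comm_size(node_comm, &local_size);
    std::printf("world rank %d is local rank %d of %d on this node\n", world_rank, local_rank, local_size);
    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}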
@@ -177,7 +177,7 @@ std::string MPIEnsemble::assembleGPUsString() {
}
free(buff);
} else {
const std::string d_string = group_rank == 0 ? compute_capability::getDeviceNames(config.devices) : "";
const std::string d_string = local_rank == 0 ? compute_capability::getDeviceNames(config.devices) : "";
// Send GPU count
MPI_Send(
d_string.c_str(), // void* data
@@ -286,11 +286,11 @@ std::vector<detail::AbstractSimRunner::ErrorDetail> &err_detail_local, const int
}
}

bool MPIEnsemble::createParticipatingCommunicator(const int localGPUCount) {
bool MPIEnsemble::createParticipatingCommunicator(const bool isParticipating) {
// If the communicator has not yet been created, create it and get the rank and size.
if (this->comm_participating != MPI_COMM_NULL) {
if (this->comm_participating == MPI_COMM_NULL) {
// determine if this rank is participating or not, i.e. the colour of the rank
this->rank_is_participating = true; // 'todo
this->rank_is_participating = isParticipating;
// Split the world communicator; if the split fails, abort (which makes the boolean return value somewhat redundant). // @todo tweak the error handling.
if (MPI_Comm_split(MPI_COMM_WORLD, this->rank_is_participating, this->world_rank, &this->comm_participating) != MPI_SUCCESS) {
fprintf(stderr, "Error creating communicator\n");
