-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Does not currently support proper error handling (errors are caught and handled silently). Need to test compilation with MPI and write a test. Need to work out how we will write a test to divide up GPUs with local MPI.
- Loading branch information
Showing
18 changed files
with
2,587 additions
and
218 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
612 changes: 612 additions & 0 deletions
612
include/flamegpu/runtime/environment/DeviceEnvironmentDirectedGraph.cuh
Large diffs are not rendered by default.
Oops, something went wrong.
552 changes: 552 additions & 0 deletions
552
include/flamegpu/runtime/environment/HostEnvironmentDirectedGraph.cuh
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ | ||
#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ | ||
|
||
#include <atomic> | ||
#include <string> | ||
#include <thread> | ||
#include <mutex> | ||
#include <queue> | ||
#include <condition_variable> | ||
#include <vector> | ||
#include <memory> | ||
|
||
#include "flamegpu/defines.h" | ||
#include "flamegpu/simulation/LogFrame.h" | ||
|
||
namespace flamegpu { | ||
struct ModelData; | ||
class LoggingConfig; | ||
class StepLoggingConfig; | ||
class RunPlanVector; | ||
class CUDAEnsemble; | ||
namespace detail { | ||
/** | ||
* Common interface and implementation shared between SimRunner and MPISimRunner | ||
* | ||
* Each runner owns a std::thread and holds references to state owned elsewhere | ||
* (error counter, run counter, run plans, run logs, log export queue and its mutex/condition variable). | ||
* Those referenced objects must outlive the runner. | ||
* | ||
* The destructor is defaulted and does not join the thread. std::thread's destructor calls std::terminate() | ||
* if the thread is still joinable, so join() must have been called before a started runner is destroyed. | ||
*/ | ||
class AbstractSimRunner { | ||
friend class flamegpu::CUDAEnsemble; | ||
|
||
public: | ||
/** | ||
* Details of an error raised by a runner | ||
* Stored so that an exception can be thrown from the main thread (see fast_err_detail) | ||
* NOTE(review): run_id is presumably the index of the failed run plan, device_id/runner_id those of the runner which failed - confirm against the implementation | ||
*/ | ||
struct ErrorDetail { | ||
unsigned int run_id; | ||
unsigned int device_id; | ||
unsigned int runner_id; | ||
std::string exception_string; | ||
}; | ||
|
||
/** | ||
* Constructor, creates and initialises the underlying thread | ||
* @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances. | ||
* @param _err_ct Reference to an atomic integer for tracking how many errors have occurred | ||
* @param _next_run Atomic counter for safely selecting the next run plan to execute across multiple threads | ||
* @param _plans The vector of run plans to be executed by the ensemble | ||
* @param _step_log_config The config of which data should be logged each step | ||
* @param _exit_log_config The config of which data should be logged at run exit | ||
* @param _device_id The GPU that all runs should execute on | ||
* @param _runner_id A unique index assigned to the runner | ||
* @param _verbosity Verbosity level (Verbosity::Quiet, Verbosity::Default, Verbosity::Verbose) | ||
* @param run_logs Reference to the vector to store generated run logs | ||
* @param log_export_queue The queue of logs to be exported to disk | ||
* @param log_export_queue_mutex This mutex must be locked to access log_export_queue | ||
* @param log_export_queue_cdn The condition is notified every time a log has been added to the queue | ||
* @param fast_err_detail Structure to store error details on fast failure for main thread rethrow | ||
* @param _total_runners Total number of runners executing | ||
* @param _isSWIG Flag denoting whether it's a Python build of FLAMEGPU | ||
*/ | ||
AbstractSimRunner(const std::shared_ptr<const ModelData> _model, | ||
std::atomic<unsigned int> &_err_ct, | ||
std::atomic<unsigned int> &_next_run, | ||
const RunPlanVector &_plans, | ||
std::shared_ptr<const StepLoggingConfig> _step_log_config, | ||
std::shared_ptr<const LoggingConfig> _exit_log_config, | ||
int _device_id, | ||
unsigned int _runner_id, | ||
flamegpu::Verbosity _verbosity, | ||
std::vector<RunLog> &run_logs, | ||
std::queue<unsigned int> &log_export_queue, | ||
std::mutex &log_export_queue_mutex, | ||
std::condition_variable &log_export_queue_cdn, | ||
ErrorDetail &fast_err_detail, | ||
unsigned int _total_runners, | ||
bool _isSWIG); | ||
/** | ||
* Virtual class requires polymorphic destructor | ||
* Defaulted: it performs no work of its own, in particular it does not join the thread | ||
*/ | ||
virtual ~AbstractSimRunner() = default; | ||
/** | ||
* Start executing the SimRunner in its separate thread | ||
*/ | ||
virtual void start() = 0; | ||
/** | ||
* Blocking call which, if thread.joinable(), triggers thread.join() | ||
*/ | ||
void join(); | ||
|
||
protected: | ||
/** | ||
* Create and execute the simulation for the RunPlan within plans of given index | ||
* @param plan_id Index of the RunPlan within plans to be executed | ||
* @throws Exceptions during sim execution may be raised, these should be caught and handled by the caller | ||
*/ | ||
void runSimulation(int plan_id); | ||
/** | ||
* The thread which the SimRunner executes on | ||
*/ | ||
std::thread thread; | ||
/** | ||
* Each sim runner takes its own clone of model description hierarchy, so it can manipulate environment without conflict | ||
*/ | ||
const std::shared_ptr<const ModelData> model; | ||
/** | ||
* CUDA Device index of runner | ||
*/ | ||
const int device_id; | ||
/** | ||
* Per instance unique runner id | ||
*/ | ||
const unsigned int runner_id; | ||
/** | ||
* Total number of runners executing | ||
* This is used to calculate the progress on job completion | ||
*/ | ||
const unsigned int total_runners; | ||
/** | ||
* Flag for whether to print progress | ||
*/ | ||
const flamegpu::Verbosity verbosity; | ||
// External references | ||
/** | ||
* Reference to an atomic integer for tracking how many errors have occurred | ||
*/ | ||
std::atomic<unsigned int> &err_ct; | ||
/** | ||
* Atomic counter for safely selecting the next run plan to execute across multiple threads | ||
* This is used differently by each class of runner | ||
*/ | ||
std::atomic<unsigned int> &next_run; | ||
/** | ||
* Reference to the vector of run configurations to be executed | ||
*/ | ||
const RunPlanVector &plans; | ||
/** | ||
* Config specifying which data to log per step | ||
*/ | ||
const std::shared_ptr<const StepLoggingConfig> step_log_config; | ||
/** | ||
* Config specifying which data to log at run exit | ||
*/ | ||
const std::shared_ptr<const LoggingConfig> exit_log_config; | ||
/** | ||
* Reference to the vector to store generated run logs | ||
*/ | ||
std::vector<RunLog> &run_logs; | ||
/** | ||
* The queue of logs to be exported to disk | ||
*/ | ||
std::queue<unsigned int> &log_export_queue; | ||
/** | ||
* This mutex must be locked to access log_export_queue | ||
*/ | ||
std::mutex &log_export_queue_mutex; | ||
/** | ||
* The condition is notified every time a log has been added to the queue | ||
*/ | ||
std::condition_variable &log_export_queue_cdn; | ||
/** | ||
* If fail_fast is true, details of an error will be stored here so an exception can be thrown from the main thread | ||
*/ | ||
ErrorDetail& fast_err_detail; | ||
/** | ||
* If true, the model is using SWIG Python interface | ||
**/ | ||
const bool isSWIG; | ||
}; | ||
|
||
} // namespace detail | ||
} // namespace flamegpu | ||
|
||
#endif // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_ABSTRACTSIMRUNNER_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#ifndef INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ | ||
#define INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ | ||
|
||
#include <atomic> | ||
#include <memory> | ||
#include <mutex> | ||
#include <queue> | ||
#include <condition_variable> | ||
#include <thread> | ||
#include <vector> | ||
#include <string> | ||
|
||
#include "flamegpu/simulation/detail/AbstractSimRunner.h" | ||
#include "flamegpu/defines.h" | ||
#include "flamegpu/simulation/LogFrame.h" | ||
|
||
namespace flamegpu { | ||
struct ModelData; | ||
class LoggingConfig; | ||
class StepLoggingConfig; | ||
class RunPlanVector; | ||
class CUDAEnsemble; | ||
namespace detail { | ||
|
||
/** | ||
* A thread class which executes RunPlans on a single GPU, communicating with the main-thread which has jobs allocated via MPI | ||
* | ||
* This class is used by CUDAEnsemble, it creates one SimRunner instance per GPU, each executes in a separate thread. | ||
* There may be multiple instances per GPU, if running small models on large GPUs. | ||
* | ||
* All constructor arguments mirror those of AbstractSimRunner; only start() is declared here as an override. | ||
*/ | ||
class MPISimRunner : public AbstractSimRunner { | ||
// Reserved signal values, taken from the top of the unsigned int range | ||
// NOTE(review): presumably written to next_run so they cannot collide with valid run plan indices - confirm against the implementation | ||
// NOTE(review): UINT_MAX is declared in <climits>, which this header does not include directly; presumably it arrives transitively - confirm | ||
enum Signal : unsigned int { | ||
// MPISimRunner sets this to notify manager that it wants a new job | ||
RequestJob = UINT_MAX, | ||
// NOTE(review): presumably set to notify the manager that the assigned run failed - confirm against the implementation | ||
RunFailed = UINT_MAX-1, | ||
}; | ||
|
||
public: | ||
/** | ||
* Constructor, creates and initialises a new MPISimRunner | ||
* @param _model A copy of the ModelDescription hierarchy for the RunPlanVector, this is used to create the CUDASimulation instances. | ||
* @param _err_ct Reference to an atomic integer for tracking how many errors have occurred | ||
* @param _next_run Atomic counter for safely selecting the next run plan to execute across multiple threads | ||
* @param _plans The vector of run plans to be executed by the ensemble | ||
* @param _step_log_config The config of which data should be logged each step | ||
* @param _exit_log_config The config of which data should be logged at run exit | ||
* @param _device_id The GPU that all runs should execute on | ||
* @param _runner_id A unique index assigned to the runner | ||
* @param _verbosity Verbosity level (Verbosity::Quiet, Verbosity::Default, Verbosity::Verbose) | ||
* @param run_logs Reference to the vector to store generated run logs | ||
* @param log_export_queue The queue of logs to be exported to disk | ||
* @param log_export_queue_mutex This mutex must be locked to access log_export_queue | ||
* @param log_export_queue_cdn The condition is notified every time a log has been added to the queue | ||
* @param fast_err_detail Structure to store error details on fast failure for main thread rethrow | ||
* @param _total_runners Total number of runners executing | ||
* @param _isSWIG Flag denoting whether it's a Python build of FLAMEGPU | ||
*/ | ||
MPISimRunner(const std::shared_ptr<const ModelData> _model, | ||
std::atomic<unsigned int> &_err_ct, | ||
std::atomic<unsigned int> &_next_run, | ||
const RunPlanVector &_plans, | ||
std::shared_ptr<const StepLoggingConfig> _step_log_config, | ||
std::shared_ptr<const LoggingConfig> _exit_log_config, | ||
int _device_id, | ||
unsigned int _runner_id, | ||
flamegpu::Verbosity _verbosity, | ||
std::vector<RunLog> &run_logs, | ||
std::queue<unsigned int> &log_export_queue, | ||
std::mutex &log_export_queue_mutex, | ||
std::condition_variable &log_export_queue_cdn, | ||
ErrorDetail &fast_err_detail, | ||
unsigned int _total_runners, | ||
bool _isSWIG); | ||
/** | ||
* Start executing the SimRunner in its separate thread | ||
*/ | ||
void start() override; | ||
}; | ||
|
||
} // namespace detail | ||
} // namespace flamegpu | ||
|
||
#endif // INCLUDE_FLAMEGPU_SIMULATION_DETAIL_MPISIMRUNNER_H_ |
Oops, something went wrong.