From 886b5a9404b4cf2f88f968c9c835db2ef61d6d15 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 11:46:09 +0100 Subject: [PATCH 01/11] First attempt at adding an MPI extension --- Project.toml | 2 + ext/MPIExt/MPIExt.jl | 114 +++++++++++++++++++++++++++++++++++++++ ext/MPIExt/utils.jl | 123 +++++++++++++++++++++++++++++++++++++++++++ src/Experimenter.jl | 43 +++++++++++++-- src/runner.jl | 108 +++++++++++++++++++++++++------------ 5 files changed, 352 insertions(+), 38 deletions(-) create mode 100644 ext/MPIExt/MPIExt.jl create mode 100644 ext/MPIExt/utils.jl diff --git a/Project.toml b/Project.toml index df0d3e1..98fdac5 100644 --- a/Project.toml +++ b/Project.toml @@ -18,9 +18,11 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" [extensions] SlurmExt = ["ClusterManagers"] +MPIExt = ["MPI"] [compat] DataFrames = "1" diff --git a/ext/MPIExt/MPIExt.jl b/ext/MPIExt/MPIExt.jl new file mode 100644 index 0000000..7de5c75 --- /dev/null +++ b/ext/MPIExt/MPIExt.jl @@ -0,0 +1,114 @@ +module MPIExt + +############ Module dependencies ############ +if isdefined(Base, :get_extension) + using Experimenter + using MPI +else + using ..Experimenter + using ..MPI +end + +include("utils.jl") + + +############ Module Code ############ +function Experimenter.Cluster.init_mpi() + # Setup SLURM + MPI.Init(; threadlevel=:multiple) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + comm_size = MPI.Comm_size(comm) + if comm_size < 2 + error("Not enough MPI processes were launched to run experiments in MPI mode. MPI mode requires at least two processes.") + exit(1) + end + + if rank == 0 + @info "Initialised MPI with $(comm_size) workers." + end +end + +function Experimenter._mpi_run_job(runner::Experimenter.Runner, trials::AbstractArray{Experimenter.Trial}) + @assert runner.execution_mode isa MPIMode + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + if rank == 0 + coordinator_loop(runner.experiment, runner.database, trials) + else + @warn "[WORKER $(rank)] Reached a function that it should not be able to reach." + end + # TODO: Add a way to override this finalise? + MPI.Finalize() +end + +function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + if rank == 0 + error("The first process is reserved for the coordinator and cannot be a worker.") + end + + worker = WorkerNode( + comm, + rank, + false, + 0, + trial_fn + ) + + job_size = batch_size + job_request = JobRequest(worker.mpi_rank, job_size) + + @debug "[WORKER $(rank)] Loaded." + + MPI.Barrier(comm) + + while !worker.has_stopped + @debug "[WORKER $(rank)] Loaded." + send_variable_message(worker.comm, job_request, 0; should_block = true) + response, source = recieve_variable_message(worker.comm) + handle_response!(worker, response) + end + + MPI.Barrier(comm) +end + +function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trails::AbstractArray{Experimenter.Trial}) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + if rank != 0 + error("The coordinator job should be run on the first process.") + end + comm_size = MPI.Comm_size(comm) + coordinator = Coordinator( + comm, + length(experiments), + 1, + 0, + 0, + comm_size-1, + experiment, + trials, + db + ) + + MPI.Barrier(comm) + + @info "[COORDINATOR] $(comm_size - 1) workers ready. Starting experiment with $(length(trials)) trials." + + while coordinator.num_workers_closed != coordinator.num_workers + # Listen for messages + request, _ = recieve_variable_message(coordinator.comm) + handle_request!(coordinator, request) + end + + @info "[COORDINATOR] Finished." + + MPI.Barrier(comm) + + @info "[COORDINATOR] All workers finished." + +end + +end \ No newline at end of file diff --git a/ext/MPIExt/utils.jl b/ext/MPIExt/utils.jl new file mode 100644 index 0000000..10929c7 --- /dev/null +++ b/ext/MPIExt/utils.jl @@ -0,0 +1,123 @@ +using Serialization +using Logging + + +function send_variable_message(comm, data, dest; tag=MPI.ANY_TAG, should_block=false) + if tag == MPI.ANY_TAG # Override any tag + tag = 0117 + end + send_req = MPI.isend(data, comm; dest, tag=tag) + if should_block + MPI.wait(send_req) + end + return nothing +end +function recieve_variable_message(comm; source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG) + x, status = MPI.recv(comm, MPI.Status; source, tag) + return x, status.source +end + + +abstract type AbstractRequest end +abstract type AbstractResponse end + +struct JobRequest <: AbstractRequest + from::Int + num_jobs::Int +end + +struct JobResponse + num_jobs::Int + trial_details::Vector{Tuple{UUID, Dict{Symbol, Any}}} +end + +struct SaveRequest <: AbstractRequest + from::Int + trial_results::Vector{Tuple{UUID, Dict{Symbol, Any}}} +end + +struct NoMoreJobsResponse <: AbstractResponse end + +mutable struct Coordinator + comm::MPI.Comm + num_jobs::Int + job_id::Int + num_saved::Int + num_workers_closed::Int + num_workers::Int + experimenter::Experimenter.Experiment + trials::Vector{Experimenter.Trial} + database::Experimenter.ExperimentDatabase +end + +mutable struct WorkerNode + comm::MPI.Comm + mpi_rank::Int + has_stopped::Bool + jobs_completed::Int + run_fn::Function +end + + +function handle_request!(::Coordinator, request::AbstractRequest) + @warn "[COORDINATOR] Recieved request of type $(typeof(request)), with no implementation" +end +function handle_response!(worker::WorkerNode, response::AbstractResponse) + @warn "[WORKER $(worker.mpi_rank)] Recieved response of type $(typeof(response)), with no implementation" +end + +function handle_response!(worker::WorkerNode, ::NoMoreJobsResponse) + worker.has_stopped = true + @info "[WORKER $(worker.mpi_rank)] Finished." + nothing +end +function handle_response!(worker::WorkerNode, response::JobResponse) + results = map(response.trial_details) do (trial_id, configuration) + result = worker.run_fn(configuration, trial_id) + worker.jobs_completed += 1 + return (trial_id, result) + end + save_req = SaveRequest(worker.mpi_rank, results) + send_variable_message(comm, save_req, 0) + @debug "[WORKER $(worker.mpi_rank)] Completed $(response.num_jobs) jobs." + nothing +end +function send_quit_response!(coordinator::Coordinator, target::Int) + coordinator.num_workers_closed += 1 + send_variable_message(coordinator.comm, NoMoreJobsResponse(), target) + + @debug "[COORDINATOR] No more jobs to send to Worker $(target)." +end +function handle_request!(coordinator::Coordinator, request::JobRequest) + job_id = coordinator.job_id + + n = request.num_jobs + n = min(length(coordinator.trials)-job_id+1, n) + + target = request.from + if n == 0 # If there are no more jobs left, tell requesting node + send_quit_response!(coordinator, target) + return nothing + end + + trial_data = map(job_id:(job_id+n-1)) do id + t = coordinator.trials[id] + (t.id, t.configuration) + end + response = JobResponse(n, trial_data) + send_variable_message(coordinator.comm, response, target) + @debug "[COORDINATOR] Sent $(n) jobs to Worker $(target)." + + coordinator.job_id = job_id + n + nothing +end +function handle_request!(coordinator::Coordinator, request::SaveRequest) + results = request.trial_results + @debug "[COORDINATOR] Recieved $(length(results)) results from Worker $(request.from)." + + for (trial_id, result) in results + Experimenter.complete_trial!(coordinator.database, trial_id, result) + end + + nothing +end \ No newline at end of file diff --git a/src/Experimenter.jl b/src/Experimenter.jl index e7512f9..7d5b9e7 100644 --- a/src/Experimenter.jl +++ b/src/Experimenter.jl @@ -9,16 +9,42 @@ include("runner.jl") module Cluster - function init_cluster_support() + function init_slurm_support() @eval Main using ClusterManagers if isdefined(Base, :get_extension) @eval Main Base.retry_load_extensions() end end - function install_cluster_support() + function install_slurm_support() @eval Main import Pkg @eval Main Pkg.add(["ClusterManagers"]) end + function init_mpi_support() + @eval Main using MPI + if isdefined(Base, :get_extension) + @eval Main Base.retry_load_extensions() + end + end + function install_mpi_support() + @eval Main import Pkg + @eval Main Pkg.add(["MPI"]) + end + + function _can_import_mpi() + try + import MPI + return true + catch + return false + end + end + + function _try_detect_mpi() + haskey(ENV, "OMPI_COMM_WORLD_RANK") && return true + haskey(ENV, "PMI_RANK") && return true + haskey(ENV, "MV2_COMM_WORLD_RANK") && return true + return false + end """ init(; kwargs...) @@ -32,9 +58,15 @@ module Cluster management system. Check the `ext` folder for extensions to see which keywords are supported. """ - function init(; kwargs...) + function init(; force_mpi=false, kwargs...) + if _can_import_mpi() + @eval Main Experimenter.Cluster.init_mpi_support() + if force_mpi || _try_detect_mpi() + @eval Main Experimenter.Cluster.init_mpi(; $(kwargs)...) + end + end if haskey(ENV, "SLURM_JOB_NAME") - @eval Main Experimenter.Cluster.init_cluster_support() + @eval Main Experimenter.Cluster.init_slurm_support() @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...) else @info "Cluster not detected, doing nothing." @@ -89,8 +121,9 @@ module Cluster nothing end function init_slurm end + function init_mpi end - export init, install_cluster_support, init_cluster_support + export init, install_slurm_support, init_slurm_support end using PackageExtensionCompat diff --git a/src/runner.jl b/src/runner.jl index 84419a8..4203c64 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -13,7 +13,9 @@ module ExecutionModes struct HeterogeneousMode <: AbstractExecutionMode threads_per_node::Int end - + struct MPIMode <: AbstractExecutionMode + batch_jobs::Int + end const _SerialModeSingleton = SerialMode() const _MultithreadedModeSingleton = MultithreadedMode() @@ -25,11 +27,24 @@ import .ExecutionModes: _SerialModeSingleton as SerialMode import .ExecutionModes: _MultithreadedModeSingleton as MultithreadedMode import .ExecutionModes: _DistributedModeSingleton as DistributedMode import .ExecutionModes: HeterogeneousMode +import .ExecutionModes: MPIMode +include("../ext/MPIExt/utils.jl") @doc raw"Executes the trials of the experiment one of the other, sequentially." SerialMode @doc raw"Executes the trials of the experiment in parallel using `Threads.@Threads`" MultithreadedMode @doc raw"Executes the trials of the experiment in parallel using `Distributed.jl`s `pmap`." DistributedMode @doc raw"Executes the trials of the experiment in parallel using a custom scheduler that uses all threads of each worker." HeterogeneousMode +@doc raw"Executes the trials of the experiment in parallel using `MPI`, which uses one MPI node for coordination and saving of jobs." MPIMode + + +# Function calls to be overwritten +""" + _mpi_begin_job(runner::Runner, trials::AbstractArray{Trial}) + +Executes the MPI process that becomes a coordinator or a worker, depending on the rank. +""" +function _mpi_run_job end +function _mpi_worker_loop end # Global database const global_database = Ref{Union{Missing, ExperimentDatabase}}(missing) @@ -57,45 +72,70 @@ macro execute(experiment, database, mode=SerialMode, use_progress=false, directo quote $(esc(experiment)) = restore_from_db($(esc(database)), $(esc(experiment))) let runner = Runner(experiment=$(esc(experiment)), database=$(esc(database)), execution_mode=$(esc(mode))) - push!(runner.database, runner.experiment) - existing_trials = get_trials(runner.database, runner.experiment.id) + is_mpi_worker = false + if runner.execution_mode isa MPIMode + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + if rank > 0 + is_mpi_worker = true + + # Load the trial code straight away + dir = $(esc(directory)) + cd(dir) + include_file = runner.experiment.include_file + include_file_path = joinpath(dir, include_file) + if !ismissing(include_file) + Base.include(Main, "$include_file_path") + end + + # Run the work loop + fn = eval(Main, Meta.parse(runner.experiment.function_name)) + _mpi_worker_loop(runner.execution_mode.batch_jobs, fn) + end + end - completed_trials = [trial for trial in existing_trials if trial.has_finished] - completed_uuids = Set(trial.id for trial in completed_trials) - # Only take unrun trials - incomplete_trials = [trial for trial in runner.experiment if !(trial.id in completed_uuids)] + if !is_mpi_worker + push!(runner.database, runner.experiment) + existing_trials = get_trials(runner.database, runner.experiment.id) - # Push all incomplete trials to the database - for trial in incomplete_trials - push!(runner.database, trial) - end + completed_trials = [trial for trial in existing_trials if trial.has_finished] + completed_uuids = Set(trial.id for trial in completed_trials) + # Only take unrun trials + incomplete_trials = [trial for trial in runner.experiment if !(trial.id in completed_uuids)] - dir = $(esc(directory)) - if runner.execution_mode == DistributedMode - current_environment = dirname(Pkg.project().path) - @info "Activating environments..." - @everywhere using Pkg - wait.([remotecall(Pkg.activate, i, current_environment) for i in workers()]) - @everywhere using Experimenter - # Make sure each worker is in the right directory - @info "Switching to '$dir'..." - wait.([remotecall(cd, i, dir) for i in workers()]) - end + # Push all incomplete trials to the database + for trial in incomplete_trials + push!(runner.database, trial) + end - cd(dir) - include_file = runner.experiment.include_file - include_file_path = joinpath(dir, include_file) - if !ismissing(include_file) - if requires_distributed(runner.execution_mode) - code = Meta.parse("Base.include(Main, raw\"$include_file_path\")") - includes_calls = [remotecall(Base.eval, i, code) for i in workers()] - wait.(includes_calls) + dir = $(esc(directory)) + if runner.execution_mode == DistributedMode + current_environment = dirname(Pkg.project().path) + @info "Activating environments..." + @everywhere using Pkg + wait.([remotecall(Pkg.activate, i, current_environment) for i in workers()]) + @everywhere using Experimenter + # Make sure each worker is in the right directory + @info "Switching to '$dir'..." + wait.([remotecall(cd, i, dir) for i in workers()]) + end + + cd(dir) + include_file = runner.experiment.include_file + include_file_path = joinpath(dir, include_file) + if !ismissing(include_file) + if requires_distributed(runner.execution_mode) + code = Meta.parse("Base.include(Main, raw\"$include_file_path\")") + includes_calls = [remotecall(Base.eval, i, code) for i in workers()] + wait.(includes_calls) + end + + Base.include(Main, "$include_file_path") end - Base.include(Main, "$include_file_path") - end - run_trials(runner, incomplete_trials; use_progress=$(esc(use_progress))) + run_trials(runner, incomplete_trials; use_progress=$(esc(use_progress))) + end end end end @@ -298,6 +338,8 @@ function run_trials(runner::Runner, trials::AbstractArray{Trial}; use_progress=f (id, results) = execute_trial(runner.experiment.function_name, trial) complete_trial!(runner.database, id, results) end + elseif execution_mode isa MPIMode + _mpi_run_job() else @info "Running $(length(trials)) trials" for trial in iter From 29d99a0371dc84d9eba757c43bd0a1d7d7123daa Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 13:59:28 +0100 Subject: [PATCH 02/11] Fixed bugs in initial MPI implementation --- README.md | 2 +- docs/src/index.md | 2 +- examples/mpi/.gitignore | 3 + examples/mpi/LocalPreferences.toml | 9 +++ examples/mpi/Project.toml | 6 ++ examples/mpi/check_results.jl | 9 +++ examples/mpi/mpi_run.sh | 1 + examples/mpi/my_experiment.jl | 19 +++++ examples/mpi/run.jl | 23 ++++++ ext/MPIExt/MPIExt.jl | 19 ++--- ext/MPIExt/utils.jl | 6 +- src/Experimenter.jl | 121 +---------------------------- src/cluster.jl | 117 ++++++++++++++++++++++++++++ src/database.jl | 7 +- src/experiment.jl | 4 +- src/runner.jl | 26 ++++--- 16 files changed, 222 insertions(+), 152 deletions(-) create mode 100644 examples/mpi/.gitignore create mode 100644 examples/mpi/LocalPreferences.toml create mode 100644 examples/mpi/Project.toml create mode 100644 examples/mpi/check_results.jl create mode 100755 examples/mpi/mpi_run.sh create mode 100644 examples/mpi/my_experiment.jl create mode 100644 examples/mpi/run.jl create mode 100644 src/cluster.jl diff --git a/README.md b/README.md index 9d9de41..e8964a9 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - Create a local SQLite database to store the results of your experiment. - Provides a standard structure for executing code across a range of parameters. - Provides saving of results into the database using standard Julia types. -- Provides an `@execute` macro that will execute an experiment (consisting of many trails with different parameters). Can execute serially, or in parallel with a choice of multithreading or multiprocessing. +- Provides an `@execute` macro that will execute an experiment (consisting of many trials with different parameters). Can execute serially, or in parallel with a choice of multithreading or multiprocessing. - Automatically skips completed trials. Head over to the [Getting Started](https://jamiemair.github.io/Experimenter.jl/stable/getting_started/) section of the documentation to see how to use this package. diff --git a/docs/src/index.md b/docs/src/index.md index cadba80..23779d8 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -10,7 +10,7 @@ CurrentModule = Experimenter - Create a local SQLite database to store the results of your experiment. - Provides a standard structure for executing code across a range of parameters. - Provides saving of results into the database using standard Julia types. -- Provides an `@execute` macro that will execute an experiment (consisting of many trails with different parameters). Can execute serially, or in parallel with a choice of multithreading or multiprocessing. +- Provides an `@execute` macro that will execute an experiment (consisting of many trials with different parameters). Can execute serially, or in parallel with a choice of multithreading or multiprocessing. - Automatically skips completed trials. Head over to [Getting Started](@ref) to get an overview of this package. diff --git a/examples/mpi/.gitignore b/examples/mpi/.gitignore new file mode 100644 index 0000000..ce7429b --- /dev/null +++ b/examples/mpi/.gitignore @@ -0,0 +1,3 @@ +results/ +*.out +LocalPreferences.toml \ No newline at end of file diff --git a/examples/mpi/LocalPreferences.toml b/examples/mpi/LocalPreferences.toml new file mode 100644 index 0000000..d970ab6 --- /dev/null +++ b/examples/mpi/LocalPreferences.toml @@ -0,0 +1,9 @@ +[MPIPreferences] +__clear__ = ["preloads_env_switch"] +_format = "1.0" +abi = "OpenMPI" +binary = "system" +cclibs = [] +libmpi = "libmpi" +mpiexec = "mpiexec" +preloads = [] diff --git a/examples/mpi/Project.toml b/examples/mpi/Project.toml new file mode 100644 index 0000000..cda965e --- /dev/null +++ b/examples/mpi/Project.toml @@ -0,0 +1,6 @@ +[deps] +Experimenter = "6aee034a-9508-47b1-8e11-813cc29af79f" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" + +[extras] +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" diff --git a/examples/mpi/check_results.jl b/examples/mpi/check_results.jl new file mode 100644 index 0000000..7779b6c --- /dev/null +++ b/examples/mpi/check_results.jl @@ -0,0 +1,9 @@ +using Experimenter +db = open_db("experiments.db", "results", false) +trials = get_trials_by_name(db, "Test Experiment") + +for (i, t) in enumerate(trials) + hostname = t.results[:hostname] + id = t.results[:pid] + println("Trial $i ran on $hostname on worker $id") +end \ No newline at end of file diff --git a/examples/mpi/mpi_run.sh b/examples/mpi/mpi_run.sh new file mode 100755 index 0000000..15b6023 --- /dev/null +++ b/examples/mpi/mpi_run.sh @@ -0,0 +1 @@ +mpirun -n 4 julia --project my_experiment.jl --threads=1 \ No newline at end of file diff --git a/examples/mpi/my_experiment.jl b/examples/mpi/my_experiment.jl new file mode 100644 index 0000000..187f0bc --- /dev/null +++ b/examples/mpi/my_experiment.jl @@ -0,0 +1,19 @@ +using Experimenter + +config = Dict{Symbol,Any}( + :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]), + :seed => IterableVariable([1234, 4321, 3467, 134234, 121]), + :sigma => 0.0001) +experiment = Experiment( + name="Test Experiment", + include_file="run.jl", + function_name="run_trial", + configuration=deepcopy(config) +) + +db = open_db("experiments.db", "results", false) + +# Init the cluster +Experimenter.Cluster.init() + +@execute experiment db MPIMode(1) \ No newline at end of file diff --git a/examples/mpi/run.jl b/examples/mpi/run.jl new file mode 100644 index 0000000..ffcbf34 --- /dev/null +++ b/examples/mpi/run.jl @@ -0,0 +1,23 @@ +using Random +using Distributed +using MPI + + +function run_trial(config::Dict{Symbol,Any}, trial_id) + results = Dict{Symbol, Any}() + sigma = config[:sigma] + N = config[:N] + seed = config[:seed] + rng = Random.Xoshiro(seed) + # Perform some calculation + results[:distance] = sum(rand(rng) * sigma for _ in 1:N) + results[:num_threads] = Threads.nthreads() + + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + + results[:mpi_worker] = rank + + # Must return a Dict{Symbol, Any}, with the data we want to save + return results +end \ No newline at end of file diff --git a/ext/MPIExt/MPIExt.jl b/ext/MPIExt/MPIExt.jl index 7de5c75..9321cab 100644 --- a/ext/MPIExt/MPIExt.jl +++ b/ext/MPIExt/MPIExt.jl @@ -25,7 +25,7 @@ function Experimenter.Cluster.init_mpi() end if rank == 0 - @info "Initialised MPI with $(comm_size) workers." + @info "Initialised MPI with $(comm_size-1) workers and 1 coordinator." end end @@ -38,8 +38,6 @@ function Experimenter._mpi_run_job(runner::Experimenter.Runner, trials::Abstract else @warn "[WORKER $(rank)] Reached a function that it should not be able to reach." end - # TODO: Add a way to override this finalise? - MPI.Finalize() end function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) @@ -62,8 +60,6 @@ function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) @debug "[WORKER $(rank)] Loaded." - MPI.Barrier(comm) - while !worker.has_stopped @debug "[WORKER $(rank)] Loaded." send_variable_message(worker.comm, job_request, 0; should_block = true) @@ -71,10 +67,10 @@ function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) handle_response!(worker, response) end - MPI.Barrier(comm) + MPI.Finalize() end -function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trails::AbstractArray{Experimenter.Trial}) +function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trials::AbstractArray{Experimenter.Trial}) comm = MPI.COMM_WORLD rank = MPI.Comm_rank(comm) if rank != 0 @@ -83,7 +79,7 @@ function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trails comm_size = MPI.Comm_size(comm) coordinator = Coordinator( comm, - length(experiments), + length(trials), 1, 0, 0, @@ -93,8 +89,6 @@ function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trails db ) - MPI.Barrier(comm) - @info "[COORDINATOR] $(comm_size - 1) workers ready. Starting experiment with $(length(trials)) trials." while coordinator.num_workers_closed != coordinator.num_workers @@ -105,10 +99,7 @@ function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trails @info "[COORDINATOR] Finished." - MPI.Barrier(comm) - - @info "[COORDINATOR] All workers finished." - + MPI.Finalize() end end \ No newline at end of file diff --git a/ext/MPIExt/utils.jl b/ext/MPIExt/utils.jl index 10929c7..8dbc609 100644 --- a/ext/MPIExt/utils.jl +++ b/ext/MPIExt/utils.jl @@ -1,6 +1,6 @@ using Serialization using Logging - +import Base: UUID function send_variable_message(comm, data, dest; tag=MPI.ANY_TAG, should_block=false) if tag == MPI.ANY_TAG # Override any tag @@ -68,7 +68,7 @@ end function handle_response!(worker::WorkerNode, ::NoMoreJobsResponse) worker.has_stopped = true - @info "[WORKER $(worker.mpi_rank)] Finished." + @debug "[WORKER $(worker.mpi_rank)] Finished." nothing end function handle_response!(worker::WorkerNode, response::JobResponse) @@ -78,7 +78,7 @@ function handle_response!(worker::WorkerNode, response::JobResponse) return (trial_id, result) end save_req = SaveRequest(worker.mpi_rank, results) - send_variable_message(comm, save_req, 0) + send_variable_message(worker.comm, save_req, 0) @debug "[WORKER $(worker.mpi_rank)] Completed $(response.num_jobs) jobs." nothing end diff --git a/src/Experimenter.jl b/src/Experimenter.jl index 7d5b9e7..09d5c42 100644 --- a/src/Experimenter.jl +++ b/src/Experimenter.jl @@ -7,124 +7,9 @@ include("database.jl") include("heterogeneous_mapper.jl") include("runner.jl") +# Add support for cluster execution +include("cluster.jl") -module Cluster - function init_slurm_support() - @eval Main using ClusterManagers - if isdefined(Base, :get_extension) - @eval Main Base.retry_load_extensions() - end - end - function install_slurm_support() - @eval Main import Pkg - @eval Main Pkg.add(["ClusterManagers"]) - end - function init_mpi_support() - @eval Main using MPI - if isdefined(Base, :get_extension) - @eval Main Base.retry_load_extensions() - end - end - function install_mpi_support() - @eval Main import Pkg - @eval Main Pkg.add(["MPI"]) - end - - function _can_import_mpi() - try - import MPI - return true - catch - return false - end - end - - function _try_detect_mpi() - haskey(ENV, "OMPI_COMM_WORLD_RANK") && return true - haskey(ENV, "PMI_RANK") && return true - haskey(ENV, "MV2_COMM_WORLD_RANK") && return true - return false - end - - """ - init(; kwargs...) - - Checks the environment variables to see if a script is running on a cluster - and then launches the processes as determined by the environment variables. - - # Arguments - - The keyword arguments are forwarded to the init function for each cluster - management system. Check the `ext` folder for extensions to see which - keywords are supported. - """ - function init(; force_mpi=false, kwargs...) - if _can_import_mpi() - @eval Main Experimenter.Cluster.init_mpi_support() - if force_mpi || _try_detect_mpi() - @eval Main Experimenter.Cluster.init_mpi(; $(kwargs)...) - end - end - if haskey(ENV, "SLURM_JOB_NAME") - @eval Main Experimenter.Cluster.init_slurm_support() - @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...) - else - @info "Cluster not detected, doing nothing." - end - end - - """ - create_slurm_template(file_loc; job_logs_dir="hpc/logs") - - Creates a template bash script at the supplied file location and - creates the log directory used for the outputs. You should modify - this script to adjust the resources required. - """ - function create_slurm_template(file_loc::AbstractString; - job_logs_dir::AbstractString="hpc/logs") - - log_dir = joinpath(dirname(file_loc), job_logs_dir) - if !isdir(log_dir) && isdirpath(log_dir) - @info "Creating directory at $log_dir to store the log files" - mkdir(log_dir) - end - - - file_contents = """#!/bin/bash - - #SBATCH --nodes=1 - #SBATCH --ntasks=1 - #SBATCH --cpus-per-task=2 - #SBATCH --mem-per-cpu=1024 - #SBATCH --time=00:30:00 - #SBATCH -o $log_dir/job_%j.out - #SBATCH --partition=compute - - # Change below to load version of Julia used - module load julia - - # Change directory if needed - # cd "experiments" - - julia --project myscript.jl --threads=1 - - # Optional: Remove the files created by ClusterManagers.jl - # rm -fr julia-*.out - """ - - open(file_loc, "w") do io - print(io, file_contents) - end - - @info "Wrote template file to $(abspath(file_loc))" - - nothing - end - function init_slurm end - function init_mpi end - - export init, install_slurm_support, init_slurm_support -end using PackageExtensionCompat function __init__() @@ -153,7 +38,7 @@ export complete_trial!, complete_trial_in_global_database, mark_trial_as_incompl ### Execution export execute_trial, execute_trial_and_save_to_db_async, get_global_store export @execute -export SerialMode, MultithreadedMode, DistributedMode, HeterogeneousMode +export SerialMode, MultithreadedMode, DistributedMode, HeterogeneousMode, MPIMode ### Snapshots export Snapshot diff --git a/src/cluster.jl b/src/cluster.jl new file mode 100644 index 0000000..7f4db38 --- /dev/null +++ b/src/cluster.jl @@ -0,0 +1,117 @@ +module Cluster + function init_slurm_support() + @eval Main using ClusterManagers + if isdefined(Base, :get_extension) + @eval Main Base.retry_load_extensions() + end + end + function install_slurm_support() + @eval Main import Pkg + @eval Main Pkg.add(["ClusterManagers"]) + end + function init_mpi_support() + @eval Main using MPI + if isdefined(Base, :get_extension) + @eval Main Base.retry_load_extensions() + end + end + function install_mpi_support() + @eval Main import Pkg + @eval Main Pkg.add(["MPI"]) + end + + function _try_detect_mpi() + haskey(ENV, "OMPI_COMM_WORLD_RANK") && return true + haskey(ENV, "PMI_RANK") && return true + haskey(ENV, "MV2_COMM_WORLD_RANK") && return true + return false + end + + + function _is_master_node() + if _try_detect_mpi() + haskey(ENV, "OMPI_COMM_WORLD_RANK") && return parse(Int, ENV["OMPI_COMM_WORLD_RANK"]) == 0 + haskey(ENV, "PMI_RANK") && return parse(Int, ENV["PMI_RANK"]) == 0 + haskey(ENV, "MV2_COMM_WORLD_RANK") && return parse(Int, ENV["MV2_COMM_WORLD_RANK"]) == 0 + end + + return true + end + + """ + init(; kwargs...) + + Checks the environment variables to see if a script is running on a cluster + and then launches the processes as determined by the environment variables. + + # Arguments + + The keyword arguments are forwarded to the init function for each cluster + management system. Check the `ext` folder for extensions to see which + keywords are supported. + """ + function init(; force_mpi=false, force_slurm=false, kwargs...) + (force_mpi && force_slurm) && error("Must set only one of `force_mpi` and `force_slurm` to true at a time.") + if !force_slurm && (force_mpi || _try_detect_mpi()) + @eval Main Experimenter.Cluster.init_mpi_support() + @eval Main Experimenter.Cluster.init_mpi(; $(kwargs)...) + elseif force_slurm || haskey(ENV, "SLURM_JOB_NAME") + @eval Main Experimenter.Cluster.init_slurm_support() + @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...) + else + @info "Cluster not detected, doing nothing." + end + end + + """ + create_slurm_template(file_loc; job_logs_dir="hpc/logs") + + Creates a template bash script at the supplied file location and + creates the log directory used for the outputs. You should modify + this script to adjust the resources required. + """ + function create_slurm_template(file_loc::AbstractString; + job_logs_dir::AbstractString="hpc/logs") + + log_dir = joinpath(dirname(file_loc), job_logs_dir) + if !isdir(log_dir) && isdirpath(log_dir) + @info "Creating directory at $log_dir to store the log files" + mkdir(log_dir) + end + + + file_contents = """#!/bin/bash + + #SBATCH --nodes=1 + #SBATCH --ntasks=1 + #SBATCH --cpus-per-task=2 + #SBATCH --mem-per-cpu=1024 + #SBATCH --time=00:30:00 + #SBATCH -o $log_dir/job_%j.out + #SBATCH --partition=compute + + # Change below to load version of Julia used + module load julia + + # Change directory if needed + # cd "experiments" + + julia --project myscript.jl --threads=1 + + # Optional: Remove the files created by ClusterManagers.jl + # rm -fr julia-*.out + """ + + open(file_loc, "w") do io + print(io, file_contents) + end + + @info "Wrote template file to $(abspath(file_loc))" + + nothing + end + function init_slurm end + function init_mpi end + + export init, install_slurm_support, init_slurm_support +end \ No newline at end of file diff --git a/src/database.jl b/src/database.jl index 858de4e..0c116c1 100644 --- a/src/database.jl +++ b/src/database.jl @@ -103,7 +103,12 @@ If the database already exists, it will open it and not overwrite the existing d Setting `in_memory` to `true` will skip all of the arguments and create the database "in memory" and hence, will not persist. """ -function open_db(database_name, experiment_folder=joinpath(pwd(), "experiments"), create_folder=true; in_memory=false)::ExperimentDatabase +function open_db(database_name, experiment_folder=joinpath(pwd(), "experiments"), create_folder=true; in_memory=false)::Union{Nothing, ExperimentDatabase} + + if !Cluster._is_master_node() + return nothing + end + if !in_memory && (!Base.Filesystem.isdir(experiment_folder)) if create_folder @info "Creating $experiment_folder for experiments folder." diff --git a/src/experiment.jl b/src/experiment.jl index db582d1..4436024 100644 --- a/src/experiment.jl +++ b/src/experiment.jl @@ -161,7 +161,7 @@ Base.@kwdef struct Trial has_finished::Bool = false end -function count_trails(experiment::Experiment) +function count_trials(experiment::Experiment) return count_trials(experiment.configuration) end @@ -240,5 +240,5 @@ function Base.iterate(experiment::Experiment, state) return trial, next_state end -Base.length(experiment::Experiment) = count_trails(experiment) +Base.length(experiment::Experiment) = count_trials(experiment) Base.eltype(::Experiment) = Trial \ No newline at end of file diff --git a/src/runner.jl b/src/runner.jl index 4203c64..7f34851 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -28,7 +28,6 @@ import .ExecutionModes: _MultithreadedModeSingleton as MultithreadedMode import .ExecutionModes: _DistributedModeSingleton as DistributedMode import .ExecutionModes: HeterogeneousMode import .ExecutionModes: MPIMode -include("../ext/MPIExt/utils.jl") @doc raw"Executes the trials of the experiment one of the other, sequentially." SerialMode @doc raw"Executes the trials of the experiment in parallel using `Threads.@Threads`" MultithreadedMode @@ -39,7 +38,7 @@ include("../ext/MPIExt/utils.jl") # Function calls to be overwritten """ - _mpi_begin_job(runner::Runner, trials::AbstractArray{Trial}) + _mpi_run_job(runner::Runner, trials::AbstractArray{Trial}) Executes the MPI process that becomes a coordinator or a worker, depending on the rank. """ @@ -55,7 +54,7 @@ const global_store = Ref{Union{Missing, Store}}(missing) Base.@kwdef struct Runner execution_mode::ExecutionModes.AbstractExecutionMode experiment::Experiment - database::ExperimentDatabase + database::Union{ExperimentDatabase, Nothing} end """ @@ -70,31 +69,34 @@ directory: Directory to change the current process (or worker processes) to for """ macro execute(experiment, database, mode=SerialMode, use_progress=false, directory=pwd()) quote - $(esc(experiment)) = restore_from_db($(esc(database)), $(esc(experiment))) + if !isnothing($(esc(database))) + $(esc(experiment)) = restore_from_db($(esc(database)), $(esc(experiment))) + end let runner = Runner(experiment=$(esc(experiment)), database=$(esc(database)), execution_mode=$(esc(mode))) is_mpi_worker = false if runner.execution_mode isa MPIMode - comm = MPI.COMM_WORLD - rank = MPI.Comm_rank(comm) - if rank > 0 + if !Cluster._is_master_node() is_mpi_worker = true # Load the trial code straight away dir = $(esc(directory)) cd(dir) include_file = runner.experiment.include_file - include_file_path = joinpath(dir, include_file) if !ismissing(include_file) + include_file_path = joinpath(dir, include_file) Base.include(Main, "$include_file_path") end - # Run the work loop - fn = eval(Main, Meta.parse(runner.experiment.function_name)) + code = Meta.parse(runner.experiment.function_name) + fn = Base.eval(Main, code) _mpi_worker_loop(runner.execution_mode.batch_jobs, fn) end end if !is_mpi_worker + if isnothing(runner.database) + error("The database supplied has not been initialised!") + end push!(runner.database, runner.experiment) existing_trials = get_trials(runner.database, runner.experiment.id) @@ -122,8 +124,8 @@ macro execute(experiment, database, mode=SerialMode, use_progress=false, directo cd(dir) include_file = runner.experiment.include_file - include_file_path = joinpath(dir, include_file) if !ismissing(include_file) + include_file_path = joinpath(dir, include_file) if requires_distributed(runner.execution_mode) code = Meta.parse("Base.include(Main, raw\"$include_file_path\")") includes_calls = [remotecall(Base.eval, i, code) for i in workers()] @@ -339,7 +341,7 @@ function run_trials(runner::Runner, trials::AbstractArray{Trial}; use_progress=f complete_trial!(runner.database, id, results) end elseif execution_mode isa MPIMode - _mpi_run_job() + _mpi_run_job(runner, trials) else @info "Running $(length(trials)) trials" for trial in iter From e869a664d8608c67c654d8e18cf3a1ae1c991c56 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 15:27:38 +0100 Subject: [PATCH 03/11] Added code for loading an initial store --- ext/MPIExt/MPIExt.jl | 18 +++++++++++++++--- src/runner.jl | 8 ++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/ext/MPIExt/MPIExt.jl b/ext/MPIExt/MPIExt.jl index 9321cab..497aae1 100644 --- a/ext/MPIExt/MPIExt.jl +++ b/ext/MPIExt/MPIExt.jl @@ -40,13 +40,17 @@ function Experimenter._mpi_run_job(runner::Experimenter.Runner, trials::Abstract end end -function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) +function Experimenter._mpi_worker_loop(runner::Experimenter.Runner) + batch_size = runner.execution_mode.batch_size + trial_fn = Base.eval(Main, Meta.parse(runner.experiment.function_name)) + comm = MPI.COMM_WORLD rank = MPI.Comm_rank(comm) if rank == 0 error("The first process is reserved for the coordinator and cannot be a worker.") end + # Initialise the state of the worker worker = WorkerNode( comm, rank, @@ -55,8 +59,16 @@ function Experimenter._mpi_worker_loop(batch_size::Int, trial_fn::Function) trial_fn ) - job_size = batch_size - job_request = JobRequest(worker.mpi_rank, job_size) + job_request = JobRequest(worker.mpi_rank, batch_size) + + if !ismissing(runner.experiment.init_store_function_name) + @debug "[WORKER $(rank)] Initialising the global store" + init_fn_name = runner.experiment.init_store_function_name + experiment_config = runner.experiment.configuration + + construct_store(init_fn_name, experiment_config) + end + @debug "[WORKER $(rank)] Loaded." diff --git a/src/runner.jl b/src/runner.jl index 7f34851..939503a 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -14,7 +14,7 @@ module ExecutionModes threads_per_node::Int end struct MPIMode <: AbstractExecutionMode - batch_jobs::Int + batch_size::Int end const _SerialModeSingleton = SerialMode() @@ -89,7 +89,7 @@ macro execute(experiment, database, mode=SerialMode, use_progress=false, directo code = Meta.parse(runner.experiment.function_name) fn = Base.eval(Main, code) - _mpi_worker_loop(runner.execution_mode.batch_jobs, fn) + _mpi_worker_loop(runner.execution_mode.batch_size, fn) end end @@ -150,13 +150,13 @@ globally per process. This can be used to initialise a shared database. The store is intended to be read-only. """ function construct_store(function_name::AbstractString, configuration) - fn = Base.eval(Main, Meta.parse("$function_name")) + fn = Base.eval(Main, Meta.parse(function_name)) store_data = fn(configuration) # ToDo add a potential lock here? This should only be called once per process. global_store[] = Store(store_data) return nothing end -construct_store(::Missing, ::Any) = Store() # Construct an empty store +construct_store(::Missing, ::Any) = nothing # Construct an empty store """ get_global_store() From 9dac86382bdf0341d1b9dc7e07b68a0e10f07db6 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 15:38:29 +0100 Subject: [PATCH 04/11] Updated snapshots to use time_ns instead --- src/database.jl | 4 ++-- src/snapshots.jl | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/database.jl b/src/database.jl index 0c116c1..1835566 100644 --- a/src/database.jl +++ b/src/database.jl @@ -41,7 +41,7 @@ function Base.push!(db::ExperimentDatabase, trial::Trial) nothing end function Base.push!(db::ExperimentDatabase, snapshot::Snapshot) - vs = (string(snapshot.id), string(snapshot.trial_id), snapshot.state, snapshot.label) + vs = (string(snapshot.id), string(snapshot.trial_id), snapshot.state, snapshot.label, snapshot.created_at) SQLite.execute(db._snapshotInsertStmt, vs) nothing end @@ -324,7 +324,7 @@ end Saves the snapshot with given `state` in the database, associating with the trial with matching `trial_id`. Automatically saves the time of the snapshot. """ function save_snapshot!(db::ExperimentDatabase, trial_id::UUID, state::Dict{Symbol,Any}, label=missing) - snapshot = Snapshots.Snapshot(trial_id=trial_id, state=state, label=label) + snapshot = Snapshots.Snapshot(trial_id=trial_id, state=state, label=label, created_at=time_ns()) push!(db, snapshot) nothing end diff --git a/src/snapshots.jl b/src/snapshots.jl index dc2a77e..fb73522 100644 --- a/src/snapshots.jl +++ b/src/snapshots.jl @@ -4,12 +4,12 @@ using Base using DataFrames using SQLite -Base.@kwdef struct Snapshot{L<:Union{Missing,AbstractString}, D<:Union{Missing,AbstractString}} +Base.@kwdef struct Snapshot{L<:Union{Missing,AbstractString}} id::UUID = uuid4() trial_id::UUID state::Dict{Symbol, Any} label::L = missing - created_at::D = missing + created_at::UInt64 end const snapshot_table_query = raw""" @@ -18,7 +18,7 @@ CREATE TABLE IF NOT EXISTS Snapshots ( trial_id TEXT NOT NULL, state BLOB, label TEXT, - created_at DATETIME DEFAULT (strftime('%Y-%m-%d %H:%M:%f', 'now')), + created_at INTEGER, FOREIGN KEY (trial_id) REFERENCES Trials (id) ON DELETE NO ACTION ON UPDATE NO ACTION ); @@ -26,7 +26,7 @@ CREATE TABLE IF NOT EXISTS Snapshots ( function get_snapshot_insert_stmt(db::SQLite.DB) sql = raw""" - INSERT INTO Snapshots (id, trial_id, state, label) VALUES (?, ?, ?, ?) + INSERT INTO Snapshots (id, trial_id, state, label, created_at) VALUES (?, ?, ?, ?, ?) """ return SQLite.Stmt(db, sql) end From 8435bc25bb24d6f20b03cfd3bf7e847c96cf54ce Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 15:42:30 +0100 Subject: [PATCH 05/11] Swapped to using the time since the unix epoch (as a float) --- src/database.jl | 2 +- src/snapshots.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/database.jl b/src/database.jl index 1835566..b32ed96 100644 --- a/src/database.jl +++ b/src/database.jl @@ -324,7 +324,7 @@ end Saves the snapshot with given `state` in the database, associating with the trial with matching `trial_id`. Automatically saves the time of the snapshot. """ function save_snapshot!(db::ExperimentDatabase, trial_id::UUID, state::Dict{Symbol,Any}, label=missing) - snapshot = Snapshots.Snapshot(trial_id=trial_id, state=state, label=label, created_at=time_ns()) + snapshot = Snapshots.Snapshot(trial_id=trial_id, state=state, label=label, created_at=time()) push!(db, snapshot) nothing end diff --git a/src/snapshots.jl b/src/snapshots.jl index fb73522..28529dc 100644 --- a/src/snapshots.jl +++ b/src/snapshots.jl @@ -9,7 +9,7 @@ Base.@kwdef struct Snapshot{L<:Union{Missing,AbstractString}} trial_id::UUID state::Dict{Symbol, Any} label::L = missing - created_at::UInt64 + created_at::REAL end const snapshot_table_query = raw""" @@ -18,7 +18,7 @@ CREATE TABLE IF NOT EXISTS Snapshots ( trial_id TEXT NOT NULL, state BLOB, label TEXT, - created_at INTEGER, + created_at REAL NOT NULL, FOREIGN KEY (trial_id) REFERENCES Trials (id) ON DELETE NO ACTION ON UPDATE NO ACTION ); From b83966d4ad4183e27e62fd47467c003353dfedc2 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:24:00 +0100 Subject: [PATCH 06/11] Added support for snapshots and results gathering in mpi worker nodes --- ext/MPIExt/MPIExt.jl | 31 +++++++++++++++++++++++++++++ ext/MPIExt/utils.jl | 46 ++++++++++++++++++++++++++++++++++++++++++++ src/cluster.jl | 11 ++++++++++- src/database.jl | 2 +- src/runner.jl | 19 ++++++++++++++++-- src/snapshots.jl | 2 +- 6 files changed, 106 insertions(+), 5 deletions(-) diff --git a/ext/MPIExt/MPIExt.jl b/ext/MPIExt/MPIExt.jl index 497aae1..19dbab7 100644 --- a/ext/MPIExt/MPIExt.jl +++ b/ext/MPIExt/MPIExt.jl @@ -114,4 +114,35 @@ function coordinator_loop(experiment::Experiment, db::ExperimentDatabase, trials MPI.Finalize() end +function Experimenter._mpi_anon_get_latest_snapshot(trial_id::UUID) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + + req = GetLatestSnapshotRequest(rank, trial_id) + send_variable_message(comm, req, 0; should_block=true) + + msg = recieve_variable_message(comm) + @assert typeof(msg) <: SnapshotResponse "Did not recieve the expected snapshot. Recieved $(typeof(msg)) instead." + + return msg.snapshot +end +function Experimenter._mpi_anon_save_snapshot(trial_id::UUID, state::Dict{Symbol, Any}, label::Union{Missing, String} = missing) + comm = MPI.COMM_WORLD + + req = SaveSnapshotRequest(trial_id, state, label) + send_variable_message(comm, req, 0; should_block=false) + return nothing +end +function Experimenter._mpi_anon_get_trial_results(trial_id::UUID) + comm = MPI.COMM_WORLD + rank = MPI.Comm_rank(comm) + req = GetResultsRequest(rank, trial_id) + send_variable_message(comm, req, 0; should_block=true) + + msg = recieve_variable_message(comm) + @assert typeof(msg) <: ResultsResponse "Did not recieve the results from coordinator. Recieved $(typeof(msg)) instead." + + return msg.results +end + end \ No newline at end of file diff --git a/ext/MPIExt/utils.jl b/ext/MPIExt/utils.jl index 8dbc609..e352cb1 100644 --- a/ext/MPIExt/utils.jl +++ b/ext/MPIExt/utils.jl @@ -25,6 +25,26 @@ struct JobRequest <: AbstractRequest from::Int num_jobs::Int end +struct GetLatestSnapshotRequest <: AbstractRequest + from::Int + trial_id::UUID +end +struct SaveSnapshotRequest <: AbstractRequest + trial_id::UUID + state::Dict{Symbol, Any} + label::Union{Missing, String} +end +struct GetResultsRequest <: AbstractRequest + from::Int + trial_id::UUID +end + +struct SnapshotResponse <: AbstractResponse + snapshot::Union{Missing, Experimenter.Snapshot} +end +struct ResultsResponse <: AbstractResponse + results::Union{Missing, Dict{Symbol, Any}} +end struct JobResponse num_jobs::Int @@ -38,6 +58,7 @@ end struct NoMoreJobsResponse <: AbstractResponse end + mutable struct Coordinator comm::MPI.Comm num_jobs::Int @@ -119,5 +140,30 @@ function handle_request!(coordinator::Coordinator, request::SaveRequest) Experimenter.complete_trial!(coordinator.database, trial_id, result) end + nothing +end + +# Snapshots +function handle_request!(coordinator::Coordinator, request::GetLatestSnapshotRequest) + @debug "[COORDINATOR] Recieved latest snapshot request from Worker $(request.from)." + + snapshot = Experimenter.latest_snapshot(coordinator.database, request.trial_id) + + send_variable_message(coordinator.comm, SnapshotResponse(snapshot), request.from) + nothing +end +function handle_request!(coordinator::Coordinator, request::SaveSnapshotRequest) + @debug "[COORDINATOR] Recieved save snapshot request from Worker $(request.from)." + + Experimenter.save_snapshot!(coordinator.database, request.trial_id, request.state, request.label) + nothing +end +function handle_request!(coordinator::Coordinator, request::GetResultsRequest) + @debug "[COORDINATOR] Recieved save snapshot request from Worker $(request.from)." + + trial = get_trial(coordinator.database, request.trial_id) + results = trial.results + + send_variable_message(coordinator.comm, ResultsResponse(results), request.from) nothing end \ No newline at end of file diff --git a/src/cluster.jl b/src/cluster.jl index 7f4db38..a566da4 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -28,7 +28,7 @@ module Cluster end - function _is_master_node() + function _is_master_mpi_node() if _try_detect_mpi() haskey(ENV, "OMPI_COMM_WORLD_RANK") && return parse(Int, ENV["OMPI_COMM_WORLD_RANK"]) == 0 haskey(ENV, "PMI_RANK") && return parse(Int, ENV["PMI_RANK"]) == 0 @@ -38,6 +38,15 @@ module Cluster return true end + function _is_mpi_worker_node() + if _try_detect_mpi() + is_master = is_master() + return !is_master + else + return false + end + end + """ init(; kwargs...) diff --git a/src/database.jl b/src/database.jl index b32ed96..e52b14e 100644 --- a/src/database.jl +++ b/src/database.jl @@ -105,7 +105,7 @@ Setting `in_memory` to `true` will skip all of the arguments and create the data """ function open_db(database_name, experiment_folder=joinpath(pwd(), "experiments"), create_folder=true; in_memory=false)::Union{Nothing, ExperimentDatabase} - if !Cluster._is_master_node() + if !Cluster._is_master_mpi_node() return nothing end diff --git a/src/runner.jl b/src/runner.jl index 939503a..f00200d 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -44,7 +44,9 @@ Executes the MPI process that becomes a coordinator or a worker, depending on th """ function _mpi_run_job end function _mpi_worker_loop end - +function _mpi_anon_save_snapshot end +function _mpi_anon_get_latest_snapshot end +function _mpi_anon_get_trial_results end # Global database const global_database = Ref{Union{Missing, ExperimentDatabase}}(missing) const global_database_lock = Ref{ReentrantLock}(ReentrantLock()) @@ -75,7 +77,7 @@ macro execute(experiment, database, mode=SerialMode, use_progress=false, directo let runner = Runner(experiment=$(esc(experiment)), database=$(esc(database)), execution_mode=$(esc(mode))) is_mpi_worker = false if runner.execution_mode isa MPIMode - if !Cluster._is_master_node() + if !Cluster._is_master_mpi_node() is_mpi_worker = true # Load the trial code straight away @@ -242,6 +244,10 @@ end Gets the results of a specific trial from the global database. Redirects to the master node if on a worker node. Locks to secure access. """ function get_results_from_trial_global_database(trial_id::UUID) + if _is_mpi_worker_node() # MPI + return _mpi_anon_get_trial_results(trial_id) + end + if myid() != 1 return remotecall_fetch(get_results_from_trial_global_database, 1, trial_id) end @@ -257,6 +263,11 @@ end Save the results of a specific trial from the global database, with the supplied `state` and optional `label`. Redirects to the master node if on a worker node. Locks to secure access. """ function save_snapshot_in_global_database(trial_id::UUID, state::Dict{Symbol,Any}, label=missing) + if _is_mpi_worker_node() # MPI + _mpi_anon_get_latest_snapshot(trial_id, state, label) + return nothing + end + # Redirect requests on worker nodes to the main node if myid() != 1 remotecall_wait(save_snapshot_in_global_database, 1, trial_id, state, label) @@ -274,6 +285,10 @@ end Same as `get_latest_snapshot`, but in the given global database. Redirects to the master worker if on a distributed node. Only works when using `@execute`. """ function get_latest_snapshot_from_global_database(trial_id::UUID) + if _is_mpi_worker_node() # MPI + return _mpi_anon_get_latest_snapshot(trial_id) + end + # Redirect requests on worker nodes to main node if myid() != 1 return remotecall_fetch(get_latest_snapshot_from_global_database, 1, trial_id) diff --git a/src/snapshots.jl b/src/snapshots.jl index 28529dc..5b4a808 100644 --- a/src/snapshots.jl +++ b/src/snapshots.jl @@ -9,7 +9,7 @@ Base.@kwdef struct Snapshot{L<:Union{Missing,AbstractString}} trial_id::UUID state::Dict{Symbol, Any} label::L = missing - created_at::REAL + created_at::Float64 end const snapshot_table_query = raw""" From 75d068a582c4fb3a6820ef76ff7ae20f8dc81bb3 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:38:18 +0100 Subject: [PATCH 07/11] Fixed bug in MPI tests --- Project.toml | 6 +++--- src/runner.jl | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index 98fdac5..cfc2e3f 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,6 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SQLite = "0aa819cd-b072-5ff4-a722-6bc24af294d9" -SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" @@ -33,13 +32,14 @@ Pkg = "1.6" ProgressBars = "1" Random = "1.6" SQLite = "1" -SafeTestsets = "0.0" Serialization = "1.6" UUIDs = "1.6" julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" + [targets] -test = ["Test"] +test = ["Test", "SafeTestsets"] diff --git a/src/runner.jl b/src/runner.jl index f00200d..10e763f 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -244,7 +244,7 @@ end Gets the results of a specific trial from the global database. Redirects to the master node if on a worker node. Locks to secure access. """ function get_results_from_trial_global_database(trial_id::UUID) - if _is_mpi_worker_node() # MPI + if Cluster._is_mpi_worker_node() # MPI return _mpi_anon_get_trial_results(trial_id) end @@ -263,7 +263,7 @@ end Save the results of a specific trial from the global database, with the supplied `state` and optional `label`. Redirects to the master node if on a worker node. Locks to secure access. """ function save_snapshot_in_global_database(trial_id::UUID, state::Dict{Symbol,Any}, label=missing) - if _is_mpi_worker_node() # MPI + if Cluster._is_mpi_worker_node() # MPI _mpi_anon_get_latest_snapshot(trial_id, state, label) return nothing end @@ -285,7 +285,7 @@ end Same as `get_latest_snapshot`, but in the given global database. Redirects to the master worker if on a distributed node. Only works when using `@execute`. """ function get_latest_snapshot_from_global_database(trial_id::UUID) - if _is_mpi_worker_node() # MPI + if Cluster._is_mpi_worker_node() # MPI return _mpi_anon_get_latest_snapshot(trial_id) end From 7b11753faa99dad0172fb34e4b0d75c99c4a7760 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:53:48 +0100 Subject: [PATCH 08/11] Fixed cluster runs --- src/cluster.jl | 2 +- src/runner.jl | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index a566da4..c237c8b 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -122,5 +122,5 @@ module Cluster function init_slurm end function init_mpi end - export init, install_slurm_support, init_slurm_support + export init, install_slurm_support, install_mpi_support, init_mpi_support, init_slurm_support, create_slurm_template end \ No newline at end of file diff --git a/src/runner.jl b/src/runner.jl index 10e763f..9b9bd78 100644 --- a/src/runner.jl +++ b/src/runner.jl @@ -89,9 +89,7 @@ macro execute(experiment, database, mode=SerialMode, use_progress=false, directo Base.include(Main, "$include_file_path") end - code = Meta.parse(runner.experiment.function_name) - fn = Base.eval(Main, code) - _mpi_worker_loop(runner.execution_mode.batch_size, fn) + _mpi_worker_loop(runner) end end From fcf21720385c8d1ef234f62791a9c4d966219ebc Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:54:05 +0100 Subject: [PATCH 09/11] Removed docs manifest toml --- docs/Manifest.toml | 395 --------------------------------------------- 1 file changed, 395 deletions(-) delete mode 100644 docs/Manifest.toml diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index 5461707..0000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,395 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.9.2" -manifest_format = "2.0" -project_hash = "ed6ce97ede4b2f3f9b4003b8bba0e6161365248c" - -[[deps.ANSIColoredPrinters]] -git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" -uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" -version = "0.0.1" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.Compat]] -deps = ["UUIDs"] -git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.7.0" -weakdeps = ["Dates", "LinearAlgebra"] - - [deps.Compat.extensions] - CompatLinearAlgebraExt = "LinearAlgebra" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+0" - -[[deps.Crayons]] -git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" -uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.1.1" - -[[deps.DBInterface]] -git-tree-sha1 = "9b0dc525a052b9269ccc5f7f04d5b3639c65bca5" -uuid = "a10d1c49-ce27-4219-8d33-6db1a4562965" -version = "2.5.0" - -[[deps.DataAPI]] -git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.15.0" - -[[deps.DataFrames]] -deps = ["Compat", "DataAPI", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "089d29c0fc00a190661517e4f3cba5dcb3fd0c08" -uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.6.0" - -[[deps.DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "cf25ccb972fec4e4817764d01c82386ae94f77b4" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.14" - -[[deps.DataValueInterfaces]] -git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" -uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" -version = "1.0.0" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.9.3" - -[[deps.Documenter]] -deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "39fd748a73dce4c05a9655475e437170d8fb1b67" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.27.25" - -[[deps.Downloads]] -deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -version = "1.6.0" - -[[deps.Experimenter]] -deps = ["DataFrames", "Distributed", "Logging", "Pkg", "ProgressBars", "Random", "SQLite", "SafeTestsets", "UUIDs"] -path = ".." -uuid = "6aee034a-9508-47b1-8e11-813cc29af79f" -version = "0.1.1" - -[[deps.FileWatching]] -uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" - -[[deps.Formatting]] -deps = ["Printf"] -git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8" -uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" -version = "0.4.2" - -[[deps.Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" - -[[deps.IOCapture]] -deps = ["Logging", "Random"] -git-tree-sha1 = "d75853a0bdbfb1ac815478bacd89cd27b550ace6" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.3" - -[[deps.InlineStrings]] -deps = ["Parsers"] -git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" -uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" -version = "1.4.0" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.InvertedIndices]] -git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" -uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" -version = "1.3.0" - -[[deps.IteratorInterfaceExtensions]] -git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" -uuid = "82899510-4779-5014-852e-03e436cf321d" -version = "1.0.0" - -[[deps.JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.4" - -[[deps.LaTeXStrings]] -git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" -uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -version = "1.3.0" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.3" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "7.84.0+0" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.10.2+0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.LinearAlgebra]] -deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" - -[[deps.Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.1.0" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2022.10.11" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.21+4" - -[[deps.OrderedCollections]] -git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.0" - -[[deps.Parsers]] -deps = ["Dates", "PrecompileTools", "UUIDs"] -git-tree-sha1 = "4b2e829ee66d4218e0cef22c0a64ee37cf258c29" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.7.1" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.2" - -[[deps.PooledArrays]] -deps = ["DataAPI", "Future"] -git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" -uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.4.2" - -[[deps.PrecompileTools]] -deps = ["Preferences"] -git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" -uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.1.2" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.0" - -[[deps.PrettyTables]] -deps = ["Crayons", "Formatting", "LaTeXStrings", "Markdown", "Reexport", "StringManipulation", "Tables"] -git-tree-sha1 = "331cc8048cba270591eab381e7aa3e2e3fef7f5e" -uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -version = "2.2.5" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.ProgressBars]] -deps = ["Printf"] -git-tree-sha1 = "9d84c8646109eb8bc7a006d59b157c64d5155c81" -uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568" -version = "1.5.0" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.Reexport]] -git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.2.2" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.SQLite]] -deps = ["DBInterface", "Random", "SQLite_jll", "Serialization", "Tables", "WeakRefStrings"] -git-tree-sha1 = "eb9a473c9b191ced349d04efa612ec9f39c087ea" -uuid = "0aa819cd-b072-5ff4-a722-6bc24af294d9" -version = "1.6.0" - -[[deps.SQLite_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "4619dd3363610d94fb42a95a6dc35b526a26d0ef" -uuid = "76ed43ae-9a5d-5a62-8c75-30186b810ce8" -version = "3.42.0+0" - -[[deps.SafeTestsets]] -deps = ["Test"] -git-tree-sha1 = "36ebc5622c82eb9324005cc75e7e2cc51181d181" -uuid = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" -version = "0.0.1" - -[[deps.SentinelArrays]] -deps = ["Dates", "Random"] -git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" -uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" -version = "1.4.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.SortingAlgorithms]] -deps = ["DataStructures"] -git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.1.1" - -[[deps.SparseArrays]] -deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[deps.Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -version = "1.9.0" - -[[deps.StringManipulation]] -git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" -uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" -version = "0.3.0" - -[[deps.SuiteSparse_jll]] -deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] -uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "5.10.1+6" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.TableTraits]] -deps = ["IteratorInterfaceExtensions"] -git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" -uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" -version = "1.0.1" - -[[deps.Tables]] -deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] -git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec" -uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.10.1" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -version = "1.10.0" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.WeakRefStrings]] -deps = ["DataAPI", "InlineStrings", "Parsers"] -git-tree-sha1 = "b1be2855ed9ed8eac54e5caff2afcdb442d52c23" -uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" -version = "1.4.2" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" - -[[deps.libblastrampoline_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+0" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.48.0+0" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" From 47b824842804f76d3c454a0ee89a33ffd3b9c43e Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:54:27 +0100 Subject: [PATCH 10/11] Updated default mpi experiment --- examples/mpi/.gitignore | 3 ++- examples/mpi/check_results.jl | 2 +- examples/mpi/my_experiment.jl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/mpi/.gitignore b/examples/mpi/.gitignore index ce7429b..74ac3c6 100644 --- a/examples/mpi/.gitignore +++ b/examples/mpi/.gitignore @@ -1,3 +1,4 @@ results/ *.out -LocalPreferences.toml \ No newline at end of file +LocalPreferences.toml +experiments/ \ No newline at end of file diff --git a/examples/mpi/check_results.jl b/examples/mpi/check_results.jl index 7779b6c..e0e22f1 100644 --- a/examples/mpi/check_results.jl +++ b/examples/mpi/check_results.jl @@ -1,5 +1,5 @@ using Experimenter -db = open_db("experiments.db", "results", false) +db = open_db("experiments.db") trials = get_trials_by_name(db, "Test Experiment") for (i, t) in enumerate(trials) diff --git a/examples/mpi/my_experiment.jl b/examples/mpi/my_experiment.jl index 187f0bc..f901949 100644 --- a/examples/mpi/my_experiment.jl +++ b/examples/mpi/my_experiment.jl @@ -11,7 +11,7 @@ experiment = Experiment( configuration=deepcopy(config) ) -db = open_db("experiments.db", "results", false) +db = open_db("experiments.db") # Init the cluster Experimenter.Cluster.init() From bf8dafdede5d16927de8094144cb76c8cd457873 Mon Sep 17 00:00:00 2001 From: Jamie Mair Date: Mon, 29 Apr 2024 16:54:32 +0100 Subject: [PATCH 11/11] Fixed docs --- docs/make.jl | 2 ++ docs/src/clusters.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 54d4585..79c6afa 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,6 +7,8 @@ makedocs(; modules=[Experimenter], authors="Jamie Mair and contributors", sitename="Experimenter.jl", + checkdocs=:exports, + warnonly=true, format=Documenter.HTML(; prettyurls=get(ENV, "CI", "false") == "true", canonical="https://JamieMair.github.io/Experimenter.jl", diff --git a/docs/src/clusters.md b/docs/src/clusters.md index 9992ed1..63963e7 100644 --- a/docs/src/clusters.md +++ b/docs/src/clusters.md @@ -18,7 +18,7 @@ Normally when running on SLURM, one creates a bash script to tell the scheduler #SBATCH -o hpc/output/test_job_%j.out ``` -The function [`Experimenter.Cluster.create_slurm_template`](@ref) provides an easy way to create one of these bash scripts with everything you need to run. +The function [`Experimenter.Cluster.create_slurm_template`] provides an easy way to create one of these bash scripts with everything you need to run. ### Example