Skip to content

Commit 8522bfb

Browse files
authored
Merge pull request #6478 from wthrowe/checkpoint_test_failure
2 parents ce61833 + 782b107 commit 8522bfb

File tree

9 files changed

+107
-104
lines changed

9 files changed

+107
-104
lines changed

cmake/RunInputFileTest.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ for expected_code in $4 ; do
3535
restart=$(expr $restart + 1)
3636
fi
3737
if [ $exit_code -ne $expected_code ]; then
38+
echo "ERROR: Exited with ${exit_code} instead of ${expected_code}" >&2
3839
exit 1
3940
fi
4041
done

src/Parallel/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ spectre_target_sources(
1313
PRIVATE
1414
ArrayComponentId.cpp
1515
CharmRegistration.cpp
16+
ExitCode.cpp
1617
InitializationFunctions.cpp
1718
NodeLock.cpp
1819
Phase.cpp

src/Parallel/ExitCode.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Distributed under the MIT License.
2+
// See LICENSE.txt for details.
3+
4+
#include "Parallel/ExitCode.hpp"
5+
6+
#include <ostream>
7+
#include <type_traits>
8+
9+
#include "Utilities/ErrorHandling/Error.hpp"
10+
11+
namespace Parallel {
12+
std::ostream& operator<<(std::ostream& os, const ExitCode& code) {
13+
os << static_cast<std::underlying_type_t<ExitCode>>(code);
14+
switch (code) {
15+
case Parallel::ExitCode::Complete:
16+
return os << " (Complete)";
17+
case Parallel::ExitCode::Abort:
18+
return os << " (Abort)";
19+
case Parallel::ExitCode::ContinueFromCheckpoint:
20+
return os << " (ContinueFromCheckpoint)";
21+
default:
22+
ERROR("Unknown exit code: "
23+
<< static_cast<std::underlying_type_t<ExitCode>>(code));
24+
}
25+
}
26+
} // namespace Parallel

src/Parallel/ExitCode.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
#pragma once
55

6+
#include <iosfwd>
7+
68
#include "DataStructures/DataBox/Tag.hpp"
79

810
namespace Parallel {
@@ -23,6 +25,8 @@ enum class ExitCode : int {
2325
ContinueFromCheckpoint = 2
2426
};
2527

28+
std::ostream& operator<<(std::ostream& os, const ExitCode& code);
29+
2630
namespace Tags {
2731

2832
/*!

src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,6 @@ std::optional<Parallel::Phase> RestartPhase::combine_method::operator()(
2121
"arbitration in the Main chare, so no reduction data should be "
2222
"provided.");
2323
}
24-
25-
std::optional<double> WallclockHoursAtCheckpoint::combine_method::operator()(
26-
const std::optional<double> /*first_time*/,
27-
const std::optional<double>& /*second_time*/) {
28-
ERROR(
29-
"The wallclock time at which a checkpoint was requested should "
30-
"only be altered by the phase change arbitration in the Main "
31-
"chare, so no reduction data should be provided.");
32-
}
3324
} // namespace Tags
3425

3526
CheckpointAndExitAfterWallclock::CheckpointAndExitAfterWallclock(

src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.hpp

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,6 @@ struct RestartPhase {
4646
using main_combine_method = combine_method;
4747
};
4848

49-
/// Storage in the phase change decision tuple so that the Main chare can record
50-
/// the elapsed wallclock time since the start of the run.
51-
///
52-
/// \note This tag is not intended to participate in any of the reduction
53-
/// procedures, so will error if the combine method is called.
54-
struct WallclockHoursAtCheckpoint {
55-
using type = std::optional<double>;
56-
57-
struct combine_method {
58-
[[noreturn]] std::optional<double> operator()(
59-
const std::optional<double> /*first_time*/,
60-
const std::optional<double>& /*second_time*/);
61-
};
62-
using main_combine_method = combine_method;
63-
};
64-
6549
/// Stores whether the checkpoint and exit has been requested.
6650
///
6751
/// Combinations are performed via `funcl::Or`, as the phase in question should
@@ -146,8 +130,7 @@ struct CheckpointAndExitAfterWallclock : public PhaseChange {
146130
using return_tags = tmpl::list<>;
147131

148132
using phase_change_tags_and_combines =
149-
tmpl::list<Tags::RestartPhase, Tags::WallclockHoursAtCheckpoint,
150-
Tags::CheckpointAndExitRequested>;
133+
tmpl::list<Tags::RestartPhase, Tags::CheckpointAndExitRequested>;
151134

152135
template <typename Metavariables>
153136
using participating_components = typename Metavariables::component_list;
@@ -174,15 +157,22 @@ struct CheckpointAndExitAfterWallclock : public PhaseChange {
174157

175158
private:
176159
std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
160+
// This flag is set during arbitration when the class decides to
161+
// halt the run. As it is not checkpointed, this distinguishes the
162+
// state immediately after writing the checkpoint from that
163+
// immediately after reading it during the restart.
164+
//
165+
// Phase arbitration is only run from Main, so there are no
166+
// threading issues here.
167+
// NOLINTNEXTLINE(spectre-mutable)
168+
mutable bool halting_ = false;
177169
};
178170

179171
template <typename... DecisionTags>
180172
void CheckpointAndExitAfterWallclock::initialize_phase_data_impl(
181173
const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
182174
phase_change_decision_data) const {
183175
tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt;
184-
tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) =
185-
std::nullopt;
186176
tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
187177
false;
188178
}
@@ -214,21 +204,12 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
214204

215205
auto& restart_phase =
216206
tuples::get<Tags::RestartPhase>(*phase_change_decision_data);
217-
auto& wallclock_hours_at_checkpoint =
218-
tuples::get<Tags::WallclockHoursAtCheckpoint>(
219-
*phase_change_decision_data);
220207
auto& exit_code =
221208
tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
222209
if (restart_phase.has_value()) {
223-
ASSERT(wallclock_hours_at_checkpoint.has_value(),
224-
"Consistency error: Should have recorded the Wallclock time "
225-
"while recording a phase to restart from.");
226210
// This `if` branch, where restart_phase has a value, is the
227-
// post-checkpoint call to arbitrate_phase_change. Depending on the time
228-
// elapsed so far in this run, next phase is...
229-
// - Exit, if the time is large
230-
// - restart_phase, if the time is small
231-
if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
211+
// post-checkpoint call to arbitrate_phase_change.
212+
if (halting_) {
232213
// Preserve restart_phase for use after restarting from the checkpoint
233214
exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
234215
return std::make_pair(Parallel::Phase::Exit,
@@ -245,7 +226,6 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
245226
// Reset restart_phase until it is needed for the next checkpoint
246227
const auto result = restart_phase;
247228
restart_phase.reset();
248-
wallclock_hours_at_checkpoint.reset();
249229
return std::make_pair(result.value(),
250230
ArbitrationStrategy::PermitAdditionalJumps);
251231
}
@@ -260,7 +240,8 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
260240
std::numeric_limits<double>::infinity())) {
261241
// Record phase and actual elapsed time for determining following phase
262242
restart_phase = current_phase;
263-
wallclock_hours_at_checkpoint = elapsed_hours;
243+
ASSERT(not halting_, "Halting for checkpoint recursively");
244+
halting_ = true;
264245
return std::make_pair(Parallel::Phase::WriteCheckpoint,
265246
ArbitrationStrategy::RunPhaseImmediately);
266247
}

tests/Unit/Parallel/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ set(LIBRARY "Test_Parallel")
165165
set(LIBRARY_SOURCES
166166
Test_ArrayComponentId.cpp
167167
Test_DomainDiagnosticInfo.cpp
168+
Test_ExitCode.cpp
168169
Test_GlobalCacheDataBox.cpp
169170
Test_InboxInserters.cpp
170171
Test_MemoryMonitor.cpp

tests/Unit/Parallel/PhaseControl/Test_CheckpointAndExitAfterWallclock.cpp

Lines changed: 44 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "DataStructures/DataBox/DataBox.hpp"
1010
#include "Framework/TestCreation.hpp"
11+
#include "Framework/TestHelpers.hpp"
1112
#include "Options/Protocols/FactoryCreation.hpp"
1213
#include "Parallel/ExitCode.hpp"
1314
#include "Parallel/GlobalCache.hpp"
@@ -56,14 +57,12 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
5657
{
5758
INFO("Test initialize phase change decision data");
5859
PhaseChangeDecisionData phase_change_decision_data{
59-
Parallel::Phase::Execute, true, 1.0, true,
60-
Parallel::ExitCode::Complete};
60+
Parallel::Phase::Execute, true, true, Parallel::ExitCode::Complete};
6161
phase_change0.initialize_phase_data<Metavariables>(
6262
make_not_null(&phase_change_decision_data));
63-
// extra parens in the check prevent Catch from trying to stream the tuple
64-
CHECK((phase_change_decision_data ==
65-
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
66-
Parallel::ExitCode::Complete}));
63+
CHECK(phase_change_decision_data ==
64+
PhaseChangeDecisionData{std::nullopt, false, true,
65+
Parallel::ExitCode::Complete});
6766
}
6867
{
6968
INFO("Wallclock time < big trigger time");
@@ -72,93 +71,76 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
7271
// the PhaseChange with a big trigger time.
7372
// (this assumes the test doesn't take 1h to get here)
7473
PhaseChangeDecisionData phase_change_decision_data{
75-
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
74+
std::nullopt, true, true, Parallel::ExitCode::Complete};
7675
const auto decision_result = phase_change1.arbitrate_phase_change(
7776
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
7877
cache);
79-
CHECK((decision_result == std::nullopt));
80-
CHECK((phase_change_decision_data ==
81-
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
82-
Parallel::ExitCode::Complete}));
78+
CHECK(decision_result == std::nullopt);
79+
CHECK(phase_change_decision_data ==
80+
PhaseChangeDecisionData{std::nullopt, false, true,
81+
Parallel::ExitCode::Complete});
8382
}
8483
{
8584
INFO("Wallclock time > small trigger time");
8685
// Now check case where wallclock time > trigger wallclock time, using
8786
// the PhaseChange with a tiny trigger time.
8887
// (this assumes the test takes at least a few cycles to get here)
8988
PhaseChangeDecisionData phase_change_decision_data{
90-
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
89+
std::nullopt, true, true, Parallel::ExitCode::Complete};
9190
const auto decision_result = phase_change0.arbitrate_phase_change(
9291
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
9392
cache);
94-
CHECK((decision_result ==
95-
std::make_pair(
96-
Parallel::Phase::WriteCheckpoint,
97-
PhaseControl::ArbitrationStrategy::RunPhaseImmediately)));
98-
// It's impossible to know what the elapsed wallclock time will be, so we
99-
// check the tags one by one...
100-
CHECK((tuples::get<PhaseControl::Tags::RestartPhase>(
101-
phase_change_decision_data) == Parallel::Phase::Execute));
102-
// Check recorded time in range: 0 second < time < 1 second
103-
// (this assumes test run duration falls in this time window)
104-
CHECK(tuples::get<PhaseControl::Tags::WallclockHoursAtCheckpoint>(
105-
phase_change_decision_data) > 0.0);
106-
const double one_second = 1.0 / 3600.0;
107-
CHECK(tuples::get<PhaseControl::Tags::WallclockHoursAtCheckpoint>(
108-
phase_change_decision_data) < one_second);
109-
CHECK(tuples::get<PhaseControl::Tags::CheckpointAndExitRequested>(
110-
phase_change_decision_data) == false);
93+
CHECK(
94+
decision_result ==
95+
std::make_pair(Parallel::Phase::WriteCheckpoint,
96+
PhaseControl::ArbitrationStrategy::RunPhaseImmediately));
97+
CHECK(phase_change_decision_data ==
98+
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
99+
Parallel::ExitCode::Complete});
111100
}
112101
{
113102
INFO("Restarting from checkpoint");
114103
// Check behavior following the checkpoint phase
115-
// First check case where wallclock time < recorded time, which corresponds
116-
// to restarting from a checkpoint. Should update options next.
117-
// (this assumes the test doesn't take 1h to get here)
104+
const PhaseControl::CheckpointAndExitAfterWallclock phase_change_restart =
105+
serialize_and_deserialize(phase_change0);
118106
PhaseChangeDecisionData phase_change_decision_data{
119-
Parallel::Phase::Execute, 1.0, false, true,
120-
Parallel::ExitCode::Complete};
121-
auto decision_result = phase_change0.arbitrate_phase_change(
107+
Parallel::Phase::Execute, false, true, Parallel::ExitCode::Complete};
108+
auto decision_result = phase_change_restart.arbitrate_phase_change(
122109
make_not_null(&phase_change_decision_data),
123110
Parallel::Phase::WriteCheckpoint, cache);
124-
CHECK((decision_result ==
125-
std::make_pair(
126-
Parallel::Phase::UpdateOptionsAtRestartFromCheckpoint,
127-
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps)));
128-
CHECK((phase_change_decision_data ==
129-
PhaseChangeDecisionData{Parallel::Phase::Execute, 1.0, false, true,
130-
Parallel::ExitCode::Complete}));
111+
CHECK(decision_result ==
112+
std::make_pair(
113+
Parallel::Phase::UpdateOptionsAtRestartFromCheckpoint,
114+
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps));
115+
CHECK(phase_change_decision_data ==
116+
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
117+
Parallel::ExitCode::Complete});
131118

132119
// Now, from update phase, go back to Execute
133-
decision_result = phase_change0.arbitrate_phase_change(
120+
decision_result = phase_change_restart.arbitrate_phase_change(
134121
make_not_null(&phase_change_decision_data),
135122
Parallel::Phase::UpdateOptionsAtRestartFromCheckpoint, cache);
136-
CHECK((decision_result ==
137-
std::make_pair(
138-
Parallel::Phase::Execute,
139-
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps)));
140-
CHECK((phase_change_decision_data ==
141-
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
142-
Parallel::ExitCode::Complete}));
123+
CHECK(decision_result ==
124+
std::make_pair(
125+
Parallel::Phase::Execute,
126+
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps));
127+
CHECK(phase_change_decision_data ==
128+
PhaseChangeDecisionData{std::nullopt, false, true,
129+
Parallel::ExitCode::Complete});
143130
}
144131
{
145132
INFO("Exiting after checkpoint");
146-
// Now check case where wallclock time > recorded time, which corresponds to
147-
// having just written a checkpoint. We want to exit with exit code 2 now.
148-
// (this assumes the test takes at least a few cycles to get here)
149133
PhaseChangeDecisionData phase_change_decision_data{
150-
Parallel::Phase::Execute, 1e-15, false, true,
151-
Parallel::ExitCode::Complete};
134+
Parallel::Phase::Execute, false, true, Parallel::ExitCode::Complete};
152135
const auto decision_result = phase_change0.arbitrate_phase_change(
153136
make_not_null(&phase_change_decision_data),
154137
Parallel::Phase::WriteCheckpoint, cache);
155-
CHECK((decision_result ==
156-
std::make_pair(
157-
Parallel::Phase::Exit,
158-
PhaseControl::ArbitrationStrategy::RunPhaseImmediately)));
159138
CHECK(
160-
(phase_change_decision_data ==
161-
PhaseChangeDecisionData{Parallel::Phase::Execute, 1e-15, false, true,
162-
Parallel::ExitCode::ContinueFromCheckpoint}));
139+
decision_result ==
140+
std::make_pair(Parallel::Phase::Exit,
141+
PhaseControl::ArbitrationStrategy::RunPhaseImmediately));
142+
CHECK(phase_change_decision_data ==
143+
PhaseChangeDecisionData{Parallel::Phase::Execute, false, true,
144+
Parallel::ExitCode::ContinueFromCheckpoint});
163145
}
164146
}

tests/Unit/Parallel/Test_ExitCode.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Distributed under the MIT License.
2+
// See LICENSE.txt for details.
3+
4+
#include "Framework/TestingFramework.hpp"
5+
6+
#include "Parallel/ExitCode.hpp"
7+
#include "Utilities/GetOutput.hpp"
8+
9+
SPECTRE_TEST_CASE("Unit.Parallel.ExitCode", "[Parallel][Unit]") {
10+
CHECK(get_output(Parallel::ExitCode::Complete) == "0 (Complete)");
11+
CHECK(get_output(Parallel::ExitCode::Abort) == "1 (Abort)");
12+
CHECK(get_output(Parallel::ExitCode::ContinueFromCheckpoint) ==
13+
"2 (ContinueFromCheckpoint)");
14+
CHECK_THROWS_WITH(get_output(static_cast<Parallel::ExitCode>(3)),
15+
Catch::Matchers::ContainsSubstring("Unknown exit code: 3"));
16+
}

0 commit comments

Comments
 (0)