Skip to content

Commit 2130201

Browse files
committed
tests: add test for function migration thread and fix bug in thread shutdown
1 parent ff175df commit 2130201

File tree

4 files changed

+124
-31
lines changed

4 files changed

+124
-31
lines changed

include/faabric/scheduler/Scheduler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ class Scheduler
228228
// ----------------------------------
229229
// Function Migration
230230
// ----------------------------------
231-
void checkForMigrationOpportunities(
231+
bool checkForMigrationOpportunities(
232232
faabric::util::MigrationStrategy =
233233
faabric::util::MigrationStrategy::BIN_PACK);
234234

src/scheduler/FunctionMigrationThread.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
namespace faabric::scheduler {
77
void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn)
88
{
9+
// Initialise wake up period and shutdown variable
910
wakeUpPeriodSeconds = wakeUpPeriodSecondsIn;
11+
isShutdown.store(false, std::memory_order_release);
1012

1113
// Main work loop
1214
workThread = std::make_unique<std::thread>([&] {
@@ -26,9 +28,16 @@ void FunctionMigrationThread::start(int wakeUpPeriodSecondsIn)
2628
// If we hit the timeout it means we have not been notified to
2729
// reset or stop. Thus we check for migration oportunities.
2830
if (returnVal == std::cv_status::timeout) {
29-
SPDLOG_DEBUG("Checking for migration oportunities");
30-
faabric::scheduler::getScheduler()
31-
.checkForMigrationOpportunities();
31+
SPDLOG_DEBUG(
32+
"Migration thread checking for migration oportunities");
33+
// If there are no more apps in-flight to be checked-for, the
34+
// scheduler will return false, so we can shut down
35+
bool shutdown = faabric::scheduler::getScheduler()
36+
.checkForMigrationOpportunities();
37+
if (shutdown) {
38+
SPDLOG_INFO("Shutting down ourselves...");
39+
isShutdown.store(true, std::memory_order_release);
40+
}
3241
}
3342
};
3443

@@ -42,7 +51,7 @@ void FunctionMigrationThread::stop()
4251
return;
4352
}
4453

45-
{
54+
if (!isShutdown.load(std::memory_order_acquire)) {
4655
faabric::util::UniqueLock lock(mx);
4756

4857
// We set the flag _before_ we notify and after we acquire the lock.
@@ -55,5 +64,7 @@ void FunctionMigrationThread::stop()
5564
if (workThread->joinable()) {
5665
workThread->join();
5766
}
67+
68+
workThread = nullptr;
5869
}
5970
}

src/scheduler/Scheduler.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ faabric::util::SchedulingDecision Scheduler::doCallFunctions(
463463
std::make_shared<faabric::util::SchedulingDecision>(decision);
464464
inFlightRequests[decision.appId] = std::make_pair(req, decisionPtr);
465465
if (inFlightRequests.size() == 1) {
466+
SPDLOG_INFO("starting migration thread");
466467
functionMigrationThread.start(firstMsg.migrationcheckperiod());
467468
} else if (firstMsg.migrationcheckperiod() !=
468469
functionMigrationThread.wakeUpPeriodSeconds) {
@@ -916,11 +917,6 @@ void Scheduler::setFunctionResult(faabric::Message& msg)
916917

917918
inFlightRequests.erase(msg.appid());
918919
pendingMigrations.erase(msg.appid());
919-
// If there are no more apps to track, stop the thread checking for
920-
// migration opportunities
921-
if (inFlightRequests.size() == 0) {
922-
functionMigrationThread.stop();
923-
}
924920
}
925921

926922
// Write the successful result to the result queue
@@ -1167,18 +1163,25 @@ ExecGraphNode Scheduler::getFunctionExecGraphNode(unsigned int messageId)
11671163
return node;
11681164
}
11691165

1170-
void Scheduler::checkForMigrationOpportunities(
1166+
bool Scheduler::checkForMigrationOpportunities(
11711167
faabric::util::MigrationStrategy migrationStrategy)
11721168
{
11731169
// Vector to cache all migrations we have to do, and update the shared map
11741170
// at the very end just once. This is because we need a unique lock to write
11751171
// to the shared map, but the rest of this method can do with a shared lock.
11761172
std::vector<std::shared_ptr<faabric::PendingMigrations>>
11771173
tmpPendingMigrations;
1174+
SPDLOG_INFO("Checking for migration opportunities");
11781175

11791176
{
11801177
faabric::util::SharedLock lock(mx);
11811178

1179+
// If no in-flight requests, stop the background thread
1180+
if (inFlightRequests.size() == 0) {
1181+
SPDLOG_INFO("No requests to check");
1182+
return true;
1183+
}
1184+
11821185
// For each in-flight request that has opted in to be migrated,
11831186
// check if there is an opportunity to migrate
11841187
for (const auto& app : inFlightRequests) {
@@ -1188,6 +1191,9 @@ void Scheduler::checkForMigrationOpportunities(
11881191
// If we have already recorded a pending migration for this req,
11891192
// skip
11901193
if (canAppBeMigrated(originalDecision.appId) != nullptr) {
1194+
SPDLOG_INFO("Skipping app {} as migration opportunity has "
1195+
"already been recorded",
1196+
originalDecision.appId);
11911197
continue;
11921198
}
11931199

@@ -1216,9 +1222,6 @@ void Scheduler::checkForMigrationOpportunities(
12161222
};
12171223
auto claimSlot = [&r]() {
12181224
int currentUsedSlots = r.usedslots();
1219-
SPDLOG_INFO("Old slots: {} - New slots: {}",
1220-
currentUsedSlots,
1221-
currentUsedSlots + 1);
12221225
r.set_usedslots(currentUsedSlots + 1);
12231226
};
12241227
while (left < right) {
@@ -1282,10 +1285,11 @@ void Scheduler::checkForMigrationOpportunities(
12821285
if (tmpPendingMigrations.size() > 0) {
12831286
faabric::util::FullLock lock(mx);
12841287
for (auto msgPtr : tmpPendingMigrations) {
1285-
SPDLOG_INFO("Adding app: {}", msgPtr->appid());
12861288
pendingMigrations[msgPtr->appid()] = std::move(msgPtr);
12871289
}
12881290
}
1291+
1292+
return false;
12891293
}
12901294

12911295
std::shared_ptr<faabric::PendingMigrations> Scheduler::canAppBeMigrated(

tests/test/scheduler/test_function_migration.cpp

Lines changed: 94 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ class FunctionMigrationTestFixture : public SchedulerTestFixture
2525

2626
~FunctionMigrationTestFixture()
2727
{
28-
sch.clearRecordedMessages();
2928
faabric::util::setMockMode(false);
3029

3130
// Remove all hosts from global set
@@ -89,7 +88,7 @@ TEST_CASE_METHOD(FunctionMigrationTestFixture,
8988

9089
TEST_CASE_METHOD(
9190
FunctionMigrationTestFixture,
92-
"Test function migration thread only works if set in the message",
91+
"Test migration oportunities are only detected if set in the message",
9392
"[scheduler]")
9493
{
9594
// First set resources before calling the functions: one will be allocated
@@ -105,6 +104,7 @@ TEST_CASE_METHOD(
105104
req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep));
106105
uint32_t appId = req->messages().at(0).appid();
107106

107+
// Build expected pending migrations
108108
std::shared_ptr<faabric::PendingMigrations> expectedMigrations;
109109
SECTION("Migration not enabled") { expectedMigrations = nullptr; }
110110

@@ -157,10 +157,9 @@ TEST_CASE_METHOD(
157157
REQUIRE(sch.canAppBeMigrated(appId) == nullptr);
158158
}
159159

160-
TEST_CASE_METHOD(
161-
FunctionMigrationTestFixture,
162-
"Test function migration thread detects migration opportunities",
163-
"[scheduler]")
160+
TEST_CASE_METHOD(FunctionMigrationTestFixture,
161+
"Test checking for migration opportunities",
162+
"[scheduler]")
164163
{
165164
std::vector<std::string> hosts = { masterHost, "hostA" };
166165
std::vector<int> slots = { 1, 1 };
@@ -170,9 +169,12 @@ TEST_CASE_METHOD(
170169
auto req = faabric::util::batchExecFactory("foo", "sleep", 2);
171170
int timeToSleep = SHORT_TEST_TIMEOUT_MS;
172171
req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep));
173-
req->mutable_messages()->at(0).set_migrationcheckperiod(2);
174172
uint32_t appId = req->messages().at(0).appid();
175173

174+
// By setting the check period to a non-zero value, we are effectively
175+
// opting in to be considered for migration
176+
req->mutable_messages()->at(0).set_migrationcheckperiod(2);
177+
176178
auto decision = sch.callFunctions(req);
177179

178180
std::shared_ptr<faabric::PendingMigrations> expectedMigrations;
@@ -226,11 +228,11 @@ TEST_CASE_METHOD(
226228

227229
TEST_CASE_METHOD(
228230
FunctionMigrationTestFixture,
229-
"Test detecting migration opportunities with several hosts and requests",
231+
"Test detecting migration opportunities for several messages and hosts",
230232
"[scheduler]")
231233
{
232-
// First set resources before calling the functions: one will be allocated
233-
// locally, another one in the remote host
234+
// First set resources before calling the functions: one request will be
235+
// allocated to each host
234236
std::vector<std::string> hosts = { masterHost, "hostA", "hostB", "hostC" };
235237
std::vector<int> slots = { 1, 1, 1, 1 };
236238
std::vector<int> usedSlots = { 0, 0, 0, 0 };
@@ -239,9 +241,11 @@ TEST_CASE_METHOD(
239241
auto req = faabric::util::batchExecFactory("foo", "sleep", 4);
240242
int timeToSleep = SHORT_TEST_TIMEOUT_MS;
241243
req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep));
242-
req->mutable_messages()->at(0).set_migrationcheckperiod(2);
243244
uint32_t appId = req->messages().at(0).appid();
244245

246+
// Opt in to be considered for migration
247+
req->mutable_messages()->at(0).set_migrationcheckperiod(2);
248+
245249
auto decision = sch.callFunctions(req);
246250

247251
// Set up expectations
@@ -256,17 +260,17 @@ TEST_CASE_METHOD(
256260
std::vector<int> newUsedSlots = { 1, 1, 1, 1 };
257261
setHostResources(hosts, newSlots, newUsedSlots);
258262

259-
// Build expected result
263+
// Build expected result: two migrations
260264
faabric::PendingMigrations expected;
261265
expected.set_appid(appId);
262-
// Migrate last message (scheduled to last host) to first host. This
263-
// fills up the first host.
266+
// Migration 1: migrate last message (originally scheduled to last host)
267+
// to first host. This fills up the first host.
264268
auto* migration1 = expected.add_migrations();
265269
migration1->set_messageid(req->messages().at(3).id());
266270
migration1->set_srchost(hosts.at(3));
267271
migration1->set_dsthost(hosts.at(0));
268-
// Migrate penultimate message (scheduled to penultimate host) to second
269-
// host. This fills up the second host.
272+
// Migration 2: migrate penultimate message (originally scheduled to
273+
// penultimate host) to second host. This fills up the second host.
270274
auto* migration2 = expected.add_migrations();
271275
migration2->set_messageid(req->messages().at(2).id());
272276
migration2->set_srchost(hosts.at(2));
@@ -301,4 +305,78 @@ TEST_CASE_METHOD(
301305
sch.checkForMigrationOpportunities();
302306
REQUIRE(sch.canAppBeMigrated(appId) == nullptr);
303307
}
308+
309+
TEST_CASE_METHOD(
310+
FunctionMigrationTestFixture,
311+
"Test function migration thread detects migration opportunities",
312+
"[scheduler]")
313+
{
314+
std::vector<std::string> hosts = { masterHost, "hostA" };
315+
std::vector<int> slots = { 1, 1 };
316+
std::vector<int> usedSlots = { 0, 0 };
317+
setHostResources(hosts, slots, usedSlots);
318+
319+
auto req = faabric::util::batchExecFactory("foo", "sleep", 2);
320+
int checkPeriodSecs = 1;
321+
int timeToSleep = 4 * checkPeriodSecs * 1000;
322+
req->mutable_messages()->at(0).set_inputdata(std::to_string(timeToSleep));
323+
uint32_t appId = req->messages().at(0).appid();
324+
325+
// Opt in to be migrated
326+
req->mutable_messages()->at(0).set_migrationcheckperiod(checkPeriodSecs);
327+
328+
auto decision = sch.callFunctions(req);
329+
330+
std::shared_ptr<faabric::PendingMigrations> expectedMigrations;
331+
332+
SECTION("Can not migrate") { expectedMigrations = nullptr; }
333+
334+
// As we don't update the available resources, no migration opportunities
335+
// will appear, even though we are checking for them
336+
SECTION("Can migrate")
337+
{
338+
// Update host resources so that a migration opportunity appears
339+
updateLocalResources(2, 1);
340+
341+
// Build expected result
342+
faabric::PendingMigrations expected;
343+
expected.set_appid(appId);
344+
auto* migration = expected.add_migrations();
345+
migration->set_messageid(req->messages().at(1).id());
346+
migration->set_srchost(hosts.at(1));
347+
migration->set_dsthost(hosts.at(0));
348+
expectedMigrations =
349+
std::make_shared<faabric::PendingMigrations>(expected);
350+
}
351+
352+
// Instead of directly calling the scheduler function to check for migration
353+
// opportunites, sleep for enough time (twice the check period) so that a
354+
// migration is detected by the background thread.
355+
SLEEP_MS(2 * checkPeriodSecs * 1000);
356+
357+
auto actualMigrations = sch.canAppBeMigrated(appId);
358+
if (expectedMigrations == nullptr) {
359+
REQUIRE(actualMigrations == expectedMigrations);
360+
} else {
361+
REQUIRE(actualMigrations->appid() == expectedMigrations->appid());
362+
REQUIRE(actualMigrations->migrations_size() ==
363+
expectedMigrations->migrations_size());
364+
for (int i = 0; i < actualMigrations->migrations_size(); i++) {
365+
auto actual = actualMigrations->mutable_migrations()->at(i);
366+
auto expected = expectedMigrations->mutable_migrations()->at(i);
367+
REQUIRE(actual.messageid() == expected.messageid());
368+
REQUIRE(actual.srchost() == expected.srchost());
369+
REQUIRE(actual.dsthost() == expected.dsthost());
370+
}
371+
}
372+
373+
faabric::Message res =
374+
sch.getFunctionResult(req->messages().at(0).id(), 2 * timeToSleep);
375+
REQUIRE(res.returnvalue() == 0);
376+
377+
// Check that after the result is set, the app can't be migrated no more
378+
sch.checkForMigrationOpportunities();
379+
REQUIRE(sch.canAppBeMigrated(appId) == nullptr);
380+
}
381+
304382
}

0 commit comments

Comments
 (0)