From fe20def0f78192864f063885e6f7db60fd811924 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Tue, 2 Apr 2024 05:11:30 -0500 Subject: [PATCH] Backport of [gh-19729] Fix logic for updating terminal allocs on clients with max client disconnect into release/1.7.x (#20248) * [gh-19729] Fix logic for updating terminal allocs on clients with max client disconnect (#20181) Only ignore allocs on terminal states that are updated --------- Co-authored-by: Tim Gross * [gh-19729] Fix logic for updating terminal allocs on clients with max client disconnect (#20181) * fix: update test --------- Co-authored-by: Juana De La Cuesta Co-authored-by: Tim Gross Co-authored-by: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> --- scheduler/reconcile_test.go | 28 +++++++------ scheduler/reconcile_util.go | 4 +- scheduler/reconcile_util_test.go | 70 +++++++++++++++----------------- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 3e13276a8db..ab18bd5278d 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -940,7 +940,7 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { maxClientDisconnect: pointer.Of(10 * time.Second), PreventRescheduleOnLost: false, reschedulePolicy: disabledReschedulePolicy, - expectPlace: 2, + expectPlace: 1, expectStop: 1, expectIgnore: 4, expectDisconnect: 1, @@ -951,13 +951,12 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { maxClientDisconnect: pointer.Of(10 * time.Second), PreventRescheduleOnLost: true, reschedulePolicy: disabledReschedulePolicy, - expectPlace: 1, // This behaviour needs to be verified + expectPlace: 0, expectStop: 0, expectIgnore: 5, expectDisconnect: 2, allocStatus: structs.AllocClientStatusUnknown, }, - { name: "PreventRescheduleOnLost off, MaxClientDisconnect off, Reschedule on", maxClientDisconnect: nil, @@ -991,8 +990,8 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { Attempts: 1, }, expectPlace: 3, - expectStop: 1, - expectIgnore: 3, + expectStop: 2, + expectIgnore: 2, expectDisconnect: 1, allocStatus: structs.AllocClientStatusLost, }, @@ -5447,7 +5446,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "ignore-reconnect-completed", + name: "update-reconnect-completed", allocCount: 2, replace: false, disconnectedAllocCount: 2, @@ -5456,11 +5455,11 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocStates: disconnectAllocState, isBatch: true, expected: &resultExpectation{ - place: 2, + place: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, - Place: 2, + Place: 0, }, }, }, @@ -5476,10 +5475,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocStates: disconnectAllocState, expected: &resultExpectation{ reconnectUpdates: 2, - stop: 0, + stop: 2, + place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Ignore: 5, + Ignore: 3, + Place: 2, + Stop: 2, }, }, }, @@ -5617,13 +5619,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocStates: disconnectAllocState, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ - stop: 2, + stop: 4, place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Stop: 2, + Stop: 4, Place: 2, - Ignore: 5, + Ignore: 3, }, }, }, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 18d8af5c4e5..4b10801ed18 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -293,9 +293,9 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS } if alloc.TerminalStatus() && !reconnect { - // Terminal allocs, if supportsDisconnectedClient and not reconnect, + // Server-terminal allocs, if supportsDisconnectedClient and not reconnect, // are probably stopped replacements and should be ignored - if supportsDisconnectedClients { + if supportsDisconnectedClients && alloc.ServerTerminalStatus() { ignore[alloc.ID] = alloc continue } diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 268c963d52c..0aef2918d8a 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -15,8 +15,6 @@ import ( ) func TestAllocSet_filterByTainted(t *testing.T) { - ci.Parallel(t) - nodes := map[string]*structs.Node{ "draining": { ID: "draining", @@ -77,7 +75,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, } - type testCase struct { + testCases := []struct { name string all allocSet taintedNodes map[string]*structs.Node @@ -93,10 +91,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { reconnecting allocSet ignore allocSet expiring allocSet - } - - testCases := []testCase{ - // These two cases test that we maintain parity with pre-disconnected-clients behavior. + }{ // These two cases test that we maintain parity with pre-disconnected-clients behavior. { name: "lost-client", supportsDisconnectedClients: false, @@ -394,10 +389,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { taintedNodes: nodes, skipNilNodeTest: false, all: allocSet{ - // Allocs on reconnected nodes that are complete are ignored - "ignored-reconnect-complete": { - ID: "ignored-reconnect-complete", - Name: "ignored-reconnect-complete", + // Allocs on reconnected nodes that are complete need to be updated to stop + "untainted-reconnect-complete": { + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -428,10 +423,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { TaskGroup: "web", AllocStates: unknownAllocState, }, - // Replacement allocs that are complete are ignored - "ignored-reconnect-complete-replacement": { - ID: "ignored-reconnect-complete-replacement", - Name: "ignored-reconnect-complete", + // Replacement allocs that are complete need to be updated + "untainted-reconnect-complete-replacement": { + ID: "untainted-reconnect-complete-replacement", + Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -464,7 +459,29 @@ func TestAllocSet_filterByTainted(t *testing.T) { PreviousAllocation: "untainted-reconnect-lost", }, }, - untainted: allocSet{}, + untainted: allocSet{ + "untainted-reconnect-complete": { + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + }, + "untainted-reconnect-complete-replacement": { + ID: "untainted-reconnect-complete-replacement", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + PreviousAllocation: "untainted-reconnect-complete", + }, + }, migrate: allocSet{}, disconnecting: allocSet{}, reconnecting: allocSet{ @@ -480,16 +497,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, }, ignore: allocSet{ - "ignored-reconnect-complete": { - ID: "ignored-reconnect-complete", - Name: "ignored-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - }, "ignored-reconnect-lost": { ID: "ignored-reconnect-lost", Name: "ignored-reconnect-lost", @@ -500,17 +507,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { TaskGroup: "web", AllocStates: unknownAllocState, }, - "ignored-reconnect-complete-replacement": { - ID: "ignored-reconnect-complete-replacement", - Name: "ignored-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - PreviousAllocation: "untainted-reconnect-complete", - }, "ignored-reconnect-failed-replacement": { ID: "ignored-reconnect-failed-replacement", Name: "ignored-reconnect-failed",