From 3790b3210d6149310d709766f6056a7a056f744a Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Tue, 26 Mar 2024 10:31:58 +0100 Subject: [PATCH] [gh-19729] Fix logic for updating terminal allocs on clients with max client disconnect (#20181) Only ignore allocs on terminal states that are updated --------- Co-authored-by: Tim Gross --- scheduler/reconcile_test.go | 22 +++++------ scheduler/reconcile_util.go | 4 +- scheduler/reconcile_util_test.go | 63 ++++++++++++++++---------------- 3 files changed, 43 insertions(+), 46 deletions(-) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 62ac5d38035..35e97e2f450 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -942,7 +942,7 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { maxClientDisconnect: pointer.Of(10 * time.Second), PreventRescheduleOnLost: false, reschedulePolicy: disabledReschedulePolicy, - expectPlace: 2, + expectPlace: 1, expectStop: 1, expectIgnore: 4, expectDisconnect: 1, @@ -953,13 +953,12 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { maxClientDisconnect: pointer.Of(10 * time.Second), PreventRescheduleOnLost: true, reschedulePolicy: disabledReschedulePolicy, - expectPlace: 1, // This behaviour needs to be verified + expectPlace: 0, expectStop: 0, expectIgnore: 5, expectDisconnect: 2, allocStatus: structs.AllocClientStatusUnknown, }, - { name: "PreventRescheduleOnLost off, MaxClientDisconnect off, Reschedule on", maxClientDisconnect: nil, @@ -993,8 +992,8 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { Attempts: 1, }, expectPlace: 3, - expectStop: 1, - expectIgnore: 3, + expectStop: 2, + expectIgnore: 2, expectDisconnect: 1, allocStatus: structs.AllocClientStatusLost, }, @@ -5428,7 +5427,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) { }, }, { - name: "ignore-reconnect-completed", + name: "update-reconnect-completed", allocCount: 2, replace: false, disconnectedAllocCount: 2, @@ -5436,16 +5435,15 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocStates: disconnectAllocState, isBatch: true, expected: &resultExpectation{ - place: 2, + place: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { Ignore: 2, - Place: 2, + Place: 0, }, }, }, }, - { name: "stop-original-alloc-failed-replacements-replaced", allocCount: 5, @@ -5496,13 +5494,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectedAllocStates: disconnectAllocState, shouldStopOnDisconnectedNode: true, expected: &resultExpectation{ - stop: 2, + stop: 4, place: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Stop: 2, + Stop: 4, Place: 2, - Ignore: 5, + Ignore: 3, }, }, }, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index bf8978ca2bc..25c7f543152 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -293,9 +293,9 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS } if alloc.TerminalStatus() && !reconnect { - // Terminal allocs, if supportsDisconnectedClient and not reconnect, + // Server-terminal allocs, if supportsDisconnectedClient and not reconnect, // are probably stopped replacements and should be ignored - if supportsDisconnectedClients { + if supportsDisconnectedClients && alloc.ServerTerminalStatus() { ignore[alloc.ID] = alloc continue } diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 55e2011dceb..f5c64e5be3c 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -4,7 +4,6 @@ package scheduler import ( - "fmt" "testing" "time" @@ -129,7 +128,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { Time: now, }, } - fmt.Println(unknownAllocState, expiredAllocState, reconnectedAllocState) jobDefinitions := []struct { name string @@ -477,10 +475,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { taintedNodes: nodes, skipNilNodeTest: false, all: allocSet{ - // Allocs on reconnected nodes that are complete are ignored - "ignored-reconnect-complete": { - ID: "ignored-reconnect-complete", - Name: "ignored-reconnect-complete", + // Allocs on reconnected nodes that are complete need to be updated to stop + "untainted-reconnect-complete": { + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -511,10 +509,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { TaskGroup: "web", AllocStates: unknownAllocState, }, - // Replacement allocs that are complete are ignored - "ignored-reconnect-complete-replacement": { - ID: "ignored-reconnect-complete-replacement", - Name: "ignored-reconnect-complete", + // Replacement allocs that are complete need to be updated + "untainted-reconnect-complete-replacement": { + ID: "untainted-reconnect-complete-replacement", + Name: "untainted-reconnect-complete", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusRun, Job: testJob, @@ -547,7 +545,29 @@ func TestAllocSet_filterByTainted(t *testing.T) { PreviousAllocation: "untainted-reconnect-lost", }, }, - untainted: allocSet{}, + untainted: allocSet{ + "untainted-reconnect-complete": { + ID: "untainted-reconnect-complete", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + }, + "untainted-reconnect-complete-replacement": { + ID: "untainted-reconnect-complete-replacement", + Name: "untainted-reconnect-complete", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + Job: testJob, + NodeID: "normal", + TaskGroup: "web", + AllocStates: unknownAllocState, + PreviousAllocation: "untainted-reconnect-complete", + }, + }, migrate: allocSet{}, disconnecting: allocSet{}, reconnecting: allocSet{ @@ -563,16 +583,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, }, ignore: allocSet{ - "ignored-reconnect-complete": { - ID: "ignored-reconnect-complete", - Name: "ignored-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - }, "ignored-reconnect-lost": { ID: "ignored-reconnect-lost", Name: "ignored-reconnect-lost", @@ -583,17 +593,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { TaskGroup: "web", AllocStates: unknownAllocState, }, - "ignored-reconnect-complete-replacement": { - ID: "ignored-reconnect-complete-replacement", - Name: "ignored-reconnect-complete", - ClientStatus: structs.AllocClientStatusComplete, - DesiredStatus: structs.AllocDesiredStatusRun, - Job: testJob, - NodeID: "normal", - TaskGroup: "web", - AllocStates: unknownAllocState, - PreviousAllocation: "untainted-reconnect-complete", - }, "ignored-reconnect-failed-replacement": { ID: "ignored-reconnect-failed-replacement", Name: "ignored-reconnect-failed",