Skip to content

Commit

Permalink
[gh-19729] Fix logic for updating terminal allocs on clients with max…
Browse files Browse the repository at this point in the history
… client disconnect (#20181)

Only ignore allocs on terminal states that are updated
---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
  • Loading branch information
Juanadelacuesta and tgross authored Mar 26, 2024
1 parent fee242c commit f2965ca
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 46 deletions.
22 changes: 10 additions & 12 deletions scheduler/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
maxClientDisconnect: pointer.Of(10 * time.Second),
PreventRescheduleOnLost: false,
reschedulePolicy: disabledReschedulePolicy,
expectPlace: 2,
expectPlace: 1,
expectStop: 1,
expectIgnore: 4,
expectDisconnect: 1,
Expand All @@ -953,13 +953,12 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
maxClientDisconnect: pointer.Of(10 * time.Second),
PreventRescheduleOnLost: true,
reschedulePolicy: disabledReschedulePolicy,
expectPlace: 1, // This behaviour needs to be verified
expectPlace: 0,
expectStop: 0,
expectIgnore: 5,
expectDisconnect: 2,
allocStatus: structs.AllocClientStatusUnknown,
},

{
name: "PreventRescheduleOnLost off, MaxClientDisconnect off, Reschedule on",
maxClientDisconnect: nil,
Expand Down Expand Up @@ -993,8 +992,8 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
Attempts: 1,
},
expectPlace: 3,
expectStop: 1,
expectIgnore: 3,
expectStop: 2,
expectIgnore: 2,
expectDisconnect: 1,
allocStatus: structs.AllocClientStatusLost,
},
Expand Down Expand Up @@ -5428,24 +5427,23 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
},
},
{
name: "ignore-reconnect-completed",
name: "update-reconnect-completed",
allocCount: 2,
replace: false,
disconnectedAllocCount: 2,
disconnectedAllocStatus: structs.AllocClientStatusComplete,
disconnectedAllocStates: disconnectAllocState,
isBatch: true,
expected: &resultExpectation{
place: 2,
place: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
"web": {
Ignore: 2,
Place: 2,
Place: 0,
},
},
},
},

{
name: "stop-original-alloc-failed-replacements-replaced",
allocCount: 5,
Expand Down Expand Up @@ -5496,13 +5494,13 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
disconnectedAllocStates: disconnectAllocState,
shouldStopOnDisconnectedNode: true,
expected: &resultExpectation{
stop: 2,
stop: 4,
place: 2,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
"web": {
Stop: 2,
Stop: 4,
Place: 2,
Ignore: 5,
Ignore: 3,
},
},
},
Expand Down
4 changes: 2 additions & 2 deletions scheduler/reconcile_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,9 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS
}

if alloc.TerminalStatus() && !reconnect {
// Terminal allocs, if supportsDisconnectedClient and not reconnect,
// Server-terminal allocs, if supportsDisconnectedClient and not reconnect,
// are probably stopped replacements and should be ignored
if supportsDisconnectedClients {
if supportsDisconnectedClients && alloc.ServerTerminalStatus() {
ignore[alloc.ID] = alloc
continue
}
Expand Down
63 changes: 31 additions & 32 deletions scheduler/reconcile_util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package scheduler

import (
"fmt"
"testing"
"time"

Expand Down Expand Up @@ -129,7 +128,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
Time: now,
},
}
fmt.Println(unknownAllocState, expiredAllocState, reconnectedAllocState)

jobDefinitions := []struct {
name string
Expand Down Expand Up @@ -477,10 +475,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
taintedNodes: nodes,
skipNilNodeTest: false,
all: allocSet{
// Allocs on reconnected nodes that are complete are ignored
"ignored-reconnect-complete": {
ID: "ignored-reconnect-complete",
Name: "ignored-reconnect-complete",
// Allocs on reconnected nodes that are complete need to be updated to stop
"untainted-reconnect-complete": {
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
Expand Down Expand Up @@ -511,10 +509,10 @@ func TestAllocSet_filterByTainted(t *testing.T) {
TaskGroup: "web",
AllocStates: unknownAllocState,
},
// Replacement allocs that are complete are ignored
"ignored-reconnect-complete-replacement": {
ID: "ignored-reconnect-complete-replacement",
Name: "ignored-reconnect-complete",
// Replacement allocs that are complete need to be updated
"untainted-reconnect-complete-replacement": {
ID: "untainted-reconnect-complete-replacement",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
Expand Down Expand Up @@ -547,7 +545,29 @@ func TestAllocSet_filterByTainted(t *testing.T) {
PreviousAllocation: "untainted-reconnect-lost",
},
},
untainted: allocSet{},
untainted: allocSet{
"untainted-reconnect-complete": {
ID: "untainted-reconnect-complete",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"untainted-reconnect-complete-replacement": {
ID: "untainted-reconnect-complete-replacement",
Name: "untainted-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-complete",
},
},
migrate: allocSet{},
disconnecting: allocSet{},
reconnecting: allocSet{
Expand All @@ -563,16 +583,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
},
},
ignore: allocSet{
"ignored-reconnect-complete": {
ID: "ignored-reconnect-complete",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"ignored-reconnect-lost": {
ID: "ignored-reconnect-lost",
Name: "ignored-reconnect-lost",
Expand All @@ -583,17 +593,6 @@ func TestAllocSet_filterByTainted(t *testing.T) {
TaskGroup: "web",
AllocStates: unknownAllocState,
},
"ignored-reconnect-complete-replacement": {
ID: "ignored-reconnect-complete-replacement",
Name: "ignored-reconnect-complete",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusRun,
Job: testJob,
NodeID: "normal",
TaskGroup: "web",
AllocStates: unknownAllocState,
PreviousAllocation: "untainted-reconnect-complete",
},
"ignored-reconnect-failed-replacement": {
ID: "ignored-reconnect-failed-replacement",
Name: "ignored-reconnect-failed",
Expand Down

0 comments on commit f2965ca

Please sign in to comment.