Skip to content

Commit c3ba40e

Browse files
committed
UPSTREAM: <carry>: improve failed machine detection for hcp
This change allows the failed machine logic to be aware of the API differences between MAPI and CAPI status fields related to failures and errors. This change is being added to ensure that the autoscaler will properly detect failed machines regardless of whether it is running on OCP or HCP.
1 parent 76fd929 commit c3ba40e

File tree

2 files changed

+82
-31
lines changed

2 files changed

+82
-31
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ const (
6767
autoDiscovererTypeClusterAPI = "clusterapi"
6868
autoDiscovererClusterNameKey = "clusterName"
6969
autoDiscovererNamespaceKey = "namespace"
70+
CAPIFailedMessageField = "failureMessage"
71+
MAPIFailedMessageField = "errorMessage"
7072
)
7173

7274
// machineController watches for Nodes, Machines, MachinePools, MachineSets, and
@@ -625,17 +627,25 @@ func (c *machineController) findScalableResourceProviderIDs(scalableResource *un
625627
}
626628

627629
for _, machine := range machines {
630+
// Because the error messages are contained in different fields depending on the API (MAPI or CAPI), we
631+
// need to make sure we grab the proper information so that the failure states are evaluated properly.
632+
apiVersion := machine.GetAPIVersion()
633+
failedMessageField := CAPIFailedMessageField
634+
if strings.HasPrefix(apiVersion, openshiftMAPIGroup) {
635+
failedMessageField = MAPIFailedMessageField
636+
}
637+
628638
// In some cases it is possible for a machine to have acquired a provider ID from the infrastructure and
629639
// then become failed later. We want to ensure that a failed machine is not counted towards the total
630640
// number of nodes in the cluster, for this reason we will detect a failed machine first, regardless
631641
// of provider ID, and give it a normalized provider ID with failure message prepended.
632-
errorMessage, found, err := unstructured.NestedString(machine.UnstructuredContent(), "status", "errorMessage")
642+
failureMessage, found, err := unstructured.NestedString(machine.UnstructuredContent(), "status", failedMessageField)
633643
if err != nil {
634644
return nil, err
635645
}
636646

637647
if found {
638-
klog.V(4).Infof("Status.ErrorMessage of machine %q is %q", machine.GetName(), errorMessage)
648+
klog.V(4).Infof("status.%s of machine %q is %q", failedMessageField, machine.GetName(), failureMessage)
639649
// Provide a fake ID to allow the autoscaler to track machines that will never
640650
// become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime.
641651
// Fake ID needs to be recognised later and converted into a machine key.

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go

Lines changed: 70 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
416416
includeFailedMachine bool
417417
includeFailedMachineWithProviderID bool
418418
includePendingMachine bool
419+
failedMachineIsMAPI bool
419420
}
420421

421422
test := func(t *testing.T, tc *testCase, testConfig *testConfig) {
@@ -451,7 +452,13 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
451452
if !tc.includeFailedMachineWithProviderID {
452453
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
453454
}
454-
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
455+
456+
if tc.failedMachineIsMAPI {
457+
machine.SetAPIVersion(fmt.Sprintf("%s/v1beta1", openshiftMAPIGroup))
458+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
459+
} else {
460+
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "failureMessage")
461+
}
455462

456463
if err := updateResource(controller.managementClient, controller.machineInformer, controller.machineResource, machine); err != nil {
457464
t.Fatalf("unexpected error updating machine, got %v", err)
@@ -648,6 +655,25 @@ func TestNodeGroupDecreaseTargetSize(t *testing.T) {
648655
includeFailedMachine: true,
649656
includeFailedMachineWithProviderID: true,
650657
},
658+
{
659+
description: "A node group with 4 replicas with one failed MAPI machine without a provider ID should decrease by 1",
660+
initial: 4,
661+
targetSizeIncrement: 0,
662+
expected: 3,
663+
delta: -1,
664+
includeFailedMachine: true,
665+
failedMachineIsMAPI: true,
666+
},
667+
{
668+
description: "A node group with 4 replicas with one failed MAPI machine that has a provider ID should decrease by 1",
669+
initial: 4,
670+
targetSizeIncrement: 0,
671+
expected: 3,
672+
delta: -1,
673+
includeFailedMachine: true,
674+
includeFailedMachineWithProviderID: true,
675+
failedMachineIsMAPI: true,
676+
},
651677
}
652678

653679
annotations := map[string]string{
@@ -1306,13 +1332,20 @@ func TestNodeGroupDeleteNodesSequential(t *testing.T) {
13061332
}
13071333

13081334
func TestNodeGroupWithFailedMachine(t *testing.T) {
1309-
test := func(t *testing.T, testConfig *testConfig) {
1310-
controller, stop := mustCreateTestController(t, testConfig)
1335+
type testCase struct {
1336+
description string
1337+
testConfig *testConfig
1338+
failedMachineIsMAPI bool
1339+
}
1340+
1341+
test := func(t *testing.T, tc testCase) {
1342+
controller, stop := mustCreateTestController(t, tc.testConfig)
13111343
defer stop()
13121344

13131345
// Simulate a failed machine
1314-
machine := testConfig.machines[3].DeepCopy()
1346+
machine := tc.testConfig.machines[3].DeepCopy()
13151347

1348+
machine.SetAPIVersion(fmt.Sprintf("%s/v1beta1", openshiftMAPIGroup))
13161349
unstructured.RemoveNestedField(machine.Object, "spec", "providerID")
13171350
unstructured.SetNestedField(machine.Object, "ErrorMessage", "status", "errorMessage")
13181351

@@ -1335,8 +1368,8 @@ func TestNodeGroupWithFailedMachine(t *testing.T) {
13351368
t.Fatalf("unexpected error: %v", err)
13361369
}
13371370

1338-
if len(nodeNames) != len(testConfig.nodes) {
1339-
t.Fatalf("expected len=%v, got len=%v", len(testConfig.nodes), len(nodeNames))
1371+
if len(nodeNames) != len(tc.testConfig.nodes) {
1372+
t.Fatalf("expected len=%v, got len=%v", len(tc.testConfig.nodes), len(nodeNames))
13401373
}
13411374

13421375
sort.SliceStable(nodeNames, func(i, j int) bool {
@@ -1360,8 +1393,8 @@ func TestNodeGroupWithFailedMachine(t *testing.T) {
13601393
nodeIndex = i
13611394
}
13621395

1363-
if nodeNames[i].Id != testConfig.nodes[nodeIndex].Spec.ProviderID {
1364-
t.Fatalf("expected %q, got %q", testConfig.nodes[nodeIndex].Spec.ProviderID, nodeNames[i].Id)
1396+
if nodeNames[i].Id != tc.testConfig.nodes[nodeIndex].Spec.ProviderID {
1397+
t.Fatalf("expected %q, got %q", tc.testConfig.nodes[nodeIndex].Spec.ProviderID, nodeNames[i].Id)
13651398
}
13661399
}
13671400
}
@@ -1370,40 +1403,48 @@ func TestNodeGroupWithFailedMachine(t *testing.T) {
13701403
// Going beyond 10 will break the sorting that happens in the
13711404
// test() function because sort.Strings() will not do natural
13721405
// sorting and the expected semantics in test() will fail.
1406+
annotations := map[string]string{
1407+
nodeGroupMinSizeAnnotationKey: "1",
1408+
nodeGroupMaxSizeAnnotationKey: "10",
1409+
}
13731410

1374-
t.Run("MachineSet", func(t *testing.T) {
1375-
test(
1376-
t,
1377-
createMachineSetTestConfig(
1411+
testCases := []testCase{
1412+
{
1413+
description: "one failed machine with MAPI api group",
1414+
testConfig: createMachineSetTestConfig(
13781415
RandomString(6),
13791416
RandomString(6),
13801417
RandomString(6),
13811418
10,
1382-
map[string]string{
1383-
nodeGroupMinSizeAnnotationKey: "1",
1384-
nodeGroupMaxSizeAnnotationKey: "10",
1385-
},
1419+
annotations,
13861420
nil,
13871421
),
1388-
)
1389-
})
1390-
1391-
t.Run("MachineDeployment", func(t *testing.T) {
1392-
test(
1393-
t,
1394-
createMachineDeploymentTestConfig(
1422+
failedMachineIsMAPI: true,
1423+
},
1424+
{
1425+
description: "one failed machine with CAPI api group",
1426+
testConfig: createMachineSetTestConfig(
13951427
RandomString(6),
13961428
RandomString(6),
13971429
RandomString(6),
13981430
10,
1399-
map[string]string{
1400-
nodeGroupMinSizeAnnotationKey: "1",
1401-
nodeGroupMaxSizeAnnotationKey: "10",
1402-
},
1431+
annotations,
14031432
nil,
14041433
),
1405-
)
1406-
})
1434+
},
1435+
}
1436+
1437+
for _, tc := range testCases {
1438+
t.Run(fmt.Sprintf("MachineSet/%s", tc.description), func(t *testing.T) {
1439+
test(t, tc)
1440+
})
1441+
}
1442+
1443+
for _, tc := range testCases {
1444+
t.Run(fmt.Sprintf("MachineDeployment/%s", tc.description), func(t *testing.T) {
1445+
test(t, tc)
1446+
})
1447+
}
14071448
}
14081449

14091450
func TestNodeGroupTemplateNodeInfo(t *testing.T) {

0 commit comments

Comments
 (0)