Skip to content

Commit a2f8902

Browse files
authored
Merge pull request #6396 from guopeng0/feature/node_group_healthy_metrics
feat:add node group health and back off metrics
2 parents 779c1ba + 4b9d4b1 commit a2f8902

File tree

4 files changed

+127
-1
lines changed

4 files changed

+127
-1
lines changed

cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
462462
metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
463463
}
464464

465+
// BackoffStatusForNodeGroup queries the backoff status of the node group
466+
func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status {
467+
return csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
468+
}
469+
465470
// NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
466471
func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
467472
isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())

cluster-autoscaler/metrics/metrics.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,22 @@ var (
215215
}, []string{"node_group"},
216216
)
217217

218+
nodesGroupHealthiness = k8smetrics.NewGaugeVec(
219+
&k8smetrics.GaugeOpts{
220+
Namespace: caNamespace,
221+
Name: "node_group_healthiness",
222+
Help: "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
223+
}, []string{"node_group"},
224+
)
225+
226+
nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
227+
&k8smetrics.GaugeOpts{
228+
Namespace: caNamespace,
229+
Name: "node_group_backoff_status",
230+
Help: "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
231+
}, []string{"node_group", "reason"},
232+
)
233+
218234
/**** Metrics related to autoscaler execution ****/
219235
lastActivity = k8smetrics.NewGaugeVec(
220236
&k8smetrics.GaugeOpts{
@@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
438454
legacyregistry.MustRegister(nodesGroupMinNodes)
439455
legacyregistry.MustRegister(nodesGroupMaxNodes)
440456
legacyregistry.MustRegister(nodesGroupTargetSize)
457+
legacyregistry.MustRegister(nodesGroupHealthiness)
458+
legacyregistry.MustRegister(nodeGroupBackOffStatus)
441459
}
442460
}
443461

@@ -543,6 +561,30 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
543561
}
544562
}
545563

564+
// UpdateNodeGroupHealthStatus records if node group is healthy to autoscaling
565+
func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
566+
if healthy {
567+
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(1)
568+
} else {
569+
nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(0)
570+
}
571+
}
572+
573+
// UpdateNodeGroupBackOffStatus records if node group is backoff for not autoscaling
574+
func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool) {
575+
if len(backoffReasonStatus) == 0 {
576+
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
577+
} else {
578+
for reason, backoff := range backoffReasonStatus {
579+
if backoff {
580+
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(1)
581+
} else {
582+
nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(0)
583+
}
584+
}
585+
}
586+
}
587+
546588
// RegisterError records any errors preventing Cluster Autoscaler from working.
547589
// No more than one error should be recorded per loop.
548590
func RegisterError(err errors.AutoscalerError) {

cluster-autoscaler/processors/status/autoscaling_status_processor.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {
3131

3232
// NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.
3333
func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
34-
return &NoOpAutoscalingStatusProcessor{}
34+
return &MetricsAutoscalingStatusProcessor{
35+
backoffReasonStatus: make(map[string]BackoffReasonStatus),
36+
}
3537
}
3638

3739
// NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package status
18+
19+
import (
20+
"time"
21+
22+
"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
23+
"k8s.io/autoscaler/cluster-autoscaler/context"
24+
"k8s.io/autoscaler/cluster-autoscaler/metrics"
25+
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
26+
)
27+
28+
// unknownErrorCode is the reason recorded when a backoff status carries an
// empty error code, i.e. the cloud provider has not provided one.
const unknownErrorCode = "unknown"
32+
33+
// BackoffReasonStatus maps a backoff reason (an error code) to whether the
// node group is currently backed off for that reason.
34+
type BackoffReasonStatus map[string]bool
35+
36+
// MetricsAutoscalingStatusProcessor is used to update metrics after each autoscaling iteration.
37+
type MetricsAutoscalingStatusProcessor struct {
38+
// backoffReasonStatus holds, per node group ID, every backoff reason seen
// so far and whether it applied on the most recent update.
backoffReasonStatus map[string]BackoffReasonStatus
39+
}
40+
41+
// Process queries the health status and backoff situation of all node groups and updates metrics after each autoscaling iteration.
42+
func (p *MetricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
43+
for _, nodeGroup := range context.CloudProvider.NodeGroups() {
44+
if !nodeGroup.Exist() {
45+
continue
46+
}
47+
metrics.UpdateNodeGroupHealthStatus(nodeGroup.Id(), csr.IsNodeGroupHealthy(nodeGroup.Id()))
48+
backoffStatus := csr.BackoffStatusForNodeGroup(nodeGroup, now)
49+
p.updateNodeGroupBackoffStatusMetrics(nodeGroup.Id(), backoffStatus)
50+
}
51+
return nil
52+
}
53+
54+
// CleanUp cleans up the processor's internal structures.
55+
func (p *MetricsAutoscalingStatusProcessor) CleanUp() {
56+
}
57+
58+
// updateNodeGroupBackoffStatusMetrics updates metrics about backoff situation and reason of the node group
59+
func (p *MetricsAutoscalingStatusProcessor) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
60+
if _, ok := p.backoffReasonStatus[nodeGroup]; ok {
61+
for reason := range p.backoffReasonStatus[nodeGroup] {
62+
p.backoffReasonStatus[nodeGroup][reason] = false
63+
}
64+
} else {
65+
p.backoffReasonStatus[nodeGroup] = make(BackoffReasonStatus)
66+
}
67+
68+
if backoffStatus.IsBackedOff {
69+
errorCode := backoffStatus.ErrorInfo.ErrorCode
70+
if errorCode == "" {
71+
// prevent error code from being empty.
72+
errorCode = unknownErrorCode
73+
}
74+
p.backoffReasonStatus[nodeGroup][errorCode] = true
75+
}
76+
metrics.UpdateNodeGroupBackOffStatus(nodeGroup, p.backoffReasonStatus[nodeGroup])
77+
}

0 commit comments

Comments
 (0)