Merge pull request #6396 from guopeng0/feature/node_group_healthy_metrics

k8s-ci-robot · web-flow · commit a2f890247b01 · 2024-01-24T11:46:45.000+01:00
feat:add node group health and back off metrics
diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -462,6 +462,11 @@ func (csr *ClusterStateRegistry) updateNodeGroupMetrics() {
 	metrics.UpdateNodeGroupsCount(autoscaled, autoprovisioned)
 }
 
+// BackoffStatusForNodeGroup returns the backoff status that csr.backoff reports for nodeGroup at time now, using the node info stored under the group's id.
+func (csr *ClusterStateRegistry) BackoffStatusForNodeGroup(nodeGroup cloudprovider.NodeGroup, now time.Time) backoff.Status {
+	return csr.backoff.BackoffStatus(nodeGroup, csr.nodeInfosForGroups[nodeGroup.Id()], now)
+}
+
 // NodeGroupScaleUpSafety returns information about node group safety to be scaled up now.
 func (csr *ClusterStateRegistry) NodeGroupScaleUpSafety(nodeGroup cloudprovider.NodeGroup, now time.Time) NodeGroupScalingSafety {
 	isHealthy := csr.IsNodeGroupHealthy(nodeGroup.Id())
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
@@ -215,6 +215,22 @@ var (
 		}, []string{"node_group"},
 	)
 
+	nodesGroupHealthiness = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_healthiness",
+			Help:      "Whether or not node group is healthy enough for autoscaling. 1 if it is, 0 otherwise.",
+		}, []string{"node_group"},
+	)
+
+	nodeGroupBackOffStatus = k8smetrics.NewGaugeVec(
+		&k8smetrics.GaugeOpts{
+			Namespace: caNamespace,
+			Name:      "node_group_backoff_status",
+			Help:      "Whether or not node group is backoff for not autoscaling. 1 if it is, 0 otherwise.",
+		}, []string{"node_group", "reason"},
+	)
+
 	/**** Metrics related to autoscaler execution ****/
 	lastActivity = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
@@ -438,6 +454,8 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
 		legacyregistry.MustRegister(nodesGroupMinNodes)
 		legacyregistry.MustRegister(nodesGroupMaxNodes)
 		legacyregistry.MustRegister(nodesGroupTargetSize)
+		legacyregistry.MustRegister(nodesGroupHealthiness)
+		legacyregistry.MustRegister(nodeGroupBackOffStatus)
 	}
 }
 
@@ -543,6 +561,30 @@ func UpdateNodeGroupTargetSize(targetSizes map[string]int) {
 	}
 }
 
+// UpdateNodeGroupHealthStatus sets the healthiness gauge of nodeGroup to 1 when healthy is true and to 0 otherwise.
+func UpdateNodeGroupHealthStatus(nodeGroup string, healthy bool) {
+	value := 0.0
+	if healthy {
+		value = 1
+	}
+	nodesGroupHealthiness.WithLabelValues(nodeGroup).Set(value)
+}
+
+// UpdateNodeGroupBackOffStatus sets the backoff gauge of nodeGroup for every reason (1 if backed off, 0 if not); with no reasons it sets the empty-reason series to 0.
+func UpdateNodeGroupBackOffStatus(nodeGroup string, backoffReasonStatus map[string]bool) {
+	if len(backoffReasonStatus) == 0 {
+		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, "").Set(0)
+		return
+	}
+	for reason, backedOff := range backoffReasonStatus {
+		value := 0.0
+		if backedOff {
+			value = 1
+		}
+		nodeGroupBackOffStatus.WithLabelValues(nodeGroup, reason).Set(value)
+	}
+}
+
 // RegisterError records any errors preventing Cluster Autoscaler from working.
 // No more than one error should be recorded per loop.
 func RegisterError(err errors.AutoscalerError) {
diff --git a/cluster-autoscaler/processors/status/autoscaling_status_processor.go b/cluster-autoscaler/processors/status/autoscaling_status_processor.go
@@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {
 
 // NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.
 func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {
-	return &NoOpAutoscalingStatusProcessor{}
+	return &MetricsAutoscalingStatusProcessor{
+		backoffReasonStatus: make(map[string]BackoffReasonStatus), // must be non-nil: the processor writes into this map
+	}
 }
 
 // NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.
diff --git a/cluster-autoscaler/processors/status/metrics_autoscaling_status_processor.go b/cluster-autoscaler/processors/status/metrics_autoscaling_status_processor.go
@@ -0,0 +1,77 @@
+/*
+Copyright 2018 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package status
+
+import (
+	"time"
+
+	"k8s.io/autoscaler/cluster-autoscaler/clusterstate"
+	"k8s.io/autoscaler/cluster-autoscaler/context"
+	"k8s.io/autoscaler/cluster-autoscaler/metrics"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
+)
+
+const (
+	// unknownErrorCode is used as the backoff reason when a backed-off node group's error info carries an empty error code.
+	unknownErrorCode = "unknown"
+)
+
+// BackoffReasonStatus maps a backoff reason (an error code) to whether the node group is currently backed off for that reason.
+type BackoffReasonStatus map[string]bool
+
+// MetricsAutoscalingStatusProcessor publishes node group health and backoff metrics each time Process is called.
+type MetricsAutoscalingStatusProcessor struct {
+	backoffReasonStatus map[string]BackoffReasonStatus // per-reason backoff flags, keyed by node group id
+}
+
+// Process publishes the health and backoff metrics of every existing node group of the cloud provider; it always returns nil.
+func (p *MetricsAutoscalingStatusProcessor) Process(context *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, now time.Time) error {
+	for _, group := range context.CloudProvider.NodeGroups() {
+		// Node groups for which Exist reports false are skipped entirely.
+		if group.Exist() {
+			id := group.Id()
+			metrics.UpdateNodeGroupHealthStatus(id, csr.IsNodeGroupHealthy(id))
+			p.updateNodeGroupBackoffStatusMetrics(id, csr.BackoffStatusForNodeGroup(group, now))
+		}
+	}
+	return nil
+}
+
+// CleanUp is currently a no-op; it leaves the processor's backoffReasonStatus map untouched.
+func (p *MetricsAutoscalingStatusProcessor) CleanUp() {
+}
+
+// updateNodeGroupBackoffStatusMetrics resets the remembered reasons of nodeGroup to false, marks the current reason (if backed off) as true, and publishes the result.
+func (p *MetricsAutoscalingStatusProcessor) updateNodeGroupBackoffStatusMetrics(nodeGroup string, backoffStatus backoff.Status) {
+	reasons, known := p.backoffReasonStatus[nodeGroup]
+	if !known {
+		reasons = make(BackoffReasonStatus)
+		p.backoffReasonStatus[nodeGroup] = reasons
+	}
+	for reason := range reasons {
+		reasons[reason] = false
+	}
+	if backoffStatus.IsBackedOff {
+		code := backoffStatus.ErrorInfo.ErrorCode
+		if code == "" {
+			// An empty code is replaced so the reason is never blank.
+			code = unknownErrorCode
+		}
+		reasons[code] = true
+	}
+	metrics.UpdateNodeGroupBackOffStatus(nodeGroup, reasons)
+}

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,9 @@ type AutoscalingStatusProcessor interface {`
`31`	`31`
`32`	`32`	`// NewDefaultAutoscalingStatusProcessor creates a default instance of AutoscalingStatusProcessor.`
`33`	`33`	`func NewDefaultAutoscalingStatusProcessor() AutoscalingStatusProcessor {`
`34`		`- return &NoOpAutoscalingStatusProcessor{}`
	`34`	`+ return &MetricsAutoscalingStatusProcessor{`
	`35`	`+ backoffReasonStatus: make(map[string]BackoffReasonStatus),`
	`36`	`+ }`
`35`	`37`	`}`
`36`	`38`
`37`	`39`	`// NoOpAutoscalingStatusProcessor is an AutoscalingStatusProcessor implementation useful for testing.`