From 14b87d49120507fa138c7085e446ee64d2d01b7f Mon Sep 17 00:00:00 2001 From: Brian Drennan Date: Tue, 3 Dec 2024 12:23:39 -0800 Subject: [PATCH 1/3] add support for full kube metrics list --- pkg/config/prometheus.go | 6 ++++++ pkg/config/prometheus_test.go | 11 +++++++++++ pkg/config/testdata/cloudzero-agent-validator.yml | 8 ++++++++ 3 files changed, 25 insertions(+) diff --git a/pkg/config/prometheus.go b/pkg/config/prometheus.go index 1a39192..483157a 100644 --- a/pkg/config/prometheus.go +++ b/pkg/config/prometheus.go @@ -11,6 +11,7 @@ type Prometheus struct { Executable string `yaml:"executable" default:"/bin/prometheus" env:"PROMETHEUS_EXECUTABLE" env-description:"Prometheus Executable Path"` KubeStateMetricsServiceEndpoint string `yaml:"kube_state_metrics_service_endpoint" env:"KMS_EP_URL" required:"true" env-description:"Kube State Metrics Service Endpoint"` Configurations []string `yaml:"configurations"` + KubeMetrics []string `yaml:"kube_metrics"` } func (s *Prometheus) Validate() error { @@ -43,5 +44,10 @@ func (s *Prometheus) Validate() error { } s.Configurations = cleanedPaths } + + if len(s.KubeMetrics) == 0 { + return errors.New("no KubeMetrics provided") + } + return nil } diff --git a/pkg/config/prometheus_test.go b/pkg/config/prometheus_test.go index efabb40..b809887 100644 --- a/pkg/config/prometheus_test.go +++ b/pkg/config/prometheus_test.go @@ -23,6 +23,7 @@ func TestPrometheus_Validate(t *testing.T) { prom: config.Prometheus{ KubeStateMetricsServiceEndpoint: kmsServiceEndpoint, Configurations: []string{scrapeConfigFile}, + KubeMetrics: []string{"kube_node_info", "kube_pod_info"}, }, expected: nil, }, @@ -30,6 +31,7 @@ func TestPrometheus_Validate(t *testing.T) { name: "MissingKubeStateMetricsServiceEndpoint", prom: config.Prometheus{ Configurations: []string{scrapeConfigFile}, + KubeMetrics: []string{"kube_node_info", "kube_pod_info"}, }, expected: errors.New(config.ErrNoKubeStateMetricsServiceEndpointMsg), }, @@ -37,9 +39,18 @@ func TestPrometheus_Validate(t *testing.T) { name: "MissingScrapeConfigLocation", prom: config.Prometheus{ KubeStateMetricsServiceEndpoint: kmsServiceEndpoint, + KubeMetrics: []string{"kube_node_info", "kube_pod_info"}, }, expected: nil, }, + { + name: "MissingKubeMetrics", + prom: config.Prometheus{ + KubeStateMetricsServiceEndpoint: kmsServiceEndpoint, + Configurations: []string{scrapeConfigFile}, + }, + expected: errors.New("no KubeMetrics provided"), + }, } for _, tt := range tests { diff --git a/pkg/config/testdata/cloudzero-agent-validator.yml b/pkg/config/testdata/cloudzero-agent-validator.yml index 26053b3..181323f 100644 --- a/pkg/config/testdata/cloudzero-agent-validator.yml +++ b/pkg/config/testdata/cloudzero-agent-validator.yml @@ -18,6 +18,14 @@ cloudzero: prometheus: kube_state_metrics_service_endpoint: http://kube-state-metrics:8080 + kube_metrics: + - kube_node_info + - kube_node_status_capacity + - kube_pod_container_resource_limits + - kube_pod_container_resource_requests + - kube_pod_labels + - kube_pod_info + - node_dmi_info configurations: - prometheus.yml From e68ec94763e85ee696fa34e48d63c180ea237fd1 Mon Sep 17 00:00:00 2001 From: Brian Drennan Date: Tue, 3 Dec 2024 12:29:44 -0800 Subject: [PATCH 2/3] use metric list during check --- pkg/diagnostic/kms/check.go | 2 +- pkg/diagnostic/kms/check_test.go | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/diagnostic/kms/check.go b/pkg/diagnostic/kms/check.go index 6a31077..d7e2826 100644 --- a/pkg/diagnostic/kms/check.go +++ b/pkg/diagnostic/kms/check.go @@ -96,7 +96,7 @@ func (c *checker) Check(_ context.Context, client *http.Client, accessor status. } metrics := string(body) - requiredMetrics := []string{"kube_pod_info", "kube_node_info"} // Add the required metrics here + requiredMetrics := c.cfg.Prometheus.KubeMetrics // Use the required metrics from the configuration for _, metric := range requiredMetrics { if !strings.Contains(metrics, metric) { c.logger.Errorf("Required metric %s not found on attempt %d", metric, attempt) diff --git a/pkg/diagnostic/kms/check_test.go b/pkg/diagnostic/kms/check_test.go index ae8be6a..2ca6bc1 100644 --- a/pkg/diagnostic/kms/check_test.go +++ b/pkg/diagnostic/kms/check_test.go @@ -53,6 +53,7 @@ func TestChecker_CheckOK(t *testing.T) { cfg := &config.Settings{ Prometheus: config.Prometheus{ KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info"}, }, } clientset := fake.NewSimpleClientset() @@ -80,6 +81,7 @@ func TestChecker_CheckRetry(t *testing.T) { cfg := &config.Settings{ Prometheus: config.Prometheus{ KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info"}, }, } clientset := fake.NewSimpleClientset() @@ -114,6 +116,7 @@ func TestChecker_CheckRetryFailure(t *testing.T) { cfg := &config.Settings{ Prometheus: config.Prometheus{ KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info"}, }, } clientset := fake.NewSimpleClientset() @@ -147,6 +150,7 @@ func TestChecker_CheckMetricsValidation(t *testing.T) { cfg := &config.Settings{ Prometheus: config.Prometheus{ KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info"}, }, } clientset := fake.NewSimpleClientset() @@ -174,6 +178,7 @@ func TestChecker_CheckHandles500Error(t *testing.T) { cfg := &config.Settings{ Prometheus: config.Prometheus{ KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info"}, }, } clientset := fake.NewSimpleClientset() From 59de324ea9e4488dd12280a3cb5a4575933b5336 Mon Sep 17 00:00:00 2001 From: Brian Drennan Date: Tue, 3 Dec 2024 13:42:37 -0800 Subject: [PATCH 3/3] better handling of missing metrics --- pkg/diagnostic/kms/check.go | 15 ++++++++---- pkg/diagnostic/kms/check_test.go | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/pkg/diagnostic/kms/check.go b/pkg/diagnostic/kms/check.go index d7e2826..f1580c3 100644 --- a/pkg/diagnostic/kms/check.go +++ b/pkg/diagnostic/kms/check.go @@ -96,17 +96,22 @@ func (c *checker) Check(_ context.Context, client *http.Client, accessor status. } metrics := string(body) - requiredMetrics := c.cfg.Prometheus.KubeMetrics // Use the required metrics from the configuration - for _, metric := range requiredMetrics { + allMetricsFound := true + for _, metric := range c.cfg.Prometheus.KubeMetrics { if !strings.Contains(metrics, metric) { c.logger.Errorf("Required metric %s not found on attempt %d", metric, attempt) accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Required metric %s not found", metric)}) - return nil + allMetricsFound = false } } - accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true}) - return nil + if allMetricsFound { + accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true}) + return nil + } + + retriesRemaining-- + time.Sleep(RetryInterval) } accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Failed to fetch metrics after %d retries", MaxRetry)}) diff --git a/pkg/diagnostic/kms/check_test.go b/pkg/diagnostic/kms/check_test.go index 2ca6bc1..719bfc9 100644 --- a/pkg/diagnostic/kms/check_test.go +++ b/pkg/diagnostic/kms/check_test.go @@ -3,6 +3,7 @@ package kms_test import ( "context" "net/http" + "strings" "testing" "time" @@ -201,3 +202,42 @@ func TestChecker_CheckHandles500Error(t *testing.T) { } }) } + +func TestChecker_CheckMissingMetrics(t *testing.T) { + cfg := &config.Settings{ + Prometheus: config.Prometheus{ + KubeStateMetricsServiceEndpoint: mockURL, + KubeMetrics: []string{"kube_pod_info", "kube_node_info", "missing_metric"}, + }, + } + clientset := fake.NewSimpleClientset() + createMockEndpoints(clientset) + provider := kms.NewProvider(context.Background(), cfg, clientset) + + mock := test.NewHTTPMock() + mock.Expect(http.MethodGet, "kube_pod_info\nkube_node_info\n", http.StatusOK, nil) + client := mock.HTTPClient() + + accessor := makeReport() + + err := provider.Check(context.Background(), client, accessor) + assert.NoError(t, err) + + accessor.ReadFromReport(func(s *status.ClusterStatus) { + assert.Len(t, s.Checks, 2) + foundMissingMetricError := false + foundRetryError := false + for _, c := range s.Checks { + t.Logf("Check: %+v", c) + assert.False(t, c.Passing) + if strings.Contains(c.Error, "Required metric missing_metric not found") { + foundMissingMetricError = true + } + if strings.Contains(c.Error, "Failed to fetch metrics after 3 retries") { + foundRetryError = true + } + } + assert.True(t, foundMissingMetricError, "Expected error for missing metric not found") + assert.True(t, foundRetryError, "Expected error for failed retries not found") + }) +}