Skip to content

Commit

Permalink
Merge pull request #65 from Cloudzero/cp-23740-3
Browse files Browse the repository at this point in the history
CP-23740: Use Required KSM Metrics From Chart when Validating KSM
  • Loading branch information
bdrennz authored Dec 4, 2024
2 parents 53b74ac + 59de324 commit 035a29b
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 5 deletions.
6 changes: 6 additions & 0 deletions pkg/config/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ type Prometheus struct {
Executable string `yaml:"executable" default:"/bin/prometheus" env:"PROMETHEUS_EXECUTABLE" env-description:"Prometheus Executable Path"`
KubeStateMetricsServiceEndpoint string `yaml:"kube_state_metrics_service_endpoint" env:"KMS_EP_URL" required:"true" env-description:"Kube State Metrics Service Endpoint"`
Configurations []string `yaml:"configurations"`
KubeMetrics []string `yaml:"kube_metrics"`
}

func (s *Prometheus) Validate() error {
Expand Down Expand Up @@ -43,5 +44,10 @@ func (s *Prometheus) Validate() error {
}
s.Configurations = cleanedPaths
}

if len(s.KubeMetrics) == 0 {
return errors.New("no KubeMetrics provided")
}

return nil
}
11 changes: 11 additions & 0 deletions pkg/config/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,34 @@ func TestPrometheus_Validate(t *testing.T) {
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
Configurations: []string{scrapeConfigFile},
KubeMetrics: []string{"kube_node_info", "kube_pod_info"},
},
expected: nil,
},
{
name: "MissingKubeStateMetricsServiceEndpoint",
prom: config.Prometheus{
Configurations: []string{scrapeConfigFile},
KubeMetrics: []string{"kube_node_info", "kube_pod_info"},
},
expected: errors.New(config.ErrNoKubeStateMetricsServiceEndpointMsg),
},
{
name: "MissingScrapeConfigLocation",
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
KubeMetrics: []string{"kube_node_info", "kube_pod_info"},
},
expected: nil,
},
{
name: "MissingKubeMetrics",
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
Configurations: []string{scrapeConfigFile},
},
expected: errors.New("no KubeMetrics provided"),
},
}

for _, tt := range tests {
Expand Down
8 changes: 8 additions & 0 deletions pkg/config/testdata/cloudzero-agent-validator.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ cloudzero:

prometheus:
kube_state_metrics_service_endpoint: http://kube-state-metrics:8080
kube_metrics:
- kube_node_info
- kube_node_status_capacity
- kube_pod_container_resource_limits
- kube_pod_container_resource_requests
- kube_pod_labels
- kube_pod_info
- node_dmi_info
configurations:
- prometheus.yml

Expand Down
15 changes: 10 additions & 5 deletions pkg/diagnostic/kms/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,22 @@ func (c *checker) Check(_ context.Context, client *http.Client, accessor status.
}

metrics := string(body)
requiredMetrics := []string{"kube_pod_info", "kube_node_info"} // Add the required metrics here
for _, metric := range requiredMetrics {
allMetricsFound := true
for _, metric := range c.cfg.Prometheus.KubeMetrics {
if !strings.Contains(metrics, metric) {
c.logger.Errorf("Required metric %s not found on attempt %d", metric, attempt)
accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Required metric %s not found", metric)})
return nil
allMetricsFound = false
}
}

accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true})
return nil
if allMetricsFound {
accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true})
return nil
}

retriesRemaining--
time.Sleep(RetryInterval)
}

accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Failed to fetch metrics after %d retries", MaxRetry)})
Expand Down
45 changes: 45 additions & 0 deletions pkg/diagnostic/kms/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package kms_test
import (
"context"
"net/http"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -53,6 +54,7 @@ func TestChecker_CheckOK(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info"},
},
}
clientset := fake.NewSimpleClientset()
Expand Down Expand Up @@ -80,6 +82,7 @@ func TestChecker_CheckRetry(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info"},
},
}
clientset := fake.NewSimpleClientset()
Expand Down Expand Up @@ -114,6 +117,7 @@ func TestChecker_CheckRetryFailure(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info"},
},
}
clientset := fake.NewSimpleClientset()
Expand Down Expand Up @@ -147,6 +151,7 @@ func TestChecker_CheckMetricsValidation(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info"},
},
}
clientset := fake.NewSimpleClientset()
Expand Down Expand Up @@ -174,6 +179,7 @@ func TestChecker_CheckHandles500Error(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info"},
},
}
clientset := fake.NewSimpleClientset()
Expand All @@ -196,3 +202,42 @@ func TestChecker_CheckHandles500Error(t *testing.T) {
}
})
}

func TestChecker_CheckMissingMetrics(t *testing.T) {
cfg := &config.Settings{
Prometheus: config.Prometheus{
KubeStateMetricsServiceEndpoint: mockURL,
KubeMetrics: []string{"kube_pod_info", "kube_node_info", "missing_metric"},
},
}
clientset := fake.NewSimpleClientset()
createMockEndpoints(clientset)
provider := kms.NewProvider(context.Background(), cfg, clientset)

mock := test.NewHTTPMock()
mock.Expect(http.MethodGet, "kube_pod_info\nkube_node_info\n", http.StatusOK, nil)
client := mock.HTTPClient()

accessor := makeReport()

err := provider.Check(context.Background(), client, accessor)
assert.NoError(t, err)

accessor.ReadFromReport(func(s *status.ClusterStatus) {
assert.Len(t, s.Checks, 2)
foundMissingMetricError := false
foundRetryError := false
for _, c := range s.Checks {
t.Logf("Check: %+v", c)
assert.False(t, c.Passing)
if strings.Contains(c.Error, "Required metric missing_metric not found") {
foundMissingMetricError = true
}
if strings.Contains(c.Error, "Failed to fetch metrics after 3 retries") {
foundRetryError = true
}
}
assert.True(t, foundMissingMetricError, "Expected error for missing metric not found")
assert.True(t, foundRetryError, "Expected error for failed retries not found")
})
}

0 comments on commit 035a29b

Please sign in to comment.