-
Notifications
You must be signed in to change notification settings - Fork 9
CP-23050: Validate CloudZero Metrics are available from Kube State Metrics #61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f84be9a
12cde2a
6d0cbab
2edd13e
1a511fb
66162a0
f94ab36
446b53a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,15 +3,20 @@ package kms | |
| import ( | ||
| "context" | ||
| "fmt" | ||
| "io/ioutil" | ||
| net "net/http" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/cloudzero/cloudzero-agent-validator/pkg/config" | ||
| "github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic" | ||
| "github.com/cloudzero/cloudzero-agent-validator/pkg/http" | ||
| "github.com/cloudzero/cloudzero-agent-validator/pkg/logging" | ||
| "github.com/cloudzero/cloudzero-agent-validator/pkg/status" | ||
| "github.com/sirupsen/logrus" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/client-go/kubernetes" | ||
| "k8s.io/client-go/rest" | ||
| "k8s.io/client-go/tools/clientcmd" | ||
| ) | ||
|
|
||
| const DiagnosticKMS = config.DiagnosticKMS | ||
|
|
@@ -23,46 +28,134 @@ var ( | |
| ) | ||
|
|
||
| type checker struct { | ||
| cfg *config.Settings | ||
| logger *logrus.Entry | ||
| cfg *config.Settings | ||
| logger *logrus.Entry | ||
| clientset kubernetes.Interface | ||
| } | ||
|
|
||
| func NewProvider(ctx context.Context, cfg *config.Settings) diagnostic.Provider { | ||
| func NewProvider(ctx context.Context, cfg *config.Settings, clientset ...kubernetes.Interface) diagnostic.Provider { | ||
| var cs kubernetes.Interface | ||
| if len(clientset) > 0 { | ||
| cs = clientset[0] | ||
| } else { | ||
| // Use the in-cluster config if running inside a cluster, otherwise use the default kubeconfig | ||
| config, err := rest.InClusterConfig() | ||
| if err != nil { | ||
| kubeconfig := clientcmd.RecommendedHomeFile | ||
| config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) | ||
| if err != nil { | ||
| panic(err.Error()) | ||
| } | ||
| } | ||
|
|
||
| // Create the clientset | ||
| cs, err = kubernetes.NewForConfig(config) | ||
| if err != nil { | ||
| panic(err.Error()) | ||
| } | ||
| } | ||
|
|
||
| return &checker{ | ||
| cfg: cfg, | ||
| logger: logging.NewLogger(). | ||
| WithContext(ctx).WithField(logging.OpField, "ksm"), | ||
| clientset: cs, | ||
| } | ||
| } | ||
|
|
||
| func (c *checker) Check(ctx context.Context, client *net.Client, accessor status.Accessor) error { | ||
| func (c *checker) Check(ctx context.Context, client *net.Client, accessor status.Accessor, cfg *config.Settings) error { | ||
| var ( | ||
| err error | ||
| retriesRemaining = MaxRetry | ||
| url = fmt.Sprintf("%s/", c.cfg.Prometheus.KubeStateMetricsServiceEndpoint) | ||
| namespace = "prom-agent" | ||
| serviceName = "cz-prom-agent-kube-state-metrics" | ||
|
Comment on lines
+69
to
+70
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: Need |
||
| endpointURL string | ||
| ) | ||
|
|
||
| // We need to build in a retry here because the kube-state-metrics service can take a few seconds to start up | ||
| // If it is deploying with the cloudzero-agent chart | ||
| for { | ||
| _, err = http.Do(ctx, client, net.MethodGet, nil, nil, url, nil) | ||
| if err == nil { | ||
| break | ||
| // Wait for the pod to become ready and find the first available endpoint | ||
| for retriesRemaining > 0 { | ||
| endpoints, err := c.clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) | ||
| if err != nil { | ||
| c.logger.Errorf("Failed to get service endpoints: %v", err) | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Failed to get service endpoints: %s", err.Error())}) | ||
| return nil | ||
| } | ||
| if retriesRemaining == 0 { | ||
|
|
||
| // Log the endpoints for debugging | ||
| c.logger.Infof("Endpoints: %v", endpoints) | ||
|
|
||
| // Check if there are any ready addresses and find the first available endpoint | ||
| for _, subset := range endpoints.Subsets { | ||
| for _, address := range subset.Addresses { | ||
| c.logger.Infof("Address: %v", address) | ||
| for _, port := range subset.Ports { | ||
| c.logger.Infof("Port: %v", port) | ||
| if port.Port == 8080 { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| endpointURL = fmt.Sprintf("http://%s:%d/metrics", address.IP, port.Port) | ||
| break | ||
| } | ||
| } | ||
| if endpointURL != "" { | ||
| break | ||
| } | ||
| } | ||
| if endpointURL != "" { | ||
| break | ||
| } | ||
| } | ||
|
|
||
| if endpointURL != "" { | ||
| break | ||
| } | ||
|
|
||
| c.logger.Infof("Pod is not ready, waiting...") | ||
| retriesRemaining-- | ||
| time.Sleep(RetryInterval) | ||
| } | ||
|
|
||
| if err != nil { | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: err.Error()}) | ||
| if retriesRemaining == 0 { | ||
| c.logger.Errorf("Pod did not become ready in time") | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: "Pod did not become ready in time"}) | ||
| return nil | ||
| } | ||
|
|
||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true}) | ||
| return nil | ||
| c.logger.Infof("Using endpoint URL: %s", endpointURL) | ||
|
|
||
| // Retry logic to handle transient issues | ||
| retriesRemaining = MaxRetry | ||
| for retriesRemaining > 0 { | ||
| resp, err := client.Get(endpointURL) | ||
| if err == nil && resp.StatusCode == net.StatusOK { | ||
| defer resp.Body.Close() | ||
| body, err := ioutil.ReadAll(resp.Body) | ||
| if err != nil { | ||
| c.logger.Errorf("Failed to read metrics: %v", err) | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Failed to read metrics: %s", err.Error())}) | ||
| return nil | ||
| } | ||
|
|
||
| metrics := string(body) | ||
| requiredMetrics := []string{"kube_pod_info", "kube_node_info"} // Add the required metrics here | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: Pass all Chart metrics via validator config file |
||
| allMetricsFound := true | ||
| for _, metric := range requiredMetrics { | ||
| if !strings.Contains(metrics, metric) { | ||
| c.logger.Errorf("Required metric %s not found", metric) | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Required metric %s not found", metric)}) | ||
| allMetricsFound = false | ||
| } | ||
| } | ||
|
|
||
| if allMetricsFound { | ||
| c.logger.Infof("All required metrics found: %v", requiredMetrics) | ||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: true}) | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| c.logger.Errorf("Failed to fetch metrics: %v", err) | ||
| retriesRemaining-- | ||
| time.Sleep(RetryInterval) | ||
| } | ||
|
|
||
| accessor.AddCheck(&status.StatusCheck{Name: DiagnosticKMS, Passing: false, Error: fmt.Sprintf("Failed to fetch metrics after %d retries", MaxRetry)}) | ||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: Change back after testing