Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CP-22073: Remove Service Discovery for Node Exporter #36

Merged
merged 6 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ The `prometheus` section configures Prometheus settings.
| Key | Description | Required | Default Values |
|------------------------------------------|-------------------|----------|----------------|
| kube_state_metrics_service_endpoint | The endpoint for kube-state-metrics service | Mandatory | |
| prometheus_node_exporter_service_endpoint| The endpoint for node-exporter service | Mandatory | |
| configurations | List of one or more configuration file locations for Prometheus to validate | Mandatory | |

## Diagnostics
Expand All @@ -76,7 +75,6 @@ The following table describes the available checkers:
| `k8s_version` | Checks the Kubernetes compatibility |
| `egress_reachable` | Checks that the pod can communicate with the CloudZero API |
| `kube_state_metrics_reachable` | Checks the kubernetes state metrics service is reachable |
| `node_exporter_reachable` | Checks the prometheus node exporter service is reachable |
| `scrape_cfg` | Checks the prometheus configurations exist and contain the necessary scrape configuration |

## Example
Expand Down
8 changes: 6 additions & 2 deletions pkg/cmd/config/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type ScrapeConfigData struct {
ClusterName string
CloudAccountID string
Region string
Host string
}

func NewCommand(ctx context.Context) *cli.Command {
Expand All @@ -46,28 +47,31 @@ func NewCommand(ctx context.Context) *cli.Command {
&cli.StringFlag{Name: "namespace", Usage: "namespace of the cloudzero-agent pod", Required: true},
&cli.StringFlag{Name: "configmap", Usage: "name of the ConfigMap", Required: true},
&cli.StringFlag{Name: "pod", Usage: "name of the cloudzero-agent pod", Required: true},
&cli.StringFlag{Name: "host", Usage: "host for the prometheus remote write endpoint", Required: true},
},
Action: func(c *cli.Context) error {
kubeconfigPath := c.String("kubeconfig")
namespace := c.String("namespace")
configMapName := c.String("configmap")
host := c.String("host")

clientset, err := k8s.BuildKubeClient(kubeconfigPath)
if err != nil {
return err
}

kubeStateMetricsURL, nodeExporterURL, err := k8s.GetServiceURLs(ctx, clientset, namespace)
kubeStateMetricsURL, err := k8s.GetKubeStateMetricsURL(ctx, clientset, namespace)
if err != nil {
return err
}

targets := []string{kubeStateMetricsURL, nodeExporterURL}
targets := []string{kubeStateMetricsURL}
scrapeConfigData := ScrapeConfigData{
Targets: targets,
ClusterName: c.String(config.FlagClusterName),
CloudAccountID: c.String(config.FlagAccountID),
Region: c.String(config.FlagRegion),
Host: host,
}

configContent, err := Generate(scrapeConfigData)
Expand Down
20 changes: 5 additions & 15 deletions pkg/cmd/config/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,31 +31,21 @@ func TestGenerate(t *testing.T) {
},
},
},
&corev1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: "node-exporter",
Namespace: namespace,
},
Spec: corev1.ServiceSpec{
Ports: []corev1.ServicePort{
{Port: 9100},
},
},
},
)

ctx, _ := context.WithCancel(context.Background())

// Fetch service URLs
kubeStateMetricsURL, nodeExporterURL, err := k8s.GetServiceURLs(ctx, clientset, namespace)
// Fetch the Kube State Metrics URL
kubeStateMetricsURL, err := k8s.GetKubeStateMetricsURL(ctx, clientset, namespace)
assert.NoError(t, err)

// Define the scrape config data
scrapeConfigData := config.ScrapeConfigData{
Targets: []string{kubeStateMetricsURL, nodeExporterURL},
Targets: []string{kubeStateMetricsURL},
ClusterName: "test-cluster",
CloudAccountID: "123456789",
Region: "us-west-2",
Host: "test-host",
}

// Generate the configuration content
Expand All @@ -65,10 +55,10 @@ func TestGenerate(t *testing.T) {

// Validate the dynamically populated values
assert.Contains(t, configContent, kubeStateMetricsURL)
assert.Contains(t, configContent, nodeExporterURL)
assert.Contains(t, configContent, "cluster_name=test-cluster")
assert.Contains(t, configContent, "cloud_account_id=123456789")
assert.Contains(t, configContent, "region=us-west-2")
assert.Contains(t, configContent, "test-host")

// Define the ConfigMap data
configMapData := map[string]string{
Expand Down
6 changes: 3 additions & 3 deletions pkg/cmd/config/internal/scrape_config.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ scrape_configs:
metric_relabel_configs:
- source_labels: [__name__]
separator: ;
regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|node_dmi_info)$
regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info)$
replacement: $1
action: keep
- separator: ;
Expand All @@ -117,12 +117,12 @@ scrape_configs:
- {{ . }}
{{- end }}
remote_write:
- url: https://api.cloudzero.com/v1/container-metrics?cluster_name={{ .ClusterName }}&cloud_account_id={{ .CloudAccountID }}&region={{ .Region }}
- url: https://{{ .Host }}/v1/container-metrics?cluster_name={{ .ClusterName }}&cloud_account_id={{ .CloudAccountID }}&region={{ .Region }}
remote_timeout: 30s
write_relabel_configs:
- source_labels: [__name__]
separator: ;
regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|node_dmi_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$
regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$
replacement: $1
action: keep
authorization:
Expand Down
2 changes: 0 additions & 2 deletions pkg/cmd/config/internal/template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ cloudzero:

prometheus:
kube_state_metrics_service_endpoint: {{ .KubeStateMetricsURL }}
prometheus_node_exporter_service_endpoint: {{ .PromNodeExporterURL }}
configurations:
- /etc/config/prometheus/configmaps/prometheus.yml

Expand All @@ -33,7 +32,6 @@ diagnostics:
checks:
- k8s_version
- kube_state_metrics_reachable
- node_exporter_reachable
- scrape_cfg
- name: pre-stop
enforce: false
Expand Down
3 changes: 1 addition & 2 deletions pkg/config/diagnostics.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ const (
DiagnosticK8sVersion string = "k8s_version"
DiagnosticEgressAccess string = "egress_reachable"
DiagnosticKMS string = "kube_state_metrics_reachable"
DiagnosticNodeExporter string = "node_exporter_reachable"
DiagnosticPrometheusVersion string = "prometheus_version"
DiagnosticScrapeConfig string = "scrape_cfg"
)
Expand All @@ -27,7 +26,7 @@ func IsValidDiagnostic(d string) bool {
d = strings.ToLower(strings.TrimSpace(d))
switch d {
case DiagnosticAPIKey, DiagnosticK8sVersion, DiagnosticEgressAccess,
DiagnosticKMS, DiagnosticNodeExporter, DiagnosticScrapeConfig,
DiagnosticKMS, DiagnosticScrapeConfig,
DiagnosticPrometheusVersion:
return true
}
Expand Down
5 changes: 0 additions & 5 deletions pkg/config/diagnostics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,6 @@ func TestDiagnostics_IsValidDiagnostics(t *testing.T) {
diagnostic: config.DiagnosticKMS,
expected: true,
},
{
name: "DiagnosticNodeExporter",
diagnostic: config.DiagnosticNodeExporter,
expected: true,
},
{
name: "DiagnosticScrapeConfig",
diagnostic: config.DiagnosticScrapeConfig,
Expand Down
25 changes: 12 additions & 13 deletions pkg/config/error.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
package config

const (
ErrNoLifecycleStageMsg = "missing Lifecycle Stage"
ErrNoEnvFileMsg = "missing Env File"
ErrNoKeyFileMsg = "missing Key File"
ErrNoCloudZeroHostMsg = "missing CloudZero Host"
ErrNoAccountIDMsg = "missing AWS Account ID"
ErrNoClusterNameMsg = "missing Cluster Name"
ErrNoRegionMsg = "missing AWS Region"
ErrNoSecretFilePathMsg = "missing Secret File"
ErrNoAgentVersionMsg = "missing Agent Version"
ErrNoChartVersionMsg = "missing Chart Version"
ErrNoScrapeConfigLocationMsg = "missing Scrape Config Location"
ErrNoKubeStateMetricsServiceEndpointMsg = "missing Kube State Metrics Service Endpoint"
ErrNoPrometheusNodeExporterServiceEndpointMsg = "missing Prometheus Node Exporter Service Endpoint"
ErrNoLifecycleStageMsg = "missing Lifecycle Stage"
ErrNoEnvFileMsg = "missing Env File"
ErrNoKeyFileMsg = "missing Key File"
ErrNoCloudZeroHostMsg = "missing CloudZero Host"
ErrNoAccountIDMsg = "missing AWS Account ID"
ErrNoClusterNameMsg = "missing Cluster Name"
ErrNoRegionMsg = "missing AWS Region"
ErrNoSecretFilePathMsg = "missing Secret File"
ErrNoAgentVersionMsg = "missing Agent Version"
ErrNoChartVersionMsg = "missing Chart Version"
ErrNoScrapeConfigLocationMsg = "missing Scrape Config Location"
ErrNoKubeStateMetricsServiceEndpointMsg = "missing Kube State Metrics Service Endpoint"
)
14 changes: 3 additions & 11 deletions pkg/config/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ import (
)

type Prometheus struct {
Executable string `yaml:"executable" default:"/bin/prometheus" env:"PROMETHEUS_EXECUTABLE" env-description:"Prometheus Executable Path"`
KubeStateMetricsServiceEndpoint string `yaml:"kube_state_metrics_service_endpoint" env:"KMS_EP_URL" required:"true" env-description:"Kube State Metrics Service Endpoint"`
PrometheusNodeExporterServiceEndpoint string `yaml:"prometheus_node_exporter_service_endpoint" env:"NODE_EXPORTER_EP_URL" required:"true" env-description:"Prometheus Node Exporter Service Endpoint"`
Configurations []string `yaml:"configurations"`
Executable string `yaml:"executable" default:"/bin/prometheus" env:"PROMETHEUS_EXECUTABLE" env-description:"Prometheus Executable Path"`
KubeStateMetricsServiceEndpoint string `yaml:"kube_state_metrics_service_endpoint" env:"KMS_EP_URL" required:"true" env-description:"Kube State Metrics Service Endpoint"`
Configurations []string `yaml:"configurations"`
}

func (s *Prometheus) Validate() error {
Expand All @@ -22,13 +21,6 @@ func (s *Prometheus) Validate() error {
return fmt.Errorf("invalid %s", s.KubeStateMetricsServiceEndpoint)
}

if s.PrometheusNodeExporterServiceEndpoint == "" {
return errors.New(ErrNoPrometheusNodeExporterServiceEndpointMsg)
}
if !isValidURL(s.PrometheusNodeExporterServiceEndpoint) {
return fmt.Errorf("URL format invalid: %s", s.PrometheusNodeExporterServiceEndpoint)
}

if len(s.Configurations) == 0 {
s.Configurations = []string{
"/etc/prometheus/prometheus.yml",
Expand Down
19 changes: 4 additions & 15 deletions pkg/config/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,22 @@ func TestPrometheus_Validate(t *testing.T) {
{
name: "ValidPrometheus",
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
PrometheusNodeExporterServiceEndpoint: promNodeExporterServiceEndpoint,
Configurations: []string{scrapeConfigFile},
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
Configurations: []string{scrapeConfigFile},
},
expected: nil,
},
{
name: "MissingKubeStateMetricsServiceEndpoint",
prom: config.Prometheus{
PrometheusNodeExporterServiceEndpoint: promNodeExporterServiceEndpoint,
Configurations: []string{scrapeConfigFile},
Configurations: []string{scrapeConfigFile},
},
expected: errors.New(config.ErrNoKubeStateMetricsServiceEndpointMsg),
},
{
name: "MissingPrometheusNodeExporterServiceEndpoint",
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
Configurations: []string{scrapeConfigFile},
},
expected: errors.New(config.ErrNoPrometheusNodeExporterServiceEndpointMsg),
},
{
name: "MissingScrapeConfigLocation",
prom: config.Prometheus{
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
PrometheusNodeExporterServiceEndpoint: promNodeExporterServiceEndpoint,
KubeStateMetricsServiceEndpoint: kmsServiceEndpoint,
},
expected: nil,
},
Expand Down
4 changes: 1 addition & 3 deletions pkg/config/settings_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ const (
apiHost = "https://api.cloudzero.com"
apiKey = "my-cloudzero-token"

kmsServiceEndpoint = "http://kube-state-metrics:8080"
promNodeExporterServiceEndpoint = "http://node-exporter:8080"
kmsServiceEndpoint = "http://kube-state-metrics:8080"
)

func TestSettings_NewSettings(t *testing.T) {
Expand Down Expand Up @@ -50,7 +49,6 @@ func TestSettings_NewSettings(t *testing.T) {

// verify Prometheus
assert.Equal(t, kmsServiceEndpoint, settings.Prometheus.KubeStateMetricsServiceEndpoint)
assert.Equal(t, promNodeExporterServiceEndpoint, settings.Prometheus.PrometheusNodeExporterServiceEndpoint)
assert.Equal(t, []string{"prometheus.yml"}, settings.Prometheus.Configurations)

// verify Diagnostics
Expand Down
2 changes: 0 additions & 2 deletions pkg/config/testdata/cloudzero-agent-validator.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ cloudzero:

prometheus:
kube_state_metrics_service_endpoint: http://kube-state-metrics:8080
prometheus_node_exporter_service_endpoint: http://node-exporter:8080
configurations:
- prometheus.yml

Expand All @@ -35,7 +34,6 @@ diagnostics:
- k8s_version
- egress_reachable
- kube_state_metrics_reachable
- node_exporter_reachable
- scrape_cfg
- name: pre-stop
enforce: false
Expand Down
1 change: 0 additions & 1 deletion pkg/config/testdata/file.env
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,3 @@ LOG_LEVEL=info
LOG_LOCATION=cloudzero-agent-validator.log

KMS_EP_URL='http://cloudzero-agent-kube-state-metrics:8080/'
NODE_EXPORTER_EP_URL='http://node-exporter.monitoring.svc.cluster.local:9100/'
2 changes: 0 additions & 2 deletions pkg/diagnostic/catalog/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/egress"
"github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/k8s"
"github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/kms"
"github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/pne"
promcfg "github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/prom/config"
promver "github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/prom/version"
"github.com/cloudzero/cloudzero-agent-validator/pkg/diagnostic/stage"
Expand Down Expand Up @@ -42,7 +41,6 @@ func NewCatalog(ctx context.Context, c *config.Settings) Registry {
r.add(config.DiagnosticEgressAccess, false, egress.NewProvider(ctx, c))
r.add(config.DiagnosticK8sVersion, false, k8s.NewProvider(ctx, c))
r.add(config.DiagnosticKMS, false, kms.NewProvider(ctx, c))
r.add(config.DiagnosticNodeExporter, false, pne.NewProvider(ctx, c))
r.add(config.DiagnosticScrapeConfig, false, promcfg.NewProvider(ctx, c))
r.add(config.DiagnosticPrometheusVersion, false, promver.NewProvider(ctx, c))

Expand Down
2 changes: 1 addition & 1 deletion pkg/diagnostic/catalog/catalog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@ func TestRegistry_List(t *testing.T) {

// Test listing providers
providers := r.List()
assert.Len(t, providers, 7)
assert.Len(t, providers, 6) // Update the expected length to 6
}
Loading