Add OpenCost to the ClusterMetrics feature

Signed-off-by: Pete Wall <pete.wall@grafana.com>
grafana · Nov 19, 2024 · c5af60a · c5af60a
1 parent f439741
commit c5af60a
Show file tree

Hide file tree

Showing 52 changed files with 1,772 additions and 42 deletions.
diff --git a/charts/feature-cluster-metrics/.updatecli-opencost.yaml b/charts/feature-cluster-metrics/.updatecli-opencost.yaml
@@ -0,0 +1,31 @@
+---
+name: 'Update dependency "opencost" for Helm chart "feature-cluster-metrics"'
+sources:
+  opencost:
+    kind: helmchart
+    name: 'Get latest "opencost" Helm chart version'
+    spec:
+      url: 'https://opencost.github.io/opencost-helm-chart'
+      name: opencost
+      versionfilter:
+        pattern: '*'  # quoted: a bare * would be read as a YAML alias
+        kind: semver
+conditions:
+  opencost:
+    kind: yaml
+    name: 'Ensure Helm chart dependency "opencost" is specified'
+    disablesourceinput: true
+    spec:
+      file: charts/feature-cluster-metrics/Chart.yaml
+      key: '$.dependencies[4].name'  # positional index; must track the entry order in Chart.yaml
+      value: opencost
+targets:
+  opencost:
+    kind: helmchart
+    name: 'Bump Helm chart dependency "opencost" for Helm chart "feature-cluster-metrics"'
+    sourceid: opencost
+    spec:
+      name: charts/feature-cluster-metrics
+      file: Chart.yaml
+      key: '$.dependencies[4].version'  # same positional index as the condition above
+      versionincrement: none
diff --git a/charts/feature-cluster-metrics/Chart.lock b/charts/feature-cluster-metrics/Chart.lock
@@ -11,5 +11,8 @@ dependencies:
 - name: kepler
   repository: https://sustainable-computing-io.github.io/kepler-helm-chart
   version: 0.5.11
-digest: sha256:392cafa6f737f6aa0129baeae4a8b27e1e584356743d36a5b3f1b6f2b7d1dd14
-generated: "2024-11-10T00:23:03.998641999Z"
+- name: opencost
+  repository: https://opencost.github.io/opencost-helm-chart
+  version: 1.42.3
+digest: sha256:0c7faa125a828d7fb88f7d6814e88fe6b014c71d6dde252587b9355b80dd92ec
+generated: "2024-11-18T13:23:00.779741-06:00"
diff --git a/charts/feature-cluster-metrics/Chart.yaml b/charts/feature-cluster-metrics/Chart.yaml
@@ -32,3 +32,8 @@ dependencies:
     version: 0.5.11
     repository: https://sustainable-computing-io.github.io/kepler-helm-chart
     condition: kepler.enabled
+
+  - name: opencost
+    version: 1.42.3
+    repository: https://opencost.github.io/opencost-helm-chart
+    condition: opencost.enabled
diff --git a/charts/feature-cluster-metrics/README.md b/charts/feature-cluster-metrics/README.md
@@ -153,6 +153,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 
 | Repository | Name | Version |
 |------------|------|---------|
+| https://opencost.github.io/opencost-helm-chart | opencost | 1.42.3 |
 | https://prometheus-community.github.io/helm-charts | kube-state-metrics | 5.27.0 |
 | https://prometheus-community.github.io/helm-charts | node-exporter(prometheus-node-exporter) | 4.42.0 |
 | https://prometheus-community.github.io/helm-charts | windows-exporter(prometheus-windows-exporter) | 0.7.1 |
@@ -189,7 +190,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 | cadvisor.metricsTuning.keepPhysicalFilesystemDevices | list | `["mmcblk.p.+","nvme.+","rbd.+","sd.+","vd.+","xvd.+","dasd.+"]` | Only keep filesystem metrics that use the following physical devices |
 | cadvisor.metricsTuning.keepPhysicalNetworkDevices | list | `["en[ospx][0-9].*","wlan[0-9].*","eth[0-9].*"]` | Only keep network metrics that use the following physical devices |
 | cadvisor.metricsTuning.normalizeUnnecessaryLabels | list | `[{"labels":["boot_id","system_uuid"],"metric":"machine_memory_bytes"}]` | Normalize labels to the same value for the given metric and label pairs |
-| cadvisor.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from cAdvisor to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| cadvisor.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from cAdvisor to the minimal set required for Kubernetes Monitoring. |
 
 ### cadvisor
 
@@ -231,7 +232,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 | kepler.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for Kepler. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
 | kepler.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | kepler.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| kepler.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kepler to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| kepler.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kepler to the minimal set required for Kubernetes Monitoring. |
 | kepler.scrapeInterval | string | `60s` | How frequently to scrape metrics from Kepler. Overrides global.scrapeInterval. |
 
 ### kube-state-metrics
@@ -247,7 +248,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 | kube-state-metrics.metricLabelsAllowlist | list | `["nodes=[agentpool,alpha.eksctl.io/cluster-name,alpha.eksctl.io/nodegroup-name,beta.kubernetes.io/instance-type,cloud.google.com/gke-nodepool,cluster_name,ec2_amazonaws_com_Name,ec2_amazonaws_com_aws_autoscaling_groupName,ec2_amazonaws_com_aws_autoscaling_group_name,ec2_amazonaws_com_name,eks_amazonaws_com_nodegroup,k8s_io_cloud_provider_aws,karpenter.sh/nodepool,kubernetes.azure.com/cluster,kubernetes.io/arch,kubernetes.io/hostname,kubernetes.io/os,node.kubernetes.io/instance-type,topology.kubernetes.io/region,topology.kubernetes.io/zone]"]` | `kube_<resource>_labels` metrics to generate. The default is to include a useful set for Node labels. |
 | kube-state-metrics.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | kube-state-metrics.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| kube-state-metrics.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kube State Metrics to a useful, minimal set. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| kube-state-metrics.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kube State Metrics to a useful, minimal set. |
 | kube-state-metrics.scrapeInterval | string | `60s` | How frequently to scrape kube-state-metrics metrics. |
 
 ### Kube Controller Manager
@@ -311,7 +312,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 | kubelet.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
 | kubelet.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | kubelet.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. |
 | kubelet.scrapeInterval | string | `60s` | How frequently to scrape Kubelet metrics. |
 
 ### Kubelet Resources
@@ -324,7 +325,7 @@ Actual integration testing in a live environment should be done in the main [k8s
 | kubeletResource.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
 | kubeletResource.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | kubeletResource.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of resources metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of resources metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. |
 | kubeletResource.scrapeInterval | string | `60s` | How frequently to scrape Kubelet Resource metrics. |
 
 ### Node Exporter - Deployment settings
@@ -345,10 +346,29 @@ Actual integration testing in a live environment should be done in the main [k8s
 | node-exporter.metricsTuning.dropMetricsForFilesystem | list | `["tempfs"]` | Drop metrics for the given filesystem types |
 | node-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | node-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| node-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| node-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring. |
 | node-exporter.metricsTuning.useIntegrationAllowList | bool | `false` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring as well as the Node Exporter integration. |
 | node-exporter.scrapeInterval | string | `60s` | How frequently to scrape Node Exporter metrics. |
 
+### OpenCost
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| opencost.enabled | bool | `false` | Deploy and scrape OpenCost. |
+| opencost.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for OpenCost. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with __ (i.e. __meta_kubernetes*) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery/discovery.relabel/#rule-block)) |
+| opencost.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for OpenCost. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#rule-block)) These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no __meta* labels are present. |
+| opencost.labelMatchers | object | `{"app.kubernetes.io/name":"opencost"}` | Label matchers used to select the OpenCost pods |
+| opencost.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for OpenCost. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
+| opencost.metricsSource | string | `""` | The name of the metric destination where OpenCost will query for required metrics. Setting this will enable guided setup for required OpenCost parameters. To skip guided setup, set this to "custom". |
+| opencost.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
+| opencost.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
+| opencost.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from OpenCost to the minimal set required for Kubernetes Monitoring. |
+| opencost.opencost.prometheus.existingSecretName | string | `""` | The name of the secret containing the username and password for the metrics service. This must be in the same namespace as the OpenCost deployment. |
+| opencost.opencost.prometheus.external.url | string | `""` | The URL for Prometheus queries. It should match externalServices.prometheus.host + "/api/prom" |
+| opencost.opencost.prometheus.password_key | string | `"password"` | The key for the password property in the secret. |
+| opencost.opencost.prometheus.username_key | string | `"username"` | The key for the username property in the secret. |
+| opencost.scrapeInterval | string | `60s` | How frequently to scrape metrics from OpenCost. Overrides global.scrapeInterval. |
+
 ### Windows Exporter - Deployment settings
 
 | Key | Type | Default | Description |
@@ -366,6 +386,6 @@ Actual integration testing in a live environment should be done in the main [k8s
 | windows-exporter.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
 | windows-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
 | windows-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
-| windows-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Windows Exporter to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
+| windows-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Windows Exporter to the minimal set required for Kubernetes Monitoring. |
 | windows-exporter.scrapeInterval | string | `60s` | How frequently to scrape metrics from Windows Exporter. |
 <!-- markdownlint-enable no-space-in-emphasis -->
diff --git a/charts/feature-cluster-metrics/charts/opencost-1.42.3.tgz b/charts/feature-cluster-metrics/charts/opencost-1.42.3.tgz
diff --git a/charts/feature-cluster-metrics/templates/_module.alloy.tpl b/charts/feature-cluster-metrics/templates/_module.alloy.tpl
@@ -26,6 +26,7 @@ declare "cluster_metrics" {
   {{- include "feature.clusterMetrics.node_exporter.alloy" . | indent 2 }}
   {{- include "feature.clusterMetrics.windows_exporter.alloy" . | indent 2 }}
   {{- include "feature.clusterMetrics.kepler.alloy" . | indent 2 }}
+  {{- include "feature.clusterMetrics.opencost.alloy" . | indent 2 }}
 }
 {{- end -}}
 

diff --git a/charts/feature-cluster-metrics/templates/_opencost.alloy.tpl b/charts/feature-cluster-metrics/templates/_opencost.alloy.tpl
@@ -0,0 +1,77 @@
+{{- define "feature.clusterMetrics.opencost.allowList" }}{{/* YAML list of metric names to keep: the bundled default allow list and/or opencost.metricsTuning.includeMetrics. Whitespace is trimmed so the result is an empty (falsy) string when neither applies. */}}
+{{- if .Values.opencost.metricsTuning.useDefaultAllowList }}
+{{ "default-allow-lists/opencost.yaml" | .Files.Get }}
+{{- end }}
+{{- if .Values.opencost.metricsTuning.includeMetrics }}
+{{ .Values.opencost.metricsTuning.includeMetrics | toYaml }}
+{{- end }}
+{{- end }}
+
+{{- define "feature.clusterMetrics.opencost.alloy" }}{{/* Alloy components that discover, scrape, and filter OpenCost metrics; renders nothing unless opencost.enabled is set. */}}
+{{- if .Values.opencost.enabled }}
+{{- $metricAllowList := include "feature.clusterMetrics.opencost.allowList" . }}{{/* YAML text; decoded with fromYamlArray where the keep rule is built */}}
+{{- $metricDenyList := .Values.opencost.metricsTuning.excludeMetrics }}
+{{- $labelSelectors := list }}{{/* collects one "key=value" entry per opencost.labelMatchers pair */}}
+{{- range $k, $v := .Values.opencost.labelMatchers }}
+{{- $labelSelectors = append $labelSelectors (printf "%s=%s" $k $v) }}
+{{- end }}
+
+discovery.kubernetes "opencost" {
+  role = "pod"
+  namespaces {
+    own_namespace = true
+  }
+  selectors {
+    role = "pod"
+    label = {{ $labelSelectors | join "," | quote }}
+  }
+}
+
+discovery.relabel "opencost" {
+  targets = discovery.kubernetes.opencost.targets
+  rule {
+    source_labels = ["__meta_kubernetes_pod_node_name"]
+    action = "replace"
+    target_label = "instance"
+  }
+{{- if .Values.opencost.extraDiscoveryRules }}{{/* user-supplied rule blocks, emitted after the built-in instance rule */}}
+{{ .Values.opencost.extraDiscoveryRules | indent 2 }}
+{{- end }}
+}
+
+prometheus.scrape "opencost" {
+  targets      = discovery.relabel.opencost.output
+  job_name     = "integrations/opencost"
+  honor_labels = true
+  scrape_interval = {{ .Values.opencost.scrapeInterval | default .Values.global.scrapeInterval | quote }}
+  clustering {
+    enabled = true
+  }
+{{- if or $metricAllowList $metricDenyList .Values.opencost.extraMetricProcessingRules }}{{/* with any filtering configured, the scrape block is closed here and feeds prometheus.relabel; otherwise it stays open and is closed by the final forward_to below */}}
+  forward_to = [prometheus.relabel.opencost.receiver]
+}
+
+prometheus.relabel "opencost" {
+  max_cache_size = {{ .Values.opencost.maxCacheSize | default .Values.global.maxCacheSize | int }}
+{{- if $metricAllowList }}
+  rule {
+    source_labels = ["__name__"]
+    regex = "up|{{ $metricAllowList | fromYamlArray | join "|" }}"
+    action = "keep"
+  }
+{{- end }}
+{{- if $metricDenyList }}
+  rule {
+    source_labels = ["__name__"]
+    regex = {{ $metricDenyList | join "|" | quote }}
+    action = "drop"
+  }
+{{- end }}
+{{- if .Values.opencost.extraMetricProcessingRules }}
+{{ .Values.opencost.extraMetricProcessingRules | indent 2 }}
+{{- end }}
+{{- end }}{{/* end of the filtering branch; the two lines below terminate whichever block is still open (prometheus.relabel or prometheus.scrape) */}}
+  forward_to = argument.metrics_destinations.value
+}
+{{- end }}
+{{- end }}