Skip to content

Commit 2042f40

Browse files
committed
Add OpenCost to the ClusterMetrics feature
Signed-off-by: Pete Wall <pete.wall@grafana.com>
1 parent f439741 commit 2042f40

File tree

53 files changed

+1773
-42
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+1773
-42
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
---
2+
name: Update dependency "opencost" for Helm chart "feature-cluster-metrics"
3+
sources:
4+
opencost:
5+
name: Get latest "opencost" Helm chart version
6+
kind: helmchart
7+
spec:
8+
name: opencost
9+
url: https://opencost.github.io/opencost-helm-chart
10+
versionfilter:
11+
kind: semver
12+
pattern: '*'
13+
conditions:
14+
opencost:
15+
name: Ensure Helm chart dependency "opencost" is specified
16+
kind: yaml
17+
spec:
18+
file: charts/feature-cluster-metrics/Chart.yaml
19+
key: $.dependencies[4].name
20+
value: opencost
21+
disablesourceinput: true
22+
targets:
23+
opencost:
24+
name: Bump Helm chart dependency "opencost" for Helm chart "feature-cluster-metrics"
25+
kind: helmchart
26+
spec:
27+
file: Chart.yaml
28+
key: $.dependencies[4].version
29+
name: charts/feature-cluster-metrics
30+
versionincrement: none
31+
sourceid: opencost

charts/feature-cluster-metrics/Chart.lock

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,8 @@ dependencies:
1111
- name: kepler
1212
repository: https://sustainable-computing-io.github.io/kepler-helm-chart
1313
version: 0.5.11
14-
digest: sha256:392cafa6f737f6aa0129baeae4a8b27e1e584356743d36a5b3f1b6f2b7d1dd14
15-
generated: "2024-11-10T00:23:03.998641999Z"
14+
- name: opencost
15+
repository: https://opencost.github.io/opencost-helm-chart
16+
version: 1.42.3
17+
digest: sha256:0c7faa125a828d7fb88f7d6814e88fe6b014c71d6dde252587b9355b80dd92ec
18+
generated: "2024-11-18T13:23:00.779741-06:00"

charts/feature-cluster-metrics/Chart.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,8 @@ dependencies:
3232
version: 0.5.11
3333
repository: https://sustainable-computing-io.github.io/kepler-helm-chart
3434
condition: kepler.enabled
35+
36+
- name: opencost
37+
version: 1.42.3
38+
repository: https://opencost.github.io/opencost-helm-chart
39+
condition: opencost.enabled

charts/feature-cluster-metrics/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ build: README.md Chart.lock values.schema.json $(UPDATECLI_FILES)
3131
test: build
3232
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
3333
helm repo add kepler https://sustainable-computing-io.github.io/kepler-helm-chart
34+
helm repo add opencost https://opencost.github.io/opencost-helm-chart
3435

3536
helm lint .
3637
ct lint --lint-conf ../../.configs/lintconf.yaml --helm-dependency-extra-args=--skip-refresh --charts .

charts/feature-cluster-metrics/README.md

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ Actual integration testing in a live environment should be done in the main [k8s
153153

154154
| Repository | Name | Version |
155155
|------------|------|---------|
156+
| https://opencost.github.io/opencost-helm-chart | opencost | 1.42.3 |
156157
| https://prometheus-community.github.io/helm-charts | kube-state-metrics | 5.27.0 |
157158
| https://prometheus-community.github.io/helm-charts | node-exporter(prometheus-node-exporter) | 4.42.0 |
158159
| https://prometheus-community.github.io/helm-charts | windows-exporter(prometheus-windows-exporter) | 0.7.1 |
@@ -189,7 +190,7 @@ Actual integration testing in a live environment should be done in the main [k8s
189190
| cadvisor.metricsTuning.keepPhysicalFilesystemDevices | list | `["mmcblk.p.+","nvme.+","rbd.+","sd.+","vd.+","xvd.+","dasd.+"]` | Only keep filesystem metrics that use the following physical devices |
190191
| cadvisor.metricsTuning.keepPhysicalNetworkDevices | list | `["en[ospx][0-9].*","wlan[0-9].*","eth[0-9].*"]` | Only keep network metrics that use the following physical devices |
191192
| cadvisor.metricsTuning.normalizeUnnecessaryLabels | list | `[{"labels":["boot_id","system_uuid"],"metric":"machine_memory_bytes"}]` | Normalize labels to the same value for the given metric and label pairs |
192-
| cadvisor.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from cAdvisor to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
193+
| cadvisor.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from cAdvisor to the minimal set required for Kubernetes Monitoring. |
193194

194195
### cadvisor
195196

@@ -231,7 +232,7 @@ Actual integration testing in a live environment should be done in the main [k8s
231232
| kepler.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for Kepler. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
232233
| kepler.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
233234
| kepler.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
234-
| kepler.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kepler to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
235+
| kepler.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kepler to the minimal set required for Kubernetes Monitoring. |
235236
| kepler.scrapeInterval | string | `60s` | How frequently to scrape metrics from Kepler. Overrides global.scrapeInterval. |
236237

237238
### kube-state-metrics
@@ -247,7 +248,7 @@ Actual integration testing in a live environment should be done in the main [k8s
247248
| kube-state-metrics.metricLabelsAllowlist | list | `["nodes=[agentpool,alpha.eksctl.io/cluster-name,alpha.eksctl.io/nodegroup-name,beta.kubernetes.io/instance-type,cloud.google.com/gke-nodepool,cluster_name,ec2_amazonaws_com_Name,ec2_amazonaws_com_aws_autoscaling_groupName,ec2_amazonaws_com_aws_autoscaling_group_name,ec2_amazonaws_com_name,eks_amazonaws_com_nodegroup,k8s_io_cloud_provider_aws,karpenter.sh/nodepool,kubernetes.azure.com/cluster,kubernetes.io/arch,kubernetes.io/hostname,kubernetes.io/os,node.kubernetes.io/instance-type,topology.kubernetes.io/region,topology.kubernetes.io/zone]"]` | `kube_<resource>_labels` metrics to generate. The default is to include a useful set for Node labels. |
248249
| kube-state-metrics.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
249250
| kube-state-metrics.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
250-
| kube-state-metrics.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kube State Metrics to a useful, minimal set. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
251+
| kube-state-metrics.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kube State Metrics to a useful, minimal set. |
251252
| kube-state-metrics.scrapeInterval | string | `60s` | How frequently to scrape kube-state-metrics metrics. |
252253

253254
### Kube Controller Manager
@@ -311,7 +312,7 @@ Actual integration testing in a live environment should be done in the main [k8s
311312
| kubelet.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for the Kubelet. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
312313
| kubelet.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
313314
| kubelet.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
314-
| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
315+
| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. |
315316
| kubelet.scrapeInterval | string | `60s` | How frequently to scrape Kubelet metrics. |
316317

317318
### Kubelet Resources
@@ -324,7 +325,7 @@ Actual integration testing in a live environment should be done in the main [k8s
324325
| kubeletResource.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for Kubelet Resources. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
325326
| kubeletResource.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
326327
| kubeletResource.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
327-
| kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of resources metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
328+
| kubeletResource.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of resources metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. |
328329
| kubeletResource.scrapeInterval | string | `60s` | How frequently to scrape Kubelet Resource metrics. |
329330

330331
### Node Exporter - Deployment settings
@@ -345,10 +346,29 @@ Actual integration testing in a live environment should be done in the main [k8s
345346
| node-exporter.metricsTuning.dropMetricsForFilesystem | list | `["tempfs"]` | Drop metrics for the given filesystem types |
346347
| node-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
347348
| node-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
348-
| node-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
349+
| node-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring. |
349350
| node-exporter.metricsTuning.useIntegrationAllowList | bool | `false` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring as well as the Node Exporter integration. |
350351
| node-exporter.scrapeInterval | string | `60s` | How frequently to scrape Node Exporter metrics. |
351352

353+
### OpenCost
354+
355+
| Key | Type | Default | Description |
356+
|-----|------|---------|-------------|
357+
| opencost.enabled | bool | `false` | Deploy and scrape OpenCost. |
358+
| opencost.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for OpenCost. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with __ (i.e. __meta_kubernetes*) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery/discovery.relabel/#rule-block)) |
359+
| opencost.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for OpenCost. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#rule-block)) These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no __meta* labels are present. |
360+
| opencost.labelMatchers | object | `{"app.kubernetes.io/name":"opencost"}` | Label matchers used to select the OpenCost pods to scrape. |
361+
| opencost.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for OpenCost. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
362+
| opencost.metricsSource | string | `""` | The name of the metric destination where OpenCost will query for required metrics. Setting this will enable guided setup for required OpenCost parameters. To skip guided setup, set this to "custom". |
363+
| opencost.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
364+
| opencost.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
365+
| opencost.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from OpenCost to the minimal set required for Kubernetes Monitoring. |
366+
| opencost.opencost.prometheus.existingSecretName | string | `""` | The name of the secret containing the username and password for the metrics service. This must be in the same namespace as the OpenCost deployment. |
367+
| opencost.opencost.prometheus.external.url | string | `""` | The URL for Prometheus queries. It should match externalServices.prometheus.host + "/api/prom" |
368+
| opencost.opencost.prometheus.password_key | string | `"password"` | The key for the password property in the secret. |
369+
| opencost.opencost.prometheus.username_key | string | `"username"` | The key for the username property in the secret. |
370+
| opencost.scrapeInterval | string | `60s` | How frequently to scrape metrics from OpenCost. Overrides global.scrapeInterval. |
371+
352372
### Windows Exporter - Deployment settings
353373

354374
| Key | Type | Default | Description |
@@ -366,6 +386,6 @@ Actual integration testing in a live environment should be done in the main [k8s
366386
| windows-exporter.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for Windows Exporter. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus/prometheus.relabel/#arguments)) Overrides global.maxCacheSize |
367387
| windows-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. |
368388
| windows-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. |
369-
| windows-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Windows Exporter to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) |
389+
| windows-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Windows Exporter to the minimal set required for Kubernetes Monitoring. |
370390
| windows-exporter.scrapeInterval | string | `60s` | How frequently to scrape metrics from Windows Exporter. |
371391
<!-- markdownlint-enable no-space-in-emphasis -->
Binary file not shown.

charts/feature-cluster-metrics/templates/_module.alloy.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ declare "cluster_metrics" {
2626
{{- include "feature.clusterMetrics.node_exporter.alloy" . | indent 2 }}
2727
{{- include "feature.clusterMetrics.windows_exporter.alloy" . | indent 2 }}
2828
{{- include "feature.clusterMetrics.kepler.alloy" . | indent 2 }}
29+
{{- include "feature.clusterMetrics.opencost.alloy" . | indent 2 }}
2930
}
3031
{{- end -}}
3132

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
{{ define "feature.clusterMetrics.opencost.allowList" }}
2+
{{ if .Values.opencost.metricsTuning.useDefaultAllowList }}
3+
{{ "default-allow-lists/opencost.yaml" | .Files.Get }}
4+
{{ end }}
5+
{{ if .Values.opencost.metricsTuning.includeMetrics }}
6+
{{ .Values.opencost.metricsTuning.includeMetrics | toYaml }}
7+
{{ end }}
8+
{{ end }}
9+
10+
{{- define "feature.clusterMetrics.opencost.alloy" }}
11+
{{- if .Values.opencost.enabled }}
12+
{{- $metricAllowList := include "feature.clusterMetrics.opencost.allowList" . }}
13+
{{- $metricDenyList := .Values.opencost.metricsTuning.excludeMetrics }}
14+
{{- $labelSelectors := list }}
15+
{{- range $k, $v := .Values.opencost.labelMatchers }}
16+
{{- $labelSelectors = append $labelSelectors (printf "%s=%s" $k $v) }}
17+
{{- end }}
18+
19+
discovery.kubernetes "opencost" {
20+
role = "pod"
21+
namespaces {
22+
own_namespace = true
23+
}
24+
selectors {
25+
role = "pod"
26+
label = {{ $labelSelectors | join "," | quote }}
27+
}
28+
}
29+
30+
discovery.relabel "opencost" {
31+
targets = discovery.kubernetes.opencost.targets
32+
rule {
33+
source_labels = ["__meta_kubernetes_pod_node_name"]
34+
action = "replace"
35+
target_label = "instance"
36+
}
37+
{{- if .Values.opencost.extraDiscoveryRules }}
38+
{{ .Values.opencost.extraDiscoveryRules | indent 2 }}
39+
{{- end }}
40+
}
41+
42+
prometheus.scrape "opencost" {
43+
targets = discovery.relabel.opencost.output
44+
job_name = "integrations/opencost"
45+
honor_labels = true
46+
scrape_interval = {{ .Values.opencost.scrapeInterval | default .Values.global.scrapeInterval | quote }}
47+
clustering {
48+
enabled = true
49+
}
50+
{{- if or $metricAllowList $metricDenyList .Values.opencost.extraMetricProcessingRules }}
51+
forward_to = [prometheus.relabel.opencost.receiver]
52+
}
53+
54+
prometheus.relabel "opencost" {
55+
max_cache_size = {{ .Values.opencost.maxCacheSize | default .Values.global.maxCacheSize | int }}
56+
{{- if $metricAllowList }}
57+
rule {
58+
source_labels = ["__name__"]
59+
regex = "up|{{ $metricAllowList | fromYamlArray | join "|" }}"
60+
action = "keep"
61+
}
62+
{{- end }}
63+
{{- if $metricDenyList }}
64+
rule {
65+
source_labels = ["__name__"]
66+
regex = {{ $metricDenyList | join "|" | quote }}
67+
action = "drop"
68+
}
69+
{{- end }}
70+
{{- if .Values.opencost.extraMetricProcessingRules }}
71+
{{ .Values.opencost.extraMetricProcessingRules | indent 2 }}
72+
{{- end }}
73+
{{- end }}
74+
forward_to = argument.metrics_destinations.value
75+
}
76+
{{- end }}
77+
{{- end }}

0 commit comments

Comments
 (0)