Skip to content

Commit

Permalink
feature/add-client-etcd-monitoring (#613)
Browse files Browse the repository at this point in the history
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Enhanced etcd monitoring with new metrics exposure, pod scraping
configuration, and comprehensive alert rules for proactive
observability.
- Introduced a new `VMPodScrape` resource for improved pod metrics
collection.
- Added a new PrometheusRule configuration for monitoring etcd clusters
with various alert conditions.
- **Chores**
  - Upgraded the etcd release from version 2.4.0 to 2.5.0.
- Consolidated and renamed monitoring dashboard references for better
consistency.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
  • Loading branch information
klinch0 authored Feb 5, 2025
1 parent 835ee11 commit 861e6c4
Show file tree
Hide file tree
Showing 9 changed files with 3,766 additions and 1,608 deletions.
3,611 changes: 3,611 additions & 0 deletions dashboards/control-plane/kube-etcd.json

Large diffs are not rendered by default.

1,602 changes: 0 additions & 1,602 deletions dashboards/control-plane/kube-etcd3.json

This file was deleted.

5 changes: 2 additions & 3 deletions hack/download-dashboards.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fix_d8() {
}

swap_pvc_overview() {
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
}

deprectaed_remove_faq() {
Expand Down Expand Up @@ -68,7 +68,7 @@ modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/namespace/
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhost_detail.json
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhosts.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/control-plane-status.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd3.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/deprecated-resources.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/ntp.json #TODO
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/nodes.json
Expand Down Expand Up @@ -109,4 +109,3 @@ done <<\EOT
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json
EOT

2 changes: 1 addition & 1 deletion packages/extra/etcd/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ name: etcd
description: Storage for Kubernetes clusters
icon: /logos/etcd.svg
type: application
version: 2.4.0
version: 2.5.0
6 changes: 6 additions & 0 deletions packages/extra/etcd/templates/etcd-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ spec:
labels:
cozystack.io/service: etcd
spec:
containers:
- name: etcd
ports:
- name: metrics
containerPort: 2381
protocol: TCP
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "kubernetes.io/hostname"
Expand Down
11 changes: 11 additions & 0 deletions packages/extra/etcd/templates/podscrape.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
name: etcd-pod-scrape
spec:
podMetricsEndpoints:
- port: metrics
scheme: http
selector:
matchLabels:
app.kubernetes.io/name: etcd
132 changes: 132 additions & 0 deletions packages/extra/etcd/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: etcd-rules
spec:
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': insufficient members '{{`{{ $value }}`}}'."
expr: |
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
for: 3m
labels:
severity: critical

- alert: etcdNoLeader
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member '{{`{{ $labels.instance }}`}}' has no leader."
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical

- alert: etcdHighNumberOfLeaderChanges
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': instance '{{`{{ $labels.instance }}`}}' has seen '{{`{{ $value }}`}}' leader changes within the last hour."
expr: |
rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
labels:
severity: warning

- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 1
for: 10m
labels:
severity: warning

- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 5
for: 5m
labels:
severity: critical

- alert: etcdGRPCRequestsSlow
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': gRPC requests to '{{`{{ $labels.grpc_method }}`}}' are taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical

- alert: etcdMemberCommunicationSlow
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member communication with '{{`{{ $labels.To }}`}}' is taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning

- alert: etcdHighNumberOfFailedProposals
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' proposal failures within the last hour on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning

- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.01
for: 10m
labels:
severity: warning

- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.05
for: 10m
labels:
severity: critical

- alert: etcdHTTPRequestsSlow
annotations:
summary: "etcd instance '{{`{{ $labels.instance }}`}}' HTTP requests to '{{`{{ $labels.method }}`}}' are slow."
expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning

- alert: etcdMembersDown
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}' members are down."
description: 'etcd cluster "{{`{{ $labels.job }}`}}": members are down {{`{{ $value }}`}}.'
expr: |
max without (endpoint) (
sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical
2 changes: 1 addition & 1 deletion packages/extra/monitoring/dashboards.list
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ main/nodes
control-plane/control-plane-status
control-plane/deprecated-resources
control-plane/dns-coredns
control-plane/kube-etcd3
control-plane/kube-etcd
kubevirt/kubevirt-control-plane
3 changes: 2 additions & 1 deletion packages/extra/versions_map
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ etcd 2.0.1 6fc1cc7d
etcd 2.1.0 2b00fcf8
etcd 2.2.0 5ca8823
etcd 2.3.0 b908400d
etcd 2.4.0 HEAD
etcd 2.4.0 cb7b8158
etcd 2.5.0 HEAD
ingress 1.0.0 f642698
ingress 1.1.0 838bee5d
ingress 1.2.0 ced8e5b
Expand Down

0 comments on commit 861e6c4

Please sign in to comment.