Skip to content

Commit f0bcae9

Browse files
fix: move rules to crds
1 parent 9db0cb3 commit f0bcae9

File tree

5 files changed

+116
-101
lines changed

5 files changed

+116
-101
lines changed

argo/cluster/monitoring/kube-prometheus-stack/chart/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ resources:
66
- secrets
77
- scrapeconfigs
88
- dashboards
9+
- rules
910

1011
patches:
1112
- path: patches/prometheuses.yaml
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
resources:
4+
- misc.yaml
5+
- srep.yaml
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: misc
5+
spec:
6+
groups:
7+
- name: vms
8+
rules:
9+
- alert: VMDown
10+
expr: 100 * (count by (instance) (up{job="node_exporter"} == 0) / count by (instance) (up{job="node_exporter"})) > 10
11+
for: 5m
12+
labels:
13+
severity: critical
14+
annotations:
15+
summary: Target {{ $labels.instance }} is DOWN
16+
- name: k8s
17+
rules:
18+
- alert: OOMKilled
19+
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
20+
labels:
21+
severity: critical
22+
annotations:
23+
summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
24+
- name: haproxy
25+
rules:
26+
- alert: HaproxyServerNotUp
27+
expr: haproxy_server_status{state!="UP"} > 0
28+
for: 3m
29+
labels:
30+
severity: critical
31+
annotations:
32+
summary: Haproxy Server {{ $labels.server }} is not UP for backend {{ $labels.proxy }}
33+
- name: argo
34+
rules:
35+
- alert: ArgocdServiceUnhealthy
36+
expr: argocd_app_info{health_status!="Healthy"} != 0
37+
for: 15m
38+
labels:
39+
severity: warning
40+
annotations:
41+
summary: ArgoCD service unhealthy (instance {{ $labels.instance }})
42+
- alert: ArgocdServiceNotSynced
43+
expr: argocd_app_info{sync_status!="Synced"} != 0
44+
for: 15m
45+
labels:
46+
severity: warning
47+
annotations:
48+
summary: ArgoCD service not synced (instance {{ $labels.instance }})
49+
- name: db
50+
rules:
51+
- alert: MysqlDown
52+
expr: mysql_up{service!~"srep.*"} == 0
53+
for: 0m
54+
labels:
55+
severity: critical
56+
annotations:
57+
summary: MySQL down (instance {{ $labels.instance }})
58+
- alert: MysqlHighThreadsRunning
59+
expr: max_over_time(mysql_global_status_threads_running{service!~"srep.*"}[1m]) / mysql_global_variables_max_connections{service!~"srep.*"} * 100 > 75
60+
for: 2m
61+
labels:
62+
severity: warning
63+
annotations:
64+
summary: MySQL high threads running (instance {{ $labels.instance }})
65+
- alert: RedisDown
66+
expr: redis_up{service!~"srep.*"} == 0  # PromQL regexes are fully anchored: "srep*" would not exclude "srep-redis"
67+
for: 0m
68+
labels:
69+
severity: critical
70+
annotations:
71+
summary: Redis down (instance {{ $labels.instance }})
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: srep  # must be unique per kind: "misc" is already taken by the other PrometheusRule in this directory
5+
labels:
6+
prometheus: k8s
7+
role: alert-rules
8+
spec:
9+
groups:
10+
- name: api
11+
rules:
12+
- alert: APIErrorRate
13+
expr: (sum without (code) (rate(srep_requests_total{code=~"2..|4..|3.."}[1m])) / sum without (code) (rate(srep_requests_total[1m]))) * 100 < 99.99  # aggregate "code" away on both sides; a per-series ratio divides each series by itself and is always 100
14+
for: 1m
15+
labels:
16+
severity: critical
17+
annotations:
18+
summary: API Success Rate is < 99.99%
19+
- alert: APIMysqlDown
20+
expr: mysql_up{service="srep-mysql"} == 0
21+
for: 0m
22+
labels:
23+
severity: critical
24+
annotations:
25+
summary: API MySQL is down
26+
- alert: APIMysqlHighThreadsRunning
27+
expr: max_over_time(mysql_global_status_threads_running{service="srep-mysql"}[1m]) / mysql_global_variables_max_connections{service="srep-mysql"} * 100 > 75
28+
for: 2m
29+
labels:
30+
severity: warning
31+
annotations:
32+
summary: MySQL high threads running (instance {{ $labels.instance }})
33+
- alert: APIRedisDown
34+
expr: redis_up{service="srep-redis"} == 0
35+
for: 0m
36+
labels:
37+
severity: critical
38+
annotations:
39+
summary: Redis down (instance {{ $labels.instance }})

argo/cluster/monitoring/kube-prometheus-stack/chart/values.yaml

Lines changed: 0 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -92,107 +92,6 @@ defaultRules:
9292
kubeSchedulerAlerting: false
9393
kubeSchedulerRecording: false
9494

95-
additionalPrometheusRulesMap:
96-
misc:
97-
groups:
98-
- name: vms
99-
rules:
100-
- alert: VMDown
101-
expr: 100 * (count by (instance) (up{job="node_exporter"} == 0) / count by (instance) (up{job="node_exporter"})) > 10
102-
for: 5m
103-
labels:
104-
severity: critical
105-
annotations:
106-
summary: Target {{ $labels.instance }} is DOWN
107-
- name: k8s
108-
rules:
109-
- alert: OOMKilled
110-
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
111-
labels:
112-
severity: critical
113-
annotations:
114-
summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
115-
- name: haproxy
116-
rules:
117-
- alert: HaproxyServerNotUp
118-
expr: haproxy_server_status{state!="UP"} > 0
119-
for: 3m
120-
labels:
121-
severity: critical
122-
annotations:
123-
summary: Haproxy Server {{ $labels.server }} is not UP for backend {{ $labels.proxy }}
124-
- name: argo
125-
rules:
126-
- alert: ArgocdServiceUnhealthy
127-
expr: argocd_app_info{health_status!="Healthy"} != 0
128-
for: 15m
129-
labels:
130-
severity: warning
131-
annotations:
132-
summary: ArgoCD service unhealthy (instance {{ $labels.instance }})
133-
- alert: ArgocdServiceNotSynced
134-
expr: argocd_app_info{sync_status!="Synced"} != 0
135-
for: 15m
136-
labels:
137-
severity: warning
138-
annotations:
139-
summary: ArgoCD service not synced (instance {{ $labels.instance }})
140-
- name: db
141-
rules:
142-
- alert: MysqlDown
143-
expr: mysql_up{service!~"srep.*"} == 0
144-
for: 0m
145-
labels:
146-
severity: critical
147-
annotations:
148-
summary: MySQL down (instance {{ $labels.instance }})
149-
- alert: MysqlHighThreadsRunning
150-
expr: max_over_time(mysql_global_status_threads_running{service!~"srep.*"}[1m]) / mysql_global_variables_max_connections{service!~"srep.*"} * 100 > 75
151-
for: 2m
152-
labels:
153-
severity: warning
154-
annotations:
155-
summary: MySQL high threads running (instance {{ $labels.instance }})
156-
- alert: RedisDown
157-
expr: redis_up{service!~"srep*"} == 0
158-
for: 0m
159-
labels:
160-
severity: critical
161-
annotations:
162-
summary: Redis down (instance {{ $labels.instance }})
163-
srep:
164-
groups:
165-
- name: api
166-
rules:
167-
- alert: APIErrorRate
168-
expr: ((rate(srep_requests_total{code=~"2..|4..|3.."}[1m]) / rate(srep_requests_total[1m])) * 100) < 99.99
169-
for: 1m
170-
labels:
171-
severity: critical
172-
annotations:
173-
summary: API Success Rate is < 99.99%
174-
- alert: APIMysqlDown
175-
expr: mysql_up{service="srep-mysql"} == 0
176-
for: 0m
177-
labels:
178-
severity: critical
179-
annotations:
180-
summary: API MySQL is down
181-
- alert: APIMysqlHighThreadsRunning
182-
expr: max_over_time(mysql_global_status_threads_running{service="srep-mysql"}[1m]) / mysql_global_variables_max_connections{service="srep-mysql"} * 100 > 75
183-
for: 2m
184-
labels:
185-
severity: warning
186-
annotations:
187-
summary: MySQL high threads running (instance {{ $labels.instance }})
188-
- alert: APIRedisDown
189-
expr: redis_up{service="srep-redis"} == 0
190-
for: 0m
191-
labels:
192-
severity: critical
193-
annotations:
194-
summary: Redis down (instance {{ $labels.instance }})
195-
19695
grafana:
19796
enabled: true
19897

0 commit comments

Comments
 (0)