Skip to content

Commit 0acecc1

Browse files
committed
Add alerting
1 parent 0a77340 commit 0acecc1

File tree

4 files changed

+166
-1
lines changed

4 files changed

+166
-1
lines changed

class/defaults.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ parameters:
22
rotating_bucket_backup:
33
=_metadata: {}
44
namespace: syn-rotating-bucket-backup
5+
namespace_metadata:
6+
annotations: {}
7+
labels: {}
58

69
images:
710
mc:
@@ -14,3 +17,28 @@ parameters:
1417
hour: 1
1518

1619
jobs: {}
20+
21+
monitoring:
22+
enabled: true
23+
alerts:
24+
RotatingClusterBackup_JobFailed:
25+
enabled: true
26+
rule:
27+
annotations:
28+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
29+
expr: |
30+
kube_job_failed{namespace="${rotating_bucket_backup:namespace}"} > 0
31+
for: 15m
32+
labels:
33+
severity: warning
34+
35+
RotatingClusterBackup_JobCompletion_12h:
36+
enabled: true
37+
rule:
38+
annotations:
39+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. Job did not complete in time
40+
expr: |
41+
kube_job_spec_completions{namespace="${rotating_bucket_backup:namespace}"} - kube_job_status_succeeded{namespace="${rotating_bucket_backup:namespace}"} > 0
42+
for: 12h
43+
labels:
44+
severity: warning

component/main.jsonnet

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,45 @@
11
// main template for rotating-bucket-backup
2+
local com = import 'lib/commodore.libjsonnet';
23
local kap = import 'lib/kapitan.libjsonnet';
34
local kube = import 'lib/kube.libjsonnet';
45
local inv = kap.inventory();
56
// The hiera parameters for the component
67
local params = inv.parameters.rotating_bucket_backup;
78

9+
local alertlabels = {
10+
syn: 'true',
11+
syn_component: 'rotating-bucket-backup',
12+
};
13+
14+
local alerts = params.monitoring.alerts;
15+
local prometheusRule =
16+
kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'rotating-bucket-backup-alerts') {
17+
metadata+: {
18+
namespace: params.namespace,
19+
},
20+
spec+: {
21+
groups+: [
22+
{
23+
name: 'rotating-bucket-backup-alerts',
24+
rules:
25+
std.filterMap(
26+
function(field) alerts[field].enabled == true,
27+
function(field) alerts[field].rule {
28+
alert: field,
29+
labels+: alertlabels,
30+
},
31+
std.sort(std.objectFields(alerts)),
32+
),
33+
},
34+
],
35+
},
36+
};
37+
838
// Define outputs below
939
{
10-
'00_namespace': kube.Namespace(params.namespace),
40+
'00_namespace': kube.Namespace(params.namespace) {
41+
metadata+: com.makeMergeable(params.namespace_metadata),
42+
},
1143
'10_jobs': (import 'jobs.libsonnet'),
44+
[if params.monitoring.enabled then '20_prometheus_rule']: prometheusRule,
1245
}

docs/modules/ROOT/pages/references/parameters.adoc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,21 @@ default:: `syn-rotating-bucket-backup`
1111
The namespace in which to deploy this component.
1212

1313

14+
== `namespace_metadata`
15+
16+
[horizontal]
17+
type:: dict
18+
default::
19+
+
20+
[source,yaml]
21+
----
22+
annotations: {}
23+
labels: {}
24+
----
25+
26+
Allows adding extra annotations and labels to the namespace created by this component.
27+
28+
1429
== `images`
1530

1631
[horizontal]
@@ -92,6 +107,60 @@ type:: dict
92107
The parameters for the target bucket. The keys and values depend on the type of the target bucket.
93108

94109

110+
== `monitoring.enabled`
111+
112+
[horizontal]
113+
type:: boolean
114+
default:: `true`
115+
116+
Controls whether the `PrometheusRule` object containing the alerts is created.
117+
The alerts expect `kube-state-metrics` to be running in the cluster.
118+
119+
120+
== `monitoring.alerts`
121+
122+
[horizontal]
123+
type:: dict
124+
default:: See https://github.com/vshn/component-rotating-bucket-backup/blob/master/class/defaults.yml[defaults.yml]
125+
example::
126+
+
127+
[source,yaml]
128+
----
129+
alerts:
130+
MyAlert:
131+
enabled: true
132+
rule:
133+
expr: |
134+
up{namespace="${rotating_bucket_backup:namespace}"} == 0
135+
for: 5m
136+
labels:
137+
severity: warning
138+
annotations:
139+
summary: "Instance {{ $labels.instance }} down"
140+
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
141+
----
142+
143+
The alerts to configure.
144+
The key is the name of the alert.
145+
The value is a dictionary with the following keys:
146+
147+
=== `enabled`
148+
149+
[horizontal]
150+
type:: boolean
151+
152+
Controls whether the alert is enabled. Alerts that are not enabled are left out of the generated `PrometheusRule`.
153+
154+
155+
=== `rule`
156+
157+
[horizontal]
158+
type:: dict
159+
160+
The Prometheus rule for the alert.
161+
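It takes the fields of a Prometheus alerting rule (for example `expr`, `for`, `labels` and `annotations`). The `alert` name is set from the key, and the labels `syn` and `syn_component` are added automatically.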
162+
163+
95164
== Example
96165

97166
[source,yaml]
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
annotations: {}
5+
labels:
6+
name: rotating-bucket-backup-alerts
7+
name: rotating-bucket-backup-alerts
8+
namespace: syn-rotating-bucket-backup
9+
spec:
10+
groups:
11+
- name: rotating-bucket-backup-alerts
12+
rules:
13+
- alert: RotatingClusterBackup_JobCompletion_12h
14+
annotations:
15+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
16+
more than 12 hours to complete. Job did not complete in time
17+
expr: |
18+
kube_job_spec_completions{namespace="syn-rotating-bucket-backup"} - kube_job_status_succeeded{namespace="syn-rotating-bucket-backup"} > 0
19+
for: 12h
20+
labels:
21+
severity: warning
22+
syn: 'true'
23+
syn_component: rotating-bucket-backup
24+
- alert: RotatingClusterBackup_JobFailed
25+
annotations:
26+
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed
27+
to complete. Removing failed job after investigation should clear this
28+
alert.
29+
expr: |
30+
kube_job_failed{namespace="syn-rotating-bucket-backup"} > 0
31+
for: 15m
32+
labels:
33+
severity: warning
34+
syn: 'true'
35+
syn_component: rotating-bucket-backup

0 commit comments

Comments
 (0)