File tree Expand file tree Collapse file tree 4 files changed +166
-1
lines changed
docs/modules/ROOT/pages/references
tests/golden/defaults/rotating-bucket-backup/rotating-bucket-backup Expand file tree Collapse file tree 4 files changed +166
-1
lines changed Original file line number Diff line number Diff line change @@ -2,6 +2,9 @@ parameters:
2
2
rotating_bucket_backup :
3
3
=_metadata : {}
4
4
namespace : syn-rotating-bucket-backup
5
+ namespace_metadata :
6
+ annotations : {}
7
+ labels : {}
5
8
6
9
images :
7
10
mc :
@@ -14,3 +17,28 @@ parameters:
14
17
hour : 1
15
18
16
19
jobs : {}
20
+
21
+ monitoring :
22
+ enabled : true
23
+ alerts :
24
+ RotatingClusterBackup_JobFailed :
25
+ enabled : true
26
+ rule :
27
+ annotations :
28
+ description : Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
29
+ expr : |
30
+ kube_job_failed{namespace="${rotating_bucket_backup:namespace}"} > 0
31
+ for : 15m
32
+ labels :
33
+ severity : warning
34
+
35
+ RotatingClusterBackup_JobCompletion_12h :
36
+ enabled : true
37
+ rule :
38
+ annotations :
39
+ description : Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. Job did not complete in time
40
+ expr : |
41
+ kube_job_spec_completions{namespace="${rotating_bucket_backup:namespace}"} - kube_job_status_succeeded{namespace="${rotating_bucket_backup:namespace}"} > 0
42
+ for : 12h
43
+ labels :
44
+ severity : warning
Original file line number Diff line number Diff line change 1
1
// main template for rotating-bucket-backup
2
+ local com = import 'lib/commodore.libjsonnet' ;
2
3
local kap = import 'lib/kapitan.libjsonnet' ;
3
4
local kube = import 'lib/kube.libjsonnet' ;
4
5
local inv = kap.inventory();
5
6
// The hiera parameters for the component
6
7
local params = inv.parameters.rotating_bucket_backup;
7
8
9
+ local alertlabels = {
10
+ syn: 'true' ,
11
+ syn_component: 'rotating-bucket-backup' ,
12
+ };
13
+
14
+ local alerts = params.monitoring.alerts;
15
+ local prometheusRule =
16
+ kube._Object('monitoring.coreos.com/v1' , 'PrometheusRule' , 'rotating-bucket-backup-alerts' ) {
17
+ metadata+: {
18
+ namespace: params.namespace,
19
+ },
20
+ spec+: {
21
+ groups+: [
22
+ {
23
+ name: 'rotating-bucket-backup-alerts' ,
24
+ rules:
25
+ std.filterMap (
26
+ function (field) alerts[field].enabled == true ,
27
+ function (field) alerts[field].rule {
28
+ alert: field,
29
+ labels+: alertlabels,
30
+ },
31
+ std.sort (std.objectFields (alerts)),
32
+ ),
33
+ },
34
+ ],
35
+ },
36
+ };
37
+
8
38
// Define outputs below
9
39
{
10
- '00_namespace' : kube.Namespace(params.namespace),
40
+ '00_namespace' : kube.Namespace(params.namespace) {
41
+ metadata+: com.makeMergeable(params.namespace_metadata),
42
+ },
11
43
'10_jobs' : (import 'jobs.libsonnet' ),
44
+ [if params.monitoring.enabled then '20_prometheus_rule' ]: prometheusRule,
12
45
}
Original file line number Diff line number Diff line change @@ -11,6 +11,21 @@ default:: `syn-rotating-bucket-backup`
11
11
The namespace in which to deploy this component.
12
12
13
13
14
+ == `namespace_metadata`
15
+
16
+ [horizontal]
17
+ type:: dict
18
+ default::
19
+ +
20
+ [source,yaml]
21
+ ----
22
+ annotations: {}
23
+ labels: {}
24
+ ----
25
+
26
+ Additional annotations and labels to set on the component's namespace.
27
+
28
+
14
29
== `images`
15
30
16
31
[horizontal]
@@ -92,6 +107,60 @@ type:: dict
92
107
The parameters for the target bucket. The keys and values depend on the type of the target bucket.
93
108
94
109
110
+ == `monitoring.enabled`
111
+
112
+ [horizontal]
113
+ type:: boolean
114
+ default:: `true`
115
+
116
+ Controls whether `PrometheusRule` objects are created.
117
+ The alerts expect `kube-state-metrics` to be running in the cluster.
118
+
119
+
120
+ == `monitoring.alerts`
121
+
122
+ [horizontal]
123
+ type:: dict
124
+ default:: See https://github.com/vshn/component-rotating-bucket-backup/blob/master/class/defaults.yml[defaults.yml]
125
+ example::
126
+ +
127
+ [source,yaml]
128
+ ----
129
+ alerts:
130
+ MyAlert:
131
+ enabled: true
132
+ rule:
133
+ expr: |
134
+ up{namespace="${rotating_bucket_backup:namespace}"} == 0
135
+ for: 5m
136
+ labels:
137
+ severity: warning
138
+ annotations:
139
+ summary: "Instance {{ $labels.instance }} down"
140
+ description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
141
+ ----
142
+
143
+ The alerts to configure.
144
+ The key is the name of the alert.
145
+ The value is a dictionary with the following keys:
146
+
147
+ === `enabled`
148
+
149
+ [horizontal]
150
+ type:: boolean
151
+
152
+ Controls whether the alert is enabled.
153
+
154
+
155
+ === `rule`
156
+
157
+ [horizontal]
158
+ type:: dict
159
+
160
+ The Prometheus rule for the alert.
161
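+ The keys and values depend on the alert; the component sets the rule's `alert` field to the alert's key and adds the labels `syn` and `syn_component`.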
162
+
163
+
95
164
== Example
96
165
97
166
[source,yaml]
Original file line number Diff line number Diff line change
1
+ apiVersion : monitoring.coreos.com/v1
2
+ kind : PrometheusRule
3
+ metadata :
4
+ annotations : {}
5
+ labels :
6
+ name : rotating-bucket-backup-alerts
7
+ name : rotating-bucket-backup-alerts
8
+ namespace : syn-rotating-bucket-backup
9
+ spec :
10
+ groups :
11
+ - name : rotating-bucket-backup-alerts
12
+ rules :
13
+ - alert : RotatingClusterBackup_JobCompletion_12h
14
+ annotations :
15
+ description : Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
16
+ more than 12 hours to complete. Job did not complete in time
17
+ expr : |
18
+ kube_job_spec_completions{namespace="syn-rotating-bucket-backup"} - kube_job_status_succeeded{namespace="syn-rotating-bucket-backup"} > 0
19
+ for : 12h
20
+ labels :
21
+ severity : warning
22
+ syn : 'true'
23
+ syn_component : rotating-bucket-backup
24
+ - alert : RotatingClusterBackup_JobFailed
25
+ annotations :
26
+ description : Job {{ $labels.namespace }}/{{ $labels.job_name }} failed
27
+ to complete. Removing failed job after investigation should clear this
28
+ alert.
29
+ expr : |
30
+ kube_job_failed{namespace="syn-rotating-bucket-backup"} > 0
31
+ for : 15m
32
+ labels :
33
+ severity : warning
34
+ syn : 'true'
35
+ syn_component : rotating-bucket-backup
You can’t perform that action at this time.
0 commit comments