Skip to content

Commit 5ff41f5

Browse files
fourstepperHy3n4jmsantorum
authored
feat(alerting): magic alerting severities (#127)
* feat(alerting): magic alerting hocus pocus Signed-off-by: Hy3n4 <hy3nk4@gmail.com> * feat(alert-severities): config like this? Signed-off-by: Robin Opletal <me@robinopletal.com> * fix(alert-severities): remove NoSlo severity Signed-off-by: Robin Opletal <me@robinopletal.com> * stylistic fixes Signed-off-by: Robin Opletal <me@robinopletal.com> * fix(config): nicer Cfg initialization Signed-off-by: Robin Opletal <me@robinopletal.com> * stop hardcoding 1-target in alertmanagerrule Signed-off-by: Robin Opletal <me@robinopletal.com> * feat(make): make wait for services better when deploying dev infrastructure Signed-off-by: Hy3n4 <hy3nk4@gmail.com> * feat(alerting): alertmanager config loader edits (#130) * feat(alerting): alertmanager config loader Signed-off-by: Robin Opletal <me@robinopletal.com> * generated stuff... Signed-off-by: Robin Opletal <me@robinopletal.com> --------- Signed-off-by: Robin Opletal <me@robinopletal.com> * chore: add release-drafter gh action (#133) Signed-off-by: Jose Santorum <jmsantorum@gmail.com> * feat(magic): some features some fixes mainly fixed duration issues also, added some feature to devel, osko dashboards, kustomize for easier deployment and other cool stuff modified function responsible for creating the alerting rule when magicAlerting is enabled, also added some basic mapping for opsgenie, pagerduty and custom alerting tool. 
Currently not working as expected though Signed-off-by: Hy3n4 <hy3nk4@gmail.com> * fix(rules): duplicated rules for 5m window Signed-off-by: Hy3n4 <hy3nk4@gmail.com> * prometheus helper: simplify unique windows, resolve finalizer API warnings Signed-off-by: Robin Opletal <me@robinopletal.com> * at least move uniqueStrings to function Signed-off-by: Robin Opletal <me@robinopletal.com> --------- Signed-off-by: Hy3n4 <hy3nk4@gmail.com> Signed-off-by: Robin Opletal <me@robinopletal.com> Signed-off-by: Jose Santorum <jmsantorum@gmail.com> Co-authored-by: Hy3n4 <hy3nk4@gmail.com> Co-authored-by: Jose Santorum <jmsantorum@gmail.com>
1 parent 83c5b03 commit 5ff41f5

19 files changed

+981
-75
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ $(ENVTEST): $(LOCALBIN)
168168

169169
.PHONY: deploydev
170170
deploydev:
171-
@$(KUBECTL) apply -R -f devel/
171+
@$(KUBECTL) apply -k devel/
172172
@echo "Waiting for services to come online for the port-forwards..."
173173
@until [ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items}')}" != "[]" ] && \
174174
[ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items[0].status.containerStatuses[0].ready}')" == "true" ]; do \

api/osko/v1alpha1/mimirrule_types.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package v1alpha1
22

33
import (
4+
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
45
"github.com/prometheus/common/model"
56
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
67
)
@@ -33,13 +34,13 @@ type RuleGroup struct {
3334
}
3435

3536
type Rule struct {
36-
Record string `json:"record,omitempty"`
37-
Alert string `json:"alert,omitempty"`
38-
Expr string `json:"expr"`
39-
For model.Duration `json:"for,omitempty"`
40-
KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"`
41-
Labels map[string]string `json:"labels,omitempty"`
42-
Annotations map[string]string `json:"annotations,omitempty"`
37+
Record string `json:"record,omitempty"`
38+
Alert string `json:"alert,omitempty"`
39+
Expr string `json:"expr"`
40+
For *monitoringv1.Duration `json:"for,omitempty"`
41+
KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"`
42+
Labels map[string]string `json:"labels,omitempty"`
43+
Annotations map[string]string `json:"annotations,omitempty"`
4344
}
4445

4546
//+kubebuilder:object:root=true

api/osko/v1alpha1/zz_generated.deepcopy.go

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func main() {
5757
opts.BindFlags(flag.CommandLine)
5858

5959
flag.Parse()
60-
cfg := config.NewConfig()
60+
config.NewConfig()
6161

6262
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
6363

@@ -141,7 +141,7 @@ func main() {
141141
Client: mgr.GetClient(),
142142
Scheme: mgr.GetScheme(),
143143
Recorder: mgr.GetEventRecorderFor("mimirrule-controller"),
144-
RequeueAfterPeriod: cfg.MimirRuleRequeuePeriod,
144+
RequeueAfterPeriod: config.Cfg.MimirRuleRequeuePeriod,
145145
}).SetupWithManager(mgr); err != nil {
146146
setupLog.Error(err, "unable to create controller", "controller", "MimirRule")
147147
os.Exit(1)

config/crd/bases/osko.dev_mimirrules.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,11 @@ spec:
8585
type: string
8686
for:
8787
description: |-
88-
Duration wraps time.Duration. It is used to parse the custom duration format
89-
from YAML.
90-
This type should not propagate beyond the scope of input/output processing.
91-
format: int64
92-
type: integer
88+
Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function.
89+
Supported units: y, w, d, h, m, s, ms
90+
Examples: `30s`, `1m`, `1h20m15s`, `15d`
91+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
92+
type: string
9393
keep_firing_for:
9494
description: |-
9595
Duration wraps time.Duration. It is used to parse the custom duration format

config/samples/kustomization.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
## Append samples of your project ##
22
resources:
3-
- osko_v1alpha1_alertmanagerconfig.yaml
3+
- openslo_v1_datasource.yaml
4+
- openslo_v1_slo.yaml
5+
- config_secret.yaml
6+
- osko_v1alpha1_alertmanagerconfig.yaml
47
# +kubebuilder:scaffold:manifestskustomizesamples

config/samples/openslo_v1_datasource.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@ spec:
88
description: Mimir Datasource for logging tenant
99
type: mimir
1010
connectionDetails:
11-
address: https://mimir.monitoring.dev.heu.group/
1211
address: http://localhost:9009/
1312
sourceTenants:
1413
- gatekeeper-system
15-
targetTenant: gatekeeper-system
1614
- monitoring
1715
targetTenant: monitoring

config/samples/openslo_v1_slo.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,39 @@
11
apiVersion: openslo.com/v1
22
kind: SLO
33
metadata:
4+
name: mimir-ingestion-latency
45
labels:
5-
label.osko.dev/team: "infrastructure"
6-
label.osko.dev/system: "gatekeeper"
7-
label.osko.dev/domain: "security"
6+
label.osko.dev/team: "infra"
7+
label.osko.dev/system: "monitoring"
8+
label.osko.dev/domain: "observability"
9+
label.osko.dev/service: "mimir"
810
annotations:
911
osko.dev/datasourceRef: "mimir-infra-ds"
1012
osko.dev/magicAlerting: "true"
11-
name: gatekeeper-webhook-response-time
1213
spec:
1314
budgetingMethod: Occurrences
14-
description: 99% of Gatekeeper webhook requests return in less than 0.5s
15+
description: 95% of all queries should have a latency of less than 300 milliseconds
1516
indicator:
1617
metadata:
17-
name: gatekeeper-webhook-less-than-05s
18+
name: distributor-query-success-latency
1819
spec:
19-
description: 99% of Gatekeeper webhook requests return in less than 0.5s
20+
description: 95% of all queries should have a latency of less than 500 milliseconds
2021
ratioMetric:
2122
good:
2223
metricSource:
2324
metricSourceRef: mimir-infra-ds
2425
type: Mimir
2526
spec:
26-
query: controller_runtime_webhook_latency_seconds_bucket{le="0.5", job="gatekeeper-metrics"}
27+
query: cortex_distributor_query_duration_seconds_bucket{le="0.5", method="Distributor.QueryStream", status_code="200"}
2728
total:
2829
metricSource:
2930
metricSourceRef: mimir-infra-ds
3031
type: Mimir
3132
spec:
32-
query: controller_runtime_webhook_latency_seconds_count{job="gatekeeper-metrics"}
33+
query: cortex_distributor_query_duration_seconds_count{method="Distributor.QueryStream"}
3334
objectives:
34-
- displayName: gatekeeper-webhook-less-than-05s
35-
target: '0.99'
36-
service: testing
35+
- target: "0.99"
36+
service: mimir
3737
timeWindow:
38-
- duration: 28d
39-
isRolling: true
38+
- duration: 28d
39+
isRolling: true

devel/grafana-agent/configmap.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,19 @@ data:
1212
}
1313
}
1414
}
15-
prometheus.scrape "static" {
15+
prometheus.relabel "cluster" {
16+
rule {
17+
target_label = "cluster"
18+
replacement = "local"
19+
}
1620
forward_to = [
1721
prometheus.remote_write.local.receiver,
1822
]
23+
}
24+
prometheus.scrape "static" {
25+
forward_to = [
26+
prometheus.relabel.cluster.receiver,
27+
]
1928
targets = [
2029
{
2130
"__address__" = "mimir-service:9009",

devel/grafana-agent/deployment.yaml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ spec:
1616
- name: grafana-agent
1717
image: grafana/agent:latest
1818
args:
19-
- 'run'
20-
- '/etc/agent/agent.river'
21-
- '--storage.path=/tmp/agent'
22-
- '--server.http.listen-addr=127.0.0.1:80'
23-
- '--server.http.ui-path-prefix=/'
19+
- "run"
20+
- "/etc/agent/agent.river"
21+
- "--storage.path=/tmp/agent"
22+
- "--server.http.listen-addr=127.0.0.1:12345"
23+
- "--server.http.ui-path-prefix=/"
2424
volumeMounts:
2525
- name: config-volume
2626
mountPath: /etc/agent
@@ -31,6 +31,9 @@ spec:
3131
valueFrom:
3232
fieldRef:
3333
fieldPath: spec.nodeName
34+
ports:
35+
- containerPort: 12345
36+
name: http-agent
3437

3538
volumes:
3639
- name: config-volume

0 commit comments

Comments
 (0)