diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c5386f069b..66c679a888c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,8 +61,10 @@ * [CHANGE] Remove backwards compatibility for `thanos_memcached_` prefixed metrics in dashboards and alerts removed in 2.12. #9674 * [ENHANCEMENT] Unify ingester autoscaling panels on 'Mimir / Writes' dashboard to work for both ingest-storage and non-ingest-storage autoscaling. #9617 +* [ENHANCEMENT] Alerts: Enable configuring job prefix for alerts to prevent clashes with metrics from Loki/Tempo. #9659 * [ENHANCEMENT] Dashboards: visualize the age of source blocks in the "Mimir / Compactor" dashboard. #9697 * [ENHANCEMENT] Dashboards: Include block compaction level on queried blocks in 'Mimir / Queries' dashboard. #9706 + * [BUGFIX] Dashboards: Fix autoscaling metrics joins when series churn. #9412 #9450 #9432 * [BUGFIX] Alerts: Fix autoscaling metrics joins in `MimirAutoscalerNotActive` when series churn. #9412 * [BUGFIX] Alerts: Exclude failed cache "add" operations from alerting since failures are expected in normal operation. #9658 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index b9364ff156b..f866fb0f3dc 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -533,7 +533,7 @@ spec: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 1efb58e2127..41ff8e17768 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -511,7 +511,7 @@ groups: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 8bf5007ec69..87696d3be85 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -521,7 +521,7 @@ groups: expr: | max by (cluster, namespace) (memberlist_client_cluster_members_count) > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + (sum by (cluster, namespace) (up{job=~".*/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) for: 20m labels: severity: warning diff --git a/operations/mimir-mixin/alerts/alerts-utils.libsonnet b/operations/mimir-mixin/alerts/alerts-utils.libsonnet index 3f3307a100a..3d344020f64 100644 --- a/operations/mimir-mixin/alerts/alerts-utils.libsonnet +++ b/operations/mimir-mixin/alerts/alerts-utils.libsonnet @@ -10,10 +10,10 @@ $._config.product + name, jobMatcher(job):: - 'job=~".*/%s"' % formatJobForQuery(job), + '%s=~"%s%s"' % [$._config.per_job_label, $._config.alert_job_prefix, formatJobForQuery(job)], jobNotMatcher(job):: - 'job!~".*/%s"' % formatJobForQuery(job), + '%s!~"%s%s"' % [$._config.per_job_label, $._config.alert_job_prefix, formatJobForQuery(job)], local formatJobForQuery(job) = if std.isArray(job) then '(%s)' % std.join('|', job) diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index c47e6ef32d0..63118434fa8 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -775,8 +775,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| max by (%s) (memberlist_client_cluster_members_count) > - (sum by (%s) (up{%s=~".+/%s"}) + 10) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $._config.per_job_label, simpleRegexpOpt($._config.job_names.ring_members)], + (sum by (%s) (up{%s}) + 10) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels, $.jobMatcher($._config.job_names.ring_members)], 'for': '20m', labels: { severity: 'warning', diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index b9c5dc3bf48..7b11f041338 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -207,6 +207,9 @@ // Used to add extra annotations to all alerts, Careful: takes precedence over default annotations. alert_extra_annotations: {}, + // Used as the job prefix in alerts that select on job label (e.g. GossipMembersTooHigh, RingMembersMismatch). This can be set to a known namespace to prevent those alerts from firing incorrectly due to selecting similar metrics from Loki/Tempo. + alert_job_prefix: '.*/', + // Whether alerts for experimental ingest storage are enabled. ingest_storage_enabled: true,