Skip to content

Commit

Permalink
Adds a gmx_machine_maintenance == 1 condition to a number of alerts. (#307)
Browse files Browse the repository at this point in the history
* Adds a gmx_machine_maintenance == 1 condition to any alerts that already have a lame_duck_node condition.

* Adds gmx_[site, machine]_maintenance conditions to a couple more alerts.

* A few enhancements and fixes per suggestions/comments from PR #307.
  • Loading branch information
nkinkade authored Sep 18, 2018
1 parent 8f3583b commit db47ce2
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions config/federation/prometheus/alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ ALERT CoreServices_SidestreamIsNotRunning
IF sum_over_time(up{service="sidestream"}[10m]) == 0
AND ON(machine)
sum_over_time(probe_success{service="ssh806"}[20m]) / 20 >= 0.90
UNLESS ON(machine)
lame_duck_node == 1
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
FOR 10m
LABELS {
severity = "page",
Expand Down Expand Up @@ -85,8 +84,7 @@ ALERT ScraperMostRecentArchivedFileTimeIsTooOld
IF (time() - (scraper_maxrawfiletimearchived{container="scraper-sync"} != 0)) > (56 * 60 * 60)
AND ON(machine)
(time() - process_start_time_seconds{service="sidestream"}) > (30 * 60 * 60)
UNLESS ON(machine)
lame_duck_node == 1
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
FOR 2h
LABELS {
severity = "page",
Expand Down Expand Up @@ -143,7 +141,8 @@ ALERT ScraperCollectorMissingFromScraperSync
# "up"/"aliveness" check.
ALERT SwitchDownAtSite
IF up{job="snmp-targets", site!~".*t$"} == 0
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
UNLESS ON(site) gmx_site_maintenance == 1
FOR 24h
LABELS {
severity = "ticket",
Expand Down Expand Up @@ -404,8 +403,8 @@ ALERT BlackboxExporterIpv6DownOrMissing
ALERT TooManyNdtServersDown
IF count_scalar(
probe_success{service="ndt_raw"} AND ON(machine)
up{service="nodeexporter"} == 1 UNLESS ON(machine)
lame_duck_node{} == 1
up{service="nodeexporter"} == 1
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
UNLESS ON(machine) (
probe_success{service="ndt_raw"} == 1 AND ON(machine)
probe_success{service="ndt_ssl"} == 1 AND ON(machine)
Expand All @@ -417,8 +416,8 @@ ALERT TooManyNdtServersDown
/
count(
probe_success{service="ndt_raw"} AND ON(machine)
up{service="nodeexporter"} == 1 UNLESS ON(machine)
lame_duck_node{} == 1
up{service="nodeexporter"} == 1
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
) > 0.25
FOR 30m
LABELS {
Expand Down Expand Up @@ -492,7 +491,7 @@ ALERT MobiperfMetricsMissing
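# One or more nodes are up but export no lame_duck_node metric at all.
# NOTE(review): the UNLESS clause has to match lame_duck_node regardless of
# its value; filtering on "lame_duck_node == 1" drops series whose value is 0,
# so healthy nodes would presumably be reported as missing the metric - confirm.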
ALERT LameDuckMetricMissingForNode
IF up{service="nodeexporter"} == 1
UNLESS ON(machine) lame_duck_node{}
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
FOR 30m
LABELS {
severity = "ticket",
Expand Down Expand Up @@ -523,6 +522,7 @@ ALERT VdlimitMetricsMissingForNode
# A collectd-mlab service has a problem and is down.
ALERT CoreServices_CollectdMlabDown
IF collectd_mlab_success{} == 0
UNLESS ON(machine) gmx_machine_maintenance == 1
FOR 10m
LABELS {
severity = "ticket",
Expand Down

0 comments on commit db47ce2

Please sign in to comment.