From 30505e01e4e933ef6fba82bc29b51a852f7d4a27 Mon Sep 17 00:00:00 2001 From: nkinkade Date: Thu, 3 May 2018 16:34:45 -0600 Subject: [PATCH] Alerting for BBE when the BBE service itself is down, not just probes. (#223) * Fixes a previous PR. Alerting for BBE now alerts when the BBE service is down or missing, not when BBE target checks are down or missing. * Disables service auto-discovery for blackbox_exporter, and adds a static scrape target for it. This is so that IPv4 and IPv6 instances follow the same patterns. * Changes to BlackboxExporterIpv4DownOrMissing alert to check for up{job} instead of up{deployment}. It is now like the Ipv6 check. Fixes a typo in one other alert name. --- apply-global-prometheus.sh | 13 +++++++++++++ config/federation/prometheus/alerts.yml | 17 +++++++++-------- .../prometheus/prometheus.yml.template | 17 +++++++++++++++++ .../deployments/blackbox.yml | 2 +- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/apply-global-prometheus.sh b/apply-global-prometheus.sh index 06c1a509..abd736cd 100755 --- a/apply-global-prometheus.sh +++ b/apply-global-prometheus.sh @@ -25,6 +25,18 @@ CLUSTER=${CLUSTER:?Please provide cluster name: $USAGE} export GRAFANA_DOMAIN=grafana.${PROJECT}.measurementlab.net +# GCP doesn't support IPv6, so we have a Linode VM running three instances of +# the blackbox_exporter, on three separate ports... one port/instance for each +# project. These variables map projects to ports. +BBE_IPV6_PORT_mlab_oti="9115" +BBE_IPV6_PORT_mlab_staging="8115" +BBE_IPV6_PORT_mlab_sandbox="7115" + +# Construct the per-project blackbox_exporter port using the passed $PROJECT +# argument. +bbe_port=BBE_IPV6_PORT_${PROJECT/-/_} + + # Config maps and Secrets ## Blackbox exporter. @@ -36,6 +48,7 @@ kubectl create configmap blackbox-config \ # Evaluate the configuration template. 
sed -e 's|{{PROJECT}}|'${PROJECT}'|g' \ + -e 's|{{BBE_IPV6_PORT}}|'${!bbe_port}'|g' \ config/federation/prometheus/prometheus.yml.template > \ config/federation/prometheus/prometheus.yml diff --git a/config/federation/prometheus/alerts.yml b/config/federation/prometheus/alerts.yml index 487bc5a6..f8aafe4b 100644 --- a/config/federation/prometheus/alerts.yml +++ b/config/federation/prometheus/alerts.yml @@ -313,9 +313,10 @@ ALERT ScriptExporterMissingMetrics } # Prometheus is unable to get data from the blackbox_exporter service for IPv4 -# probes. +# probes. The service is down, or the metric is missing. ALERT BlackboxExporterIpv4DownOrMissing - IF up{job="blackbox-targets"} == 0 OR absent(up{job="blackbox-targets"}) + IF up{job="blackbox-exporter-ipv4"} == 0 + OR absent(up{job="blackbox-exporter-ipv4"}) FOR 10m LABELS { severity = "ticket" @@ -326,17 +327,17 @@ ALERT BlackboxExporterIpv4DownOrMissing } # Prometheus is unable to get data from the blackbox_exporter service for IPv6 -# probes. +# probes. The service is down, or the metric is missing. ALERT BlackboxExporterIpv6DownOrMissing - IF up{job="blackbox-targets-ipv6"} == 0 - OR absent(up{job="blackbox-targets-ipv6"}) + IF up{job="blackbox-exporter-ipv6"} == 0 + OR absent(up{job="blackbox-exporter-ipv6"}) FOR 10m LABELS { severity = "ticket" } ANNOTATIONS { - summary = "The blackbox_exporter service is down for IPv6 probes.", - hints = "The blackbox_exporter for IPv6 checks runs in a Linode VM. Make sure the VM is up and running. If it is, check the status of the BBE container running on it. Domains for VM are like blackbox-exporter-ipv6..measurementlab.net." + summary = "The blackbox_exporter service is down or missing for IPv6 probes.", + hints = "The blackbox_exporter for IPv6 checks runs in a Linode VM. Make sure the VM is up and running. If it is, check the status of the BBE container running in the VM. Domains for VMs are like blackbox-exporter-ipv6..measurementlab.net." 
} # More than a certain percentage of NDT servers meet the criteria for being @@ -533,7 +534,7 @@ ALERT NagiosExporterMissing } # The node_exporter running on eb.measurementlab.net is down. -ALERT NodeExporterOnEbDownorMissing +ALERT NodeExporterOnEbDownOrMissing IF up{job="eb-node-exporter"} == 0 OR absent(up{job="eb-node-exporter"}) FOR 10m LABELS { diff --git a/config/federation/prometheus/prometheus.yml.template b/config/federation/prometheus/prometheus.yml.template index b4252917..f11c442f 100644 --- a/config/federation/prometheus/prometheus.yml.template +++ b/config/federation/prometheus/prometheus.yml.template @@ -337,8 +337,17 @@ scrape_configs: # Attempt to re-read files every five minutes. refresh_interval: 5m + # Blackbox configurations. # + # Scrape the blackbox_exporter instance running in the cluster for service + # metrics. + - job_name: 'blackbox-exporter-ipv4' + static_configs: + - targets: + - blackbox-service.default.svc.cluster.local:9115 + + # Each blackbox configuration uses a different probe (tcp, icmp, http, etc). - job_name: 'blackbox-targets' metrics_path: /probe @@ -390,6 +399,14 @@ scrape_configs: # Blackbox configurations for IPv6 probes. # + # There are three blackbox_exporter instances running on a Linode VM, one for + # each M-Lab GCP project. They each run on different ports. + - job_name: 'blackbox-exporter-ipv6' + static_configs: + - targets: + - blackbox-exporter-ipv6.{{PROJECT}}.measurementlab.net:{{BBE_IPV6_PORT}} + + # Each blackbox configuration uses a different probe (tcp, icmp, http, etc). 
- job_name: 'blackbox-targets-ipv6' metrics_path: /probe diff --git a/k8s/prometheus-federation/deployments/blackbox.yml b/k8s/prometheus-federation/deployments/blackbox.yml index 6757c2c1..3fc1570f 100644 --- a/k8s/prometheus-federation/deployments/blackbox.yml +++ b/k8s/prometheus-federation/deployments/blackbox.yml @@ -23,7 +23,7 @@ spec: run: blackbox-server annotations: # Tell prometheus service discovery to scrape the blackbox container. - prometheus.io/scrape: 'true' + prometheus.io/scrape: 'false' spec: # Place the pod into the Guaranteed QoS by setting equal resource # requests and limits for *all* containers in the pod.