diff --git a/config/federation/prometheus/alerts.yml b/config/federation/prometheus/alerts.yml
index 5e28ee62..ade4cb51 100644
--- a/config/federation/prometheus/alerts.yml
+++ b/config/federation/prometheus/alerts.yml
@@ -356,26 +356,6 @@ groups:
         monitoring.
       dashboard: https://grafana.mlab-oti.measurementlab.net/d/tZHLFQRZk/k8s-workload-overview
 
-# HTTP per-status response count is exported by stackdriver. By querying
-# stackdriver metrics, we can alert if the server-side errors rate for mlab-ns
-# is too high or if the service is unavailable.
-  - alert: MlabNS_TooManyServerSideErrors
-    expr: |
-      sum(stackdriver_gae_app_appengine_googleapis_com_http_server_response_count{
-        deployment="mlabns-stackdriver", module_id="default", loading="false",
-        response_code=~"5.."}) /
-      sum(stackdriver_gae_app_appengine_googleapis_com_http_server_response_count{
-        deployment="mlabns-stackdriver", module_id="default", loading="false"}) > 0.01
-    for: 2m
-    labels:
-      repo: dev-tracker
-      severity: ticket
-    annotations:
-      summary: Server-side errors rate for mlab-ns is over 1%.
-      description: Stackdriver reports more than 1% of the HTTP requests to mlab-ns are
-        returning a 5xx status code. Please check the mlab-ns logs to verify what is
-        causing them.
-
 # Check 5xx errors for the rate-limiter deployment, too.
   - alert: RateLimiterTooManyServerSideErrors
     expr: |
@@ -392,21 +372,30 @@ groups:
         rate-limiter are returning a 5xx status code. Please check the
         rate-limiter logs to determine the cause.
 
-# If any of the deployments in the mlab-ns GAE project is unavailable, an alert
+# HTTP per-status response count is exported by stackdriver. If any of the
+# services in the mlab-ns GAE project return > 1% 5xx errors, an alert
 # should fire immediately.
-  - alert: MlabNS_ServiceUnavailable
-    expr: sum(stackdriver_gae_app_appengine_googleapis_com_http_server_response_count{
-      deployment="mlabns-stackdriver", response_code=~"503"}) > 0
+  - alert: MlabNS_TooManyServiceErrors
+    # Aggregate by module_id so the error ratio is computed per service and
+    # the module_id label is still present for the annotations below.
+    expr: |
+      sum by (module_id) (stackdriver_gae_app_appengine_googleapis_com_http_server_response_count{
+        deployment="mlabns-stackdriver", loading="false", response_code=~"5.."}) /
+      sum by (module_id) (stackdriver_gae_app_appengine_googleapis_com_http_server_response_count{
+        deployment="mlabns-stackdriver", loading="false"}) > 0.01
+    for: 2m
     labels:
       repo: dev-tracker
       severity: page
       page_project: mlab-oti
     annotations:
-      summary: Service {{ $labels.module }} in the mlab-ns project is unavailable.
-      description: Stackdriver reports some requests to the {{ $labels.module }}
-        service are returning error 503. This is likely generated by GAE and
-        means there is a persistent failure in the underlying service. Please
-        check the GAE dashboard and logs to determine the cause.
+      summary: Stackdriver reports >1% errors from {{ $labels.module_id }}
+        in mlab-ns project.
+      description: Stackdriver reports more than 1% of the HTTP requests
+        to {{ $labels.module_id }} are returning a 5xx status code. This is
+        likely generated by GAE and means there is a persistent failure in
+        the underlying service. Please check the GAE dashboard and logs to
+        determine the cause.
 
 # One or more generic (non-experiment specific) mlab-ns metrics is missing.
 # These are metrics that mlab-ns relies on to determine whether an experiment