From f60b49ec7803ec931cc1dfff9ce36b13a378009c Mon Sep 17 00:00:00 2001
From: ethax-ross
Date: Mon, 2 Nov 2020 11:45:14 +0000
Subject: [PATCH] Additional API metric alerts

Add additional metric alerting for the API:

FailedJobs - Alert when any job fails.
HighGoogleApiCalls - Alert when a high number of Google API calls are made (>5 in a 10m period).
GoogleApiErrors - Alert when the Google API returns a non-success response.
ClientApproachingRateLimit - Alert when a client is approaching the rate limit threshold (15rpm out of an available 30rpm).
HighCpu - Alert when max CPU utilization is over 70%.
HighMemory - Alert when max memory utilization is over 256MB.
HighDatabaseConnections - Alert when max database connections exceeds 75 (out of an available 100).
---
 monitoring/prometheus/alert.rules | 44 +++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/monitoring/prometheus/alert.rules b/monitoring/prometheus/alert.rules
index d23766aa2..399640621 100644
--- a/monitoring/prometheus/alert.rules
+++ b/monitoring/prometheus/alert.rules
@@ -37,3 +37,47 @@ groups:
       severity: high
     annotations:
       summary: Alert when any client hits a rate limit.
+  - alert: FailedJobs  # fires on any growth of the failed-state job metric within 1m
+    expr: 'sum(increase(api_hangfire_jobs{state="failed"}[1m])) > 0'  # NOTE(review): increase() assumes a counter - confirm the metric type
+    labels:
+      severity: high
+    annotations:
+      summary: Alert when any job fails.
+  - alert: HighGoogleApiCalls  # more than 5 calls, summed over all series, in a 10m window
+    expr: 'sum(increase(api_google_api_calls[10m])) > 5'
+    labels:
+      severity: high
+    annotations:
+      summary: Alert when a high number of Google API calls are made (>5 in a 10m period).
+  - alert: GoogleApiErrors  # any call whose result label is not "success" in the last minute
+    expr: 'sum(rate(api_google_api_calls{result != "success"}[1m])) > 0'  # NOTE(review): != also selects series lacking a result label - confirm it is always set
+    labels:
+      severity: medium
+    annotations:
+      summary: Alert when the Google API returns a non-success response.
+  - alert: ClientApproachingRateLimit  # requests in the last minute, grouped by controller and action
+    expr: 'sum(increase(http_request_duration_seconds_count{controller=~"Candidates|MailingList|TeachingEvents",action=~"CreateAccessToken|AddMember|AddAttendee|SignUp",code=~".+"}[1m])) by (controller, action) > 15'  # _count is the number of requests; _sum would total their durations in seconds
+    labels:
+      severity: medium
+    annotations:
+      summary: Alert when a client is approaching the rate limit threshold (15rpm out of an available 30rpm).
+  - alert: HighCpu  # highest cpu_percent value across all series
+    expr: 'max(cpu_percent) > 70'
+    labels:
+      severity: medium
+    annotations:
+      summary: Alert when max CPU utilization is over 70%.
+  - alert: HighMemory  # NOTE(review): no max() here, so this is evaluated per series, unlike HighCpu
+    expr: 'dotnet_total_memory_bytes > 256000000'  # threshold is 256 MB (decimal)
+    labels:
+      severity: medium
+    annotations:
+      summary: Alert when max memory utilization is over 256MB.
+  - alert: HighDatabaseConnections  # highest connections value across all series
+    expr: 'max(connections) > 75'
+    labels:
+      severity: medium
+    annotations:
+      summary: Alert when max database connections exceeds 75 (out of an available 100).
+
+