From 441ce0f2c53f67c580f36a4aea8b5fde427cc2c7 Mon Sep 17 00:00:00 2001
From: Krystle Salazar
Date: Mon, 2 Oct 2023 14:01:03 -0400
Subject: [PATCH] Add runbooks for API Thumbnails 2XX/5XX responses and Request
 Count alarms (#3076)

---
 ...api_thumbnails_http_2xx_under_threshold.md | 36 +++++++++++++++++++
 ...api_thumbnails_http_5xx_above_threshold.md | 31 ++++++++++++++++
 .../api_thumbnails_request_count_anomaly.md   | 34 ++++++++++++++++++
 .../meta/monitoring/runbooks/index.md         |  3 ++
 4 files changed, 104 insertions(+)
 create mode 100644 documentation/meta/monitoring/runbooks/api_thumbnails_http_2xx_under_threshold.md
 create mode 100644 documentation/meta/monitoring/runbooks/api_thumbnails_http_5xx_above_threshold.md
 create mode 100644 documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md

diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_http_2xx_under_threshold.md b/documentation/meta/monitoring/runbooks/api_thumbnails_http_2xx_under_threshold.md
new file mode 100644
index 00000000000..719c5e58b3b
--- /dev/null
+++ b/documentation/meta/monitoring/runbooks/api_thumbnails_http_2xx_under_threshold.md
@@ -0,0 +1,36 @@
+# Run Book: API Thumbnails Production HTTP 2XX responses count under threshold
+
+```{admonition} Metadata
+Status: **Unstable**
+
+Maintainer: @krysaldb
+
+Alarm link:
+-
+```
+
+## Severity Guide
+
+After confirming there is not a total outage, check whether the overall request
+count has decreased as well (go to the [CloudWatch dashboard][cloudwatch] or,
+alternatively, check in Cloudflare). If overall requests are lower, the severity
+is low, and you should continue searching for the cause of the general
+decrease.
+
+If the drop is only in 2XX responses, the severity is likely high, so also
+check the dashboard for other anomalies. Verify whether any of the thumbnail
+providers are experiencing an outage or are rate-limiting Openverse. Go to the
+[API logs][api_logs] to check for errors or other data that yields clues.
+
+[cloudwatch]:
+  https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#dashboards/dashboard/ECS-Production-Dashboard
+[api_logs]:
+  https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Fecs$252Fproduction$252Fapi
+
+## Historical false positives
+
+Nothing registered to date.
+
+## Related incident reports
+
+Nothing registered to date.
diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_http_5xx_above_threshold.md b/documentation/meta/monitoring/runbooks/api_thumbnails_http_5xx_above_threshold.md
new file mode 100644
index 00000000000..83acf7a0bf7
--- /dev/null
+++ b/documentation/meta/monitoring/runbooks/api_thumbnails_http_5xx_above_threshold.md
@@ -0,0 +1,31 @@
+# Run Book: API Thumbnails Production HTTP 5XX responses count above threshold
+
+```{admonition} Metadata
+Status: **Unstable**
+
+Maintainer: @krysaldb
+
+Alarm link:
+-
+```
+
+## Severity Guide
+
+After confirming there is not a total outage, check whether the increase in 5XX
+HTTP errors coincides with a period when resources are expected to be
+constrained, such as a recent deployment, a data refresh, or DB maintenance. If
+the spike is related to one of these events and the alarm stabilizes within a
+short time, the severity is low.
+
+If the issue is not related to known recurrent events and persists, the severity
+is critical. Check whether dependent services (DB, Redis, Elasticsearch) are
+available to the API or whether the problem lies in the API itself.
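+
+To get a quick sample of the failing requests, it can help to filter the
+production API log group directly. The snippet below is only a rough sketch:
+it assumes the `/ecs/production/api` log group linked from these run books,
+AWS credentials with read access to CloudWatch Logs, and access-log lines that
+contain the literal status code, so adjust the filter pattern to the actual
+log format.
+
+```python
+# Rough sketch: sample recent 5XX responses from the production API logs.
+from datetime import datetime, timedelta, timezone
+
+import boto3
+
+logs = boto3.client("logs", region_name="us-east-1")
+
+end = datetime.now(timezone.utc)
+start = end - timedelta(minutes=30)
+
+response = logs.filter_log_events(
+    logGroupName="/ecs/production/api",  # assumed log group name
+    startTime=int(start.timestamp() * 1000),
+    endTime=int(end.timestamp() * 1000),
+    # Assumed pattern; adjust for the real log format and other 5XX codes.
+    filterPattern='" 500 "',
+    limit=50,
+)
+
+events = response["events"]
+print(f"{len(events)} matching log events in the last 30 minutes")
+for event in events[:10]:
+    print(event["message"].strip())
+```
+
+If the sampled errors point to a single dependency (DB, Redis, Elasticsearch),
+focus the investigation there.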
+
+## Historical false positives
+
+Nothing registered to date.
+
+## Related incident reports
+
+- 2023-09-27 from 16:15 to 18:00 UTC: 5XX responses spiked to between 983 and
+  1434 due to unstable tasks.
diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md b/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md
new file mode 100644
index 00000000000..a80595f87c6
--- /dev/null
+++ b/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md
@@ -0,0 +1,34 @@
+# Run Book: API Thumbnails Production Request Count anomalously high
+
+```{admonition} Metadata
+Status: **Unstable**
+Maintainer: @krysaldb
+Alarm link:
+-
+```
+
+## Severity Guide
+
+When a sudden increase in request count is noticed, verify that the services are
+supporting the load by looking at metrics such as response time or ES CPU usage.
+If the API is doing fine, the severity is low and the increase may only require
+future resource scaling, depending on the kind of traffic.
+
+If the services are strained, the severity is critical; search for the root
+cause to prevent more serious outages. If there are no obvious recent
+integrations (like the Gutenberg plugin), follow the run book to [identify
+traffic anomalies in Cloudflare][runbook_traffic] to determine whether the
+recent traffic is organic or comes from a botnet. Find the origin of the
+requests and evaluate whether it needs to be blocked or whether Openverse
+services need to adapt to the new demand.
+
+[runbook_traffic]:
+  /meta/monitoring/traffic/runbooks/identifying-and-blocking-traffic-anomalies.md
+
+## Historical false positives
+
+Nothing registered to date.
+
+## Related incident reports
+
+Nothing registered to date.
diff --git a/documentation/meta/monitoring/runbooks/index.md b/documentation/meta/monitoring/runbooks/index.md
index 78f335d9ee4..2145a9dd494 100644
--- a/documentation/meta/monitoring/runbooks/index.md
+++ b/documentation/meta/monitoring/runbooks/index.md
@@ -19,6 +19,9 @@ api_avg_response_time_above_threshold
 api_avg_response_time_anomaly
 api_p99_response_time_above_threshold
 api_p99_response_time_anomaly
+api_thumbnails_http_2xx_under_threshold
+api_thumbnails_http_5xx_above_threshold
+api_thumbnails_request_count_anomaly
 api_thumbnails_avg_response_time_above_threshold
 api_thumbnails_avg_response_time_anomaly
 api_thumbnails_p99_response_time_above_threshold