From 5a5ed65b98b7e6a44524976ad8867e8495cf93a4 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 29 Nov 2023 18:54:46 -0400 Subject: [PATCH 1/7] Rename files and title --- ..._2xx_under_threshold.md => nuxt_http_2xx_under_threshold.md} | 0 ..._5xx_above_threshold.md => nuxt_http_5xx_above_threshold.md} | 0 .../runbooks/nuxt_p99_response_time_above_threshold.md | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename documentation/meta/monitoring/runbooks/{nuxt_2xx_under_threshold.md => nuxt_http_2xx_under_threshold.md} (100%) rename documentation/meta/monitoring/runbooks/{nuxt_5xx_above_threshold.md => nuxt_http_5xx_above_threshold.md} (100%) diff --git a/documentation/meta/monitoring/runbooks/nuxt_2xx_under_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md similarity index 100% rename from documentation/meta/monitoring/runbooks/nuxt_2xx_under_threshold.md rename to documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md diff --git a/documentation/meta/monitoring/runbooks/nuxt_5xx_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md similarity index 100% rename from documentation/meta/monitoring/runbooks/nuxt_5xx_above_threshold.md rename to documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md diff --git a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md index dea1ea3d4ab..f0e6715e0ef 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md @@ -1,4 +1,4 @@ -# Run Book: Nuxt Production Average Response Time above threshold +# Run Book: Nuxt Production P99 Response Time above threshold ```{admonition} Metadata Status: **Unstable** From 3e2a16fca20405b0a8d0b3a6e5123fd3ff06eaa7 Mon Sep 17 00:00:00 2001 From: 
Krystle Salazar Date: Wed, 29 Nov 2023 19:20:56 -0400 Subject: [PATCH 2/7] Add runbooks for anomalous Nuxt response times --- .../nuxt_avg_response_time_anomaly.md | 31 +++++++++++++++++++ .../nuxt_p99_response_time_anomaly.md | 31 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 documentation/meta/monitoring/runbooks/nuxt_avg_response_time_anomaly.md create mode 100644 documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md diff --git a/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_anomaly.md b/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_anomaly.md new file mode 100644 index 00000000000..338a51f5577 --- /dev/null +++ b/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_anomaly.md @@ -0,0 +1,31 @@ +# Run Book: Nuxt Production Average Response Time anomalously high + +```{admonition} Metadata +Status: **Unstable** + +Maintainer: @obulat + +Alarm link: +- +``` + +## Severity Guide + +Confirm that there is not a total outage of the service. If there is none, the +severity is likely low. Check the request count and general network activity. If +abnormally high, refer to the [traffic analysis run book][traffic_runbook] to +identify and block any malicious traffic. If not, then check for a recent +deployment that may have introduced a problem, and [roll back][rollback_docs] to +the previous version if necessary. + +[traffic_runbook]: + /meta/monitoring/traffic/runbooks/identifying-and-blocking-traffic-anomalies.md +[rollback_docs]: /general/deployment.md#rollbacks + +## Historical false positives + +Nothing registered to date. + +## Related incident reports + +Nothing registered to date. 
diff --git a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md new file mode 100644 index 00000000000..f4f93bc3ee3 --- /dev/null +++ b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md @@ -0,0 +1,31 @@ +# Run Book: Nuxt Production P99 Response Time anomalously high + +```{admonition} Metadata +Status: **Unstable** + +Maintainer: @obulat + +Alarm link: +- +``` + +## Severity Guide + +Confirm that there is not a total outage of the service. If there is none, the +severity is likely low. Check the request count and general network activity. If +abnormally high, refer to the [traffic analysis run book][traffic_runbook] to +identify and block any malicious traffic. If not, then check for a recent +deployment that may have introduced a problem, and [roll back][rollback_docs] to +the previous version if necessary. + +[traffic_runbook]: + /meta/monitoring/traffic/runbooks/identifying-and-blocking-traffic-anomalies.md +[rollback_docs]: /general/deployment.md#rollbacks + +## Historical false positives + +Nothing registered to date. + +## Related incident reports + +Nothing registered to date. 
From 7ba17045716f11ceb6ab3c42a3899fc60ba97fb2 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 29 Nov 2023 19:28:01 -0400 Subject: [PATCH 3/7] Update runbooks for general Request Count alarms --- ...ove_threshold.md => api_request_count_anomaly.md} | 12 ++++++------ ...equest_count.md => nuxt_request_count_anomaly.md} | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) rename documentation/meta/monitoring/runbooks/{api_request_count_above_threshold.md => api_request_count_anomaly.md} (70%) rename documentation/meta/monitoring/runbooks/{nuxt_request_count.md => nuxt_request_count_anomaly.md} (89%) diff --git a/documentation/meta/monitoring/runbooks/api_request_count_above_threshold.md b/documentation/meta/monitoring/runbooks/api_request_count_anomaly.md similarity index 70% rename from documentation/meta/monitoring/runbooks/api_request_count_above_threshold.md rename to documentation/meta/monitoring/runbooks/api_request_count_anomaly.md index 332c7afafb6..4a2106103e6 100644 --- a/documentation/meta/monitoring/runbooks/api_request_count_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/api_request_count_anomaly.md @@ -1,4 +1,4 @@ -# Run Book: API Production Request Count above threshold +# Run Book: API Production Request Count anomalously high ```{admonition} Metadata Status: **Unstable** @@ -6,7 +6,7 @@ Status: **Unstable** Maintainer: @krysaldb Alarm link: -- +- ``` ## Severity Guide @@ -19,10 +19,10 @@ future resource scaling depending on the kind of traffic. If the services are strained then the severity is critical, search for the root cause to prevent more serious outages. If there are no recent obvious integrations (like the Gutenberg plugin) then follow the run book to [identify -traffic anomalies in Cloudflare][runbook_traffic], to determine whether the -recent traffic is organic or if it comes from a botnet. 
Find the origin of -requests and evaluate whether it needs to be blocked or if Openverse services -need to adapt to the new demand. +traffic anomalies][runbook_traffic], to determine whether the recent traffic is +organic or if it comes from a botnet. Find the origin of requests and evaluate +whether it needs to be blocked or if Openverse services need to adapt to the new +demand. [runbook_traffic]: https://docs.openverse.org/meta/monitoring/traffic/runbooks/identifying-and-blocking-traffic-anomalies.html diff --git a/documentation/meta/monitoring/runbooks/nuxt_request_count.md b/documentation/meta/monitoring/runbooks/nuxt_request_count_anomaly.md similarity index 89% rename from documentation/meta/monitoring/runbooks/nuxt_request_count.md rename to documentation/meta/monitoring/runbooks/nuxt_request_count_anomaly.md index 0573055fa12..359f429dcb5 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_request_count.md +++ b/documentation/meta/monitoring/runbooks/nuxt_request_count_anomaly.md @@ -1,4 +1,4 @@ -# Run Book: Nuxt request count above threshold +# Run Book: Nuxt Request Count anomalously high ```{admonition} Metadata Status: **Unstable** @@ -6,7 +6,7 @@ Status: **Unstable** Maintainer: @dhruvkb Alarm link: -- [production-nuxt](https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#alarmsV2:alarm/Nuxt+Production+request+count+above+threshold) +- [production-nuxt](https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#alarmsV2:alarm/Nuxt+Production+Request+Count+anomalously+high) ``` ## Severity guide From 5172d019b7105715bd08f2eb991deadd7d3ab2be Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 29 Nov 2023 19:28:46 -0400 Subject: [PATCH 4/7] Add new entries to index --- documentation/meta/monitoring/runbooks/index.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/documentation/meta/monitoring/runbooks/index.md b/documentation/meta/monitoring/runbooks/index.md index 
2145a9dd494..13b661174a3 100644 --- a/documentation/meta/monitoring/runbooks/index.md +++ b/documentation/meta/monitoring/runbooks/index.md @@ -12,13 +12,13 @@ that can be a good resource when writing a new one. ```{toctree} :titlesonly: -api_request_count_above_threshold api_http_2xx_under_threshold api_http_5xx_above_threshold api_avg_response_time_above_threshold api_avg_response_time_anomaly api_p99_response_time_above_threshold api_p99_response_time_anomaly +api_request_count_anomaly api_thumbnails_http_2xx_under_threshold api_thumbnails_http_5xx_above_threshold api_thumbnails_request_count_anomaly @@ -26,10 +26,12 @@ api_thumbnails_avg_response_time_above_threshold api_thumbnails_avg_response_time_anomaly api_thumbnails_p99_response_time_above_threshold api_thumbnails_p99_response_time_anomaly -nuxt_request_count -nuxt_2xx_under_threshold -nuxt_5xx_above_threshold +nuxt_http_2xx_under_threshold +nuxt_http_5xx_above_threshold nuxt_avg_response_time_above_threshold +nuxt_avg_response_time_anomaly nuxt_p99_response_time_above_threshold +nuxt_p99_response_time_anomaly +nuxt_request_count_anomaly unhealthy_ecs_hosts ``` From c69b36e56f2ee43e63ff48cc63bbb367e35cfed4 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 29 Nov 2023 19:43:51 -0400 Subject: [PATCH 5/7] Fix titles of Nuxt HTTP alarms --- .../meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md | 2 +- .../meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md index 1f7d1d97ea3..6fbb61e49e6 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_http_2xx_under_threshold.md @@ -1,4 +1,4 @@ -# Run Book: Nuxt 2XX request count under threshold +# Run Book: Nuxt 2XX responses count under threshold 
```{admonition} Metadata Status: **Unstable** diff --git a/documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md index 7d3705ac96f..542862d1e9f 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_http_5xx_above_threshold.md @@ -1,4 +1,4 @@ -# Run Book: Nuxt 5XX request count above threshold +# Run Book: Nuxt 5XX responses count above threshold ```{admonition} Metadata Status: **Unstable** From 1895891078164e1e20d434a603a24f61ef9b614a Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 6 Dec 2023 16:03:35 -0400 Subject: [PATCH 6/7] Add spaces in admonition sections --- .../api_thumbnails_avg_response_time_above_threshold.md | 2 ++ .../runbooks/api_thumbnails_avg_response_time_anomaly.md | 2 ++ .../api_thumbnails_p99_response_time_above_threshold.md | 2 ++ .../runbooks/api_thumbnails_p99_response_time_anomaly.md | 2 ++ .../monitoring/runbooks/api_thumbnails_request_count_anomaly.md | 2 ++ .../runbooks/nuxt_avg_response_time_above_threshold.md | 2 ++ .../runbooks/nuxt_p99_response_time_above_threshold.md | 2 ++ 7 files changed, 14 insertions(+) diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_above_threshold.md index eada9a9bf13..4cd49639792 100644 --- a/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_above_threshold.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @stacimc + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_anomaly.md b/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_anomaly.md index 4ab28f149a0..8e6602b0b80 100644 --- 
a/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_anomaly.md +++ b/documentation/meta/monitoring/runbooks/api_thumbnails_avg_response_time_anomaly.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @stacimc + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_above_threshold.md index 31a7eab1acf..0cf613d562a 100644 --- a/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_above_threshold.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @stacimc + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_anomaly.md b/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_anomaly.md index 65ca50bdc4e..440af3e7a52 100644 --- a/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_anomaly.md +++ b/documentation/meta/monitoring/runbooks/api_thumbnails_p99_response_time_anomaly.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @stacimc + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md b/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md index a80595f87c6..255c43db232 100644 --- a/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md +++ b/documentation/meta/monitoring/runbooks/api_thumbnails_request_count_anomaly.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @krysaldb + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_above_threshold.md index 
c3874021ef0..538abb6872a 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_avg_response_time_above_threshold.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @obulat + Alarm link: - ``` diff --git a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md index f0e6715e0ef..0daadd86fca 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md @@ -2,7 +2,9 @@ ```{admonition} Metadata Status: **Unstable** + Maintainer: @obulat + Alarm link: - ``` From fcc24efdb747c996761f1787ba4f512a82fee9cc Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 6 Dec 2023 16:24:18 -0400 Subject: [PATCH 7/7] Update status for Nuxt P99 alarms --- .../runbooks/nuxt_p99_response_time_above_threshold.md | 2 +- .../meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md index 0daadd86fca..f9a0bf511d0 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md +++ b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_above_threshold.md @@ -1,7 +1,7 @@ # Run Book: Nuxt Production P99 Response Time above threshold ```{admonition} Metadata -Status: **Unstable** +Status: **Disabled** until Nuxt request logging is added. 
Maintainer: @obulat diff --git a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md index f4f93bc3ee3..9c76aa430cb 100644 --- a/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md +++ b/documentation/meta/monitoring/runbooks/nuxt_p99_response_time_anomaly.md @@ -1,7 +1,7 @@ # Run Book: Nuxt Production P99 Response Time anomalously high ```{admonition} Metadata -Status: **Unstable** +Status: **Disabled** until Nuxt request logging is added. Maintainer: @obulat