# Git format-patch style change (From/Date/Subject headers, diffstat, one unified-diff hunk)
# that has been collapsed onto a single line; original line breaks and indentation are not
# visible here, so the line below is kept byte-for-byte as found.
#
# Target file: automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl
# Hunk: @@ -255,6 +255,24 @@ (18 inserted lines), placed after the rule whose `description` and
# `runbook` annotations appear as context and before the `- name: Warnings` group.
#
# The patch adds two alert rules that differ only in the syncStatus matcher and summary text:
#   - StuckInBootstrap: syncStatus = "BOOTSTRAP"
#   - StuckInCatchup:   syncStatus = "CATCHUP"
#
# For each rule:
#   expr   - takes increase(Coda_Runtime_process_uptime_ms_total{syncStatus = ...}[2h]), keeps the
#            series whose increase is >= 7200000, counts them per `testnet`, and is true when that
#            count is > 0. 7200000 = 2 * 60 * 60 * 1000, i.e. the 2h window expressed in ms
#            (presumably the metric is uptime in milliseconds, per its name - TODO confirm).
#   for    - `${alert_evaluation_duration}` is a template placeholder filled in when the .tpl is
#            rendered (presumably by Terraform - verify against the module that renders it).
#   labels - severity is `critical`; `testnet` is set from `{{ $labels.testnet }}`.
#   annotations - only `summary` is provided.
#
# NOTE(review): the threshold equals the full length of the [2h] window, so the comparison only
# holds if increase() reports at least the whole window; confirm this is reached reliably with
# the actual scrape interval, otherwise the `for:` clause may keep resetting.
# NOTE(review): the preceding rule shown as context carries `description` and `runbook`
# annotations; the two new rules do not - confirm whether that is intentional.
# NOTE(review): `count by (testnet)` already leaves `testnet` on the result, so the explicit
# `testnet` label looks redundant - verify before removing.
From 284cced6c1d13e4a56a03c3c5dddecd1b84147ca Mon Sep 17 00:00:00 2001 From: Tang Jiawei Date: Tue, 26 Sep 2023 03:03:19 +0800 Subject: [PATCH] add alerts for stuck at catchup and bootstrap --- .../templates/testnet-alert-rules.yml.tpl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl index 7a9a175383d0..9812696c78af 100644 --- a/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl +++ b/automation/terraform/modules/testnet-alerts/templates/testnet-alert-rules.yml.tpl @@ -255,6 +255,24 @@ groups: description: "{{ $value }} blocks have been validated on network {{ $labels.testnet }} in the last hour (according to some node)." runbook: "https://www.notion.so/minaprotocol/FewBlocksPerHour-47a6356f093242d988b0d9527ce23478" + - alert: StuckInBootstrap + expr: count by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{syncStatus = "BOOTSTRAP"}[2h]) >= 7200000) > 0 + for: ${alert_evaluation_duration} + labels: + testnet: "{{ $labels.testnet }}" + severity: critical + annotations: + summary: "One or more {{ $labels.testnet }} nodes are stuck at bootstrap for more than 2 hours" + + - alert: StuckInCatchup + expr: count by (testnet) (increase(Coda_Runtime_process_uptime_ms_total{syncStatus = "CATCHUP"}[2h]) >= 7200000) > 0 + for: ${alert_evaluation_duration} + labels: + testnet: "{{ $labels.testnet }}" + severity: critical + annotations: + summary: "One or more {{ $labels.testnet }} nodes are stuck at catchup for more than 2 hours" + - name: Warnings rules: