From 2573ab096bbb4a9a030e47f030687dd1f6149b2a Mon Sep 17 00:00:00 2001
From: janhorstmann
Date: Tue, 10 Dec 2024 16:59:02 +0100
Subject: [PATCH] Allow check of prometheus alert status (#2178)

Part of https://github.com/osism/issues/issues/1009

Signed-off-by: Jan Horstmann
---
 .zuul.yaml                                    |   2 +
 environments/generic/configuration.yml        |   6 +
 .../files/overlays/prometheus/cadvisor.rules  |  64 ----
 .../files/overlays/prometheus/ceph.rules      | 340 ------------------
 .../overlays/prometheus/elasticsearch.rules   |  99 -----
 .../files/overlays/prometheus/haproxy.rules   |  76 ----
 .../files/overlays/prometheus/mysql.rules     |  89 -----
 .../files/overlays/prometheus/openstack.rules |  15 -
 .../overlays/prometheus/prometheus.rules      | 225 ------------
 .../files/overlays/prometheus/rabbitmq.rules  |  69 ----
 .../files/overlays/prometheus/system.rules    |  79 ----
 playbooks/deploy.yml                          |   9 +
 scripts/check/303-prometheus-alert-status.sh  |   9 +
 13 files changed, 26 insertions(+), 1056 deletions(-)
 delete mode 100644 environments/kolla/files/overlays/prometheus/cadvisor.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/ceph.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/elasticsearch.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/haproxy.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/mysql.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/openstack.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/prometheus.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/rabbitmq.rules
 delete mode 100644 environments/kolla/files/overlays/prometheus/system.rules
 create mode 100755 scripts/check/303-prometheus-alert-status.sh

diff --git a/.zuul.yaml b/.zuul.yaml
index e60574a1a..0e2240749 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -74,6 +74,7 @@
     parent: abstract-testbed-deploy
     vars:
       tempest: true
+      prometheus_alert_status: false
 
 - job:
     name: abstract-testbed-deploy-in-a-nutshell
@@ -164,6 +165,7 @@
     vars:
       manager_version: 8.1.0
       tempest: true
+      prometheus_alert_status: false
 
 - job:
     name: testbed-deploy-stable-ubuntu-22.04
diff --git a/environments/generic/configuration.yml b/environments/generic/configuration.yml
index b420780c9..b0aea9ca3 100644
--- a/environments/generic/configuration.yml
+++ b/environments/generic/configuration.yml
@@ -7,3 +7,9 @@ dotfiles_repo_version: main
 dotfiles_repo_local_destination: "~/dotfiles"
 dotfiles_files:
   - .tmux.conf
+
+prometheus_alert_status_filter:
+  alertname:
+    - PrometheusAlertmanagerE2eDeadManSwitch
+  severity:
+    - info
diff --git a/environments/kolla/files/overlays/prometheus/cadvisor.rules b/environments/kolla/files/overlays/prometheus/cadvisor.rules
deleted file mode 100644
index 63a9d8803..000000000
--- a/environments/kolla/files/overlays/prometheus/cadvisor.rules
+++ /dev/null
@@ -1,64 +0,0 @@
-# Taken from https://awesome-prometheus-alerts.grep.to/rules
-
-
-
-groups:
-- name: Cadvisor
-  rules:
-
-  - alert: ContainerKilled
-    expr: time() - container_last_seen > 61
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Container killed (instance {{ $labels.instance }})"
-      description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
-
-  - alert: ContainerCpuUsage
-    expr: (sum(rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) BY (instance, name) * 100) > 3000
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Container CPU usage (instance {{ $labels.instance }})"
-      description: "Container CPU 
usage is above 300%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerMemoryUsage - expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / - sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 90 - for: 10m - labels: - severity: warning - annotations: - summary: Container Memory usage {{ $labels.name }} on Host {{ $labels.host_name}} - description: "Container Memory usage VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerVolumeUsage - expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80 - for: 1m - labels: - severity: warning - annotations: - summary: "Container Volume usage (instance {{ $labels.instance }})" - description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerVolumeIoUsage - expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "Container Volume IO usage (instance {{ $labels.instance }})" - description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerHighThrottleRate - expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Container high throttle rate (instance {{ $labels.instance }})" - description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/ceph.rules b/environments/kolla/files/overlays/prometheus/ceph.rules deleted file mode 100644 index 70b133588..000000000 --- a/environments/kolla/files/overlays/prometheus/ceph.rules +++ /dev/null @@ -1,340 +0,0 @@ -# Official set of upstream alerts https://github.com/ceph/ceph/blob/octopus/monitoring/prometheus/alerts/ceph_default_alerts.yml - - - -groups: -- name: cluster health - rules: - - alert: health error - expr: ceph_health_status == 2 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.2.1 - annotations: - description: > - Ceph in HEALTH_ERROR state for more than 5 minutes. - Please check "ceph health detail" for more information. - - - alert: health warn - expr: ceph_health_status == 1 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.2.2 - annotations: - description: > - Ceph has been in HEALTH_WARN for more than 15 minutes. - Please check "ceph health detail" for more information. - -- name: mon - rules: - - alert: low monitor quorum count - expr: sum(ceph_mon_quorum_status) < 3 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.3.1 - annotations: - description: | - Monitor count in quorum is below three. - - Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active. - - The following monitors are down: - {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - -- name: osd - rules: - - alert: 10% OSDs down - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.1 - annotations: - description: | - {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . 
| first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%). - - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - - alert: OSD down - expr: count(ceph_osd_up == 0) > 0 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.2 - annotations: - description: | - {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }} - {{ $value }} OSD{{ $s }} down for more than 15 minutes. - - {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down. - - The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - - alert: OSDs near full - expr: | - ( - ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 100 > 90 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.3 - annotations: - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is - dangerously full: {{ $value | humanize }}% - - - alert: flapping OSD - expr: | - ( - rate(ceph_osd_up[5m]) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 60 > 1 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.4 - annotations: - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was - marked down and back up at {{ $value | humanize }} times once a - minute for 5 minutes. - - - alert: Ceph OSD Reweighted needed - expr: ceph_osd_weight < 1 - for: 1h - labels: - severity: warning - annotations: - description: > - Ceph OSD reweighted (OSD: {{ $labels.ceph_daemon }}) - - -# # alert on high deviation from average PG count -# - alert: high pg count deviation -# expr: | -# abs( -# ( -# (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) -# ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) -# ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 -# for: 5m -# labels: -# severity: warning -# type: ceph_default -# oid: 1.3.6.1.4.1.50495.15.1.2.4.5 -# annotations: -# description: > -# OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates -# by more than 30% from average PG count. - -- name: pgs - rules: - - alert: pgs inactive - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.1 - annotations: - description: > - {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. - Inactive placement groups aren't able to serve read/write - requests. - - - alert: pgs unclean - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.2 - annotations: - description: > - {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}. - Unclean PGs haven't been able to completely recover from a - previous failure. 
- - - alert: pgs down - expr: ceph_pg_down > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.3 - annotations: - description: > - Ceph PG down (Pool ID {{ $labels.pool_id }}) - - - alert: pgs incomplete - expr: ceph_pg_incomplete > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.4 - annotations: - description: > - PG incomplete (Pool ID {{ $labels.pool_id }}) - - - alert: pg inconsistant - expr: ceph_pg_inconsistent > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.5 - annotations: - description: > - pg inconsistant (Pool ID {{ $labels.pool_id }}) - - - alert: pg ActivationLong - expr: ceph_pg_activating > 0 - for: 2m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.6 - annotations: - description: > - pg activation too long (Pool ID {{ $labels.pool_id }}) - - - alert: pg backfill full - expr: ceph_pg_backfill_toofull > 0 - for: 2m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.7 - annotations: - description: > - pg backfill full (Pool ID {{ $labels.pool_id }}) - - - alert: CephPgUnavailable - expr: ceph_pg_total - ceph_pg_active > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.8 - annotations: - description: pg unavailable (Pool ID {{ $labels.pool_id }}) - -- name: nodes - rules: - - alert: root volume full - expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.1 - annotations: - description: "Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free." - - # alert on nic packet errors and drops rates > 100 packet/s - #- alert: network packets dropped - # expr: irate(node_network_receive_drop_total{device!~"lo|breth2-ovs"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|breth2-ovs"}[5m]) > 100 - # labels: - # severity: warning - # type: ceph_default - # oid: 1.3.6.1.4.1.50495.15.1.2.8.2 - # annotations: - # description: > - # Node {{ $labels.instance }} experiences packet drop > 100 - # packet/s on interface {{ $labels.device }}. - - - alert: network packet errors - expr: | - irate(node_network_receive_errs_total{device!="lo"}[5m]) + - irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.3 - annotations: - description: > - Node {{ $labels.instance }} experiences packet errors > 1 - packet/s on interface {{ $labels.device }}. - - - alert: storage filling up - expr: | - predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) * - on(instance) group_left(nodename) node_uname_info < 0 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.4 - annotations: - description: > - Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} - will be full in less than 5 days assuming the average fill-up - rate of the past 48 hours. - -- name: pools - rules: - - alert: pool full - expr: | - ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) - * on(pool_id) group_right ceph_pool_metadata * 100 > 90 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.9.1 - annotations: - description: "Pool {{ $labels.name }} at {{ $value | humanize }}% capacity." 
- - - alert: pool filling up - expr: | - ( - predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5) - >= ceph_pool_stored + ceph_pool_max_avail - ) * on(pool_id) group_left(name) ceph_pool_metadata - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.9.2 - annotations: - description: > - Pool {{ $labels.name }} will be full in less than 5 days - assuming the average fill-up rate of the past 48 hours. - -- name: healthchecks - rules: - - alert: Slow OSD Ops - expr: ceph_healthcheck_slow_ops > 0 - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" - -- name: ceph exporter - rules: - - alert: CephMgrExporterDown - expr: up{job="ceph_mgr_exporter"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: > - the Ceph-Manager-Exporter is down. - message: CEPH target down for more than 1m, please check diff --git a/environments/kolla/files/overlays/prometheus/elasticsearch.rules b/environments/kolla/files/overlays/prometheus/elasticsearch.rules deleted file mode 100644 index 4c0dbcacc..000000000 --- a/environments/kolla/files/overlays/prometheus/elasticsearch.rules +++ /dev/null @@ -1,99 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Elasticsearch - rules: - - - alert: ElasticsearchHeapUsageTooHigh - expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 - for: 2m - labels: - severity: critical - annotations: - summary: "Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})" - description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchHeapUsageWarning - expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 85 - for: 2m - labels: - severity: warning - annotations: - summary: "Elasticsearch Heap Usage warning (instance {{ $labels.instance }})" - description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchDiskOutOfSpace - expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch disk out of space (instance {{ $labels.instance }})" - description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchDiskSpaceLow - expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 - for: 2m - labels: - severity: warning - annotations: - summary: "Elasticsearch disk space low (instance {{ $labels.instance }})" - description: "The disk usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchClusterRed - expr: elasticsearch_cluster_health_status{color="red"} == 1 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch Cluster Red (instance {{ $labels.instance }})" - description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchClusterYellow - expr: elasticsearch_cluster_health_status{color="yellow"} == 1 - for: 0m - labels: - severity: warning - annotations: - summary: "Elasticsearch Cluster Yellow (instance {{ $labels.instance }})" - description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" - - - alert: ElasticsearchRelocatingShardsTooLong - expr: elasticsearch_cluster_health_relocating_shards > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch relocating shards too long (instance {{ $labels.instance }})" - description: "Elasticsearch has been relocating shards for 15min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchInitializingShardsTooLong - expr: elasticsearch_cluster_health_initializing_shards > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch initializing shards too long (instance {{ $labels.instance }})" - description: "Elasticsearch has been initializing shards for 15 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchUnassignedShards - expr: elasticsearch_cluster_health_unassigned_shards > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch unassigned shards (instance {{ $labels.instance }})" - description: "Elasticsearch has unassigned shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchPendingTasks - expr: elasticsearch_cluster_health_number_of_pending_tasks > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch pending tasks (instance {{ $labels.instance }})" - description: "Elasticsearch has pending tasks. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/haproxy.rules b/environments/kolla/files/overlays/prometheus/haproxy.rules deleted file mode 100644 index c95db6fde..000000000 --- a/environments/kolla/files/overlays/prometheus/haproxy.rules +++ /dev/null @@ -1,76 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: HAProxy - rules: - - - alert: HaproxyDown - expr: haproxy_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy down (instance {{ $labels.instance }})" - description: "HAProxy down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyBackendDown - expr: haproxy_backend_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy backend down (instance {{ $labels.instance }})" - description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyServerDown - expr: haproxy_server_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy server down (instance {{ $labels.instance }})" - description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyServerResponseErrors - expr: sum(rate(haproxy_server_response_errors_total[1m])) BY (server) / sum(rate(haproxy_server_http_responses_total[1m])) - BY (server) * 100 > 5 - for: 1m - labels: - severity: critical - annotations: - description: HAProxy server response errors ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyServerConnectionErrors - expr: sum(rate(haproxy_server_connection_errors_total[1m])) BY (server) > 100 - for: 2m - labels: - severity: critical - annotations: - description: HAProxy server connection errors ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyBackendMaxActiveSession - expr: ((sum(avg_over_time(haproxy_backend_max_sessions[2m])) BY (haproxy_backend_sessions_total) / sum(avg_over_time(haproxy_backend_limit_sessions[2m])) BY (haproxy_backend_sessions_total)) - * 100) > 80 - for: 2m - labels: - severity: 
warning - annotations: - description: HAProxy backend max active session ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyHttpSlowingDown - expr: avg(haproxy_backend_http_total_time_average_seconds) BY (haproxy_backend_sessions_total) > 7 - for: 5m - labels: - severity: warning - annotations: - description: HAProxy HTTP slowing down ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyServerHealthcheckFailure - expr: increase(haproxy_server_check_failures_total[5m]) > 3 - for: 5m - labels: - severity: warning - annotations: - description: HAProxy server healthcheck failure ({{ $labels.server }} for backend {{ $labels.backend }}) diff --git a/environments/kolla/files/overlays/prometheus/mysql.rules b/environments/kolla/files/overlays/prometheus/mysql.rules deleted file mode 100644 index 63d39ebee..000000000 --- a/environments/kolla/files/overlays/prometheus/mysql.rules +++ /dev/null @@ -1,89 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Mysql - rules: - - - alert: MysqlDown - expr: mysql_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL down (instance {{ $labels.instance }})" - description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlTooManyConnections - expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL too many connections (instance {{ $labels.instance }})" - description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlHighThreadsRunning - expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL high threads running (instance {{ $labels.instance }})" - description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveIoThreadNotRunning - expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})" - description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveSqlThreadNotRunning - expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})" - description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveReplicationLag - expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL Slave replication lag (instance {{ $labels.instance }})" - description: "MysqL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlowQueries - expr: 
rate(mysql_global_status_slow_queries[5m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: "MySQL slow queries (instance {{ $labels.instance }})" - description: "MySQL server is having some slow queries.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlRestarted - expr: mysql_global_status_uptime < 60 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL restarted (instance {{ $labels.instance }})" - description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: mariadb_innodb_replication_fallen_behind - expr: (mysql_global_variables_innodb_replication_delay > 30) and ON(instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60 * 2) > 0) - for: 10m - labels: - severity: warning - annotations: - description: MySQL innodb replication is lagging on host {{ $labels.host_name }} - - diff --git a/environments/kolla/files/overlays/prometheus/openstack.rules b/environments/kolla/files/overlays/prometheus/openstack.rules deleted file mode 100644 index 942f710df..000000000 --- a/environments/kolla/files/overlays/prometheus/openstack.rules +++ /dev/null @@ -1,15 +0,0 @@ - - -groups: -- name: OpenStack - rules: - - alert: OpenStackServiceDown - expr: (sum({__name__=~"openstack.+_state", job="openstack_exporter"} == 0) by (hostname, service)) - for: 1m - labels: - severity: alert - annotations: - summary: "{{ $labels.service }} at {{ $labels.instance }} is down" - description: "OpenStack service {{ $labels.service }} at {{ $labels.instance }} is down" - - diff --git a/environments/kolla/files/overlays/prometheus/prometheus.rules b/environments/kolla/files/overlays/prometheus/prometheus.rules deleted file mode 100644 index d446894b2..000000000 --- a/environments/kolla/files/overlays/prometheus/prometheus.rules +++ /dev/null @@ -1,225 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Prometheus - rules: - - - alert: PrometheusTargetMissing - expr: up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Prometheus target missing (instance {{ $labels.instance }})" - description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAllTargetsMissing - expr: count by (job) (up) == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus all targets missing (instance {{ $labels.instance }})" - description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})" - description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus too many restarts (instance {{ $labels.instance }})" - description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})" - description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})" - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - # - alert: PrometheusAlertmanagerE2eDeadManSwitch - # expr: vector(1) - # for: 1m - # labels: - # severity: critical - # annotations: - # summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})" - # description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})" - description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})" - description: "Prometheus rule evaluation took more time than the scheduled interval. 
I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus notifications backlog (instance {{ $labels.instance }})" - description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})" - description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus target empty (instance {{ $labels.instance }})" - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9",interval="1m0s"} > 61 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus target scraping slow (instance {{ $labels.instance }})" - description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus large scrape (instance {{ $labels.instance }})" - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetScrapeDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})" - description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: 
PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbReloadFailures - expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/rabbitmq.rules b/environments/kolla/files/overlays/prometheus/rabbitmq.rules deleted file mode 100644 index 978145d11..000000000 --- a/environments/kolla/files/overlays/prometheus/rabbitmq.rules +++ /dev/null @@ -1,69 +0,0 @@ - - -groups: -- name: rabbitmq.rules - rules: - - alert: RabbitmqNodeDown - expr: sum(rabbitmq_build_info{host_name!=""}) < 3 - for: 30m - labels: - severity: critical - annotations: - description: Rabbitmq node down on {{ $labels.host_name }} - - alert: RabbitmqConsumersLowUtilization - expr: rabbitmq_queue_consumer_utilisation < 0.4 - for: 5m - labels: - severity: warning - annotations: - description: RabbitMQ consumers message consumption speed is low on {{ $labels.host_name }} - - alert: RabbitmqNodeNotDistributed - expr: erlang_vm_dist_node_state < 3 - for: 5m - labels: - severity: critical - annotations: - description: Rabbitmq node not distributed on node {{ $labels.host_name }} - - alert: RabbitmqMemoryHigh - expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes - * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq memory too high on {{ $labels.host_name }} - - alert: RabbitmqFileDescriptorsUsage - expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq file descriptors usage on {{ $labels.host_name }} - - alert: RabbitmqTooMuchUnack - expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000 - for: 1m - labels: - severity: warning - annotations: - description: Rabbitmq too much unack on {{ $labels.host_name }} - - alert: RabbitmqTooMuchConnections - expr: rabbitmq_connections > 1000 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq too much connections on {{ $labels.host_name }} - - alert: RabbitmqNoQueueConsumer - expr: rabbitmq_queue_consumers < 1 - for: 1m - labels: - severity: warning - annotations: - description: Rabbitmq no queue 
consumer on {{ $labels.host_name }} - - alert: RabbitmqUnroutableMessages - expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq unroutable messages on {{ $labels.host_name }} diff --git a/environments/kolla/files/overlays/prometheus/system.rules b/environments/kolla/files/overlays/prometheus/system.rules deleted file mode 100644 index 65514d0c8..000000000 --- a/environments/kolla/files/overlays/prometheus/system.rules +++ /dev/null @@ -1,79 +0,0 @@ - - -groups: -- name: Node - rules: - - - alert: LowDiskSpace - expr: ( ( node_filesystem_free_bytes - node_filesystem_avail_bytes ) / node_filesystem_free_bytes ) * 100 >= 80 - for: 1m - labels: - severity: alert - annotations: - summary: "Prometheus exporter at {{ $labels.instance }} reports low disk space" - description: "{{ $labels.device }} is {{ $value }}% full." - - - alert: LowMemory - expr: ( ( node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total)) / (node_memory_MemTotal_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total))) * 100 >= 80 - for: 1m - labels: - severity: alert - annotations: - summary: "Prometheus exporter at {{ $labels.instance }} reports low memory" - description: "Memory is {{ $value }}% full." - - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Host OOM kill detected (instance {{ $labels.instance }})" - description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: Overheating - expr: node_hwmon_temp_celsius >= 95 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus exporter at {{ $labels.instance }} reports overheating" - description: "Sensor {{ $labels.chip }} reports {{ $value }} degrees celcius." - - - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_crit_alarm_celsius == 1 - for: 5m - labels: - severity: critical - annotations: - summary: "Host node overtemperature alarm (instance {{ $labels.instance }})" - description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: InstanceDown - expr: up{job="node"} == 0 - for: 1m - labels: - severity: alert - annotations: - summary: "Instance {{$labels.instance}} down" - description: "{{$labels.instance}} has been down for more than 5 minutes." 
-
-  - alert: HostEdacCorrectableErrorsDetected
-    expr: increase(node_edac_correctable_errors_total[5m]) > 0
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
-      description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
-
-  - alert: HostEdacUncorrectableErrorsDetected
-    expr: node_edac_uncorrectable_errors_total > 0
-    for: 5m
-    labels:
-      severity: critical
-    annotations:
-      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
-      description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
-
-
diff --git a/playbooks/deploy.yml b/playbooks/deploy.yml
index 6aba15859..9c5233a74 100644
--- a/playbooks/deploy.yml
+++ b/playbooks/deploy.yml
@@ -29,6 +29,7 @@
     _nutshell: "{{ nutshell | default(false) | bool }}"
     _tempest: "{{ tempest | default(false) | bool }}"
+    _prometheus_alert_status: "{{ prometheus_alert_status | default(false) | bool }}"
 
     _ceph_stack: "{{ ceph_stack | default('ceph-ansible') }}"
 
@@ -235,3 +236,11 @@
         - not manual_deploy | bool
         - _tempest | bool
       changed_when: true
+
+    - name: Check prometheus alert status
+      ansible.builtin.command:
+        cmd: "ssh -i {{ terraform_path }}/.id_rsa.{{ cloud_env }} dragon@{{ manager_host }} /opt/configuration/scripts/check/303-prometheus-alert-status.sh"
+      when:
+        - not manual_deploy | bool
+        - _prometheus_alert_status | bool
+      changed_when: true
diff --git a/scripts/check/303-prometheus-alert-status.sh b/scripts/check/303-prometheus-alert-status.sh
new file mode 100755
index 000000000..3f5664920
--- /dev/null
+++ b/scripts/check/303-prometheus-alert-status.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -x
+set -e
+
+echo
+echo "# Checking for active prometheus alerts"
+echo
+
+osism apply prometheus-alert-status