diff --git a/.zuul.yaml b/.zuul.yaml index e60574a1a..0e2240749 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -74,6 +74,7 @@ parent: abstract-testbed-deploy vars: tempest: true + prometheus_alert_status: false - job: name: abstract-testbed-deploy-in-a-nutshell @@ -164,6 +165,7 @@ vars: manager_version: 8.1.0 tempst: true + prometheus_alert_status: false - job: name: testbed-deploy-stable-ubuntu-22.04 diff --git a/environments/generic/configuration.yml b/environments/generic/configuration.yml index b420780c9..b0aea9ca3 100644 --- a/environments/generic/configuration.yml +++ b/environments/generic/configuration.yml @@ -7,3 +7,9 @@ dotfiles_repo_version: main dotfiles_repo_local_destination: "~/dotfiles" dotfiles_files: - .tmux.conf + +prometheus_alert_status_filter: + alertname: + - PrometheusAlertmanagerE2eDeadManSwitch + severity: + - info diff --git a/environments/kolla/files/overlays/prometheus/cadvisor.rules b/environments/kolla/files/overlays/prometheus/cadvisor.rules deleted file mode 100644 index 63a9d8803..000000000 --- a/environments/kolla/files/overlays/prometheus/cadvisor.rules +++ /dev/null @@ -1,64 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Cadvisor - rules: - - - alert: ContainerKilled - expr: time() - container_last_seen > 61 - for: 5m - labels: - severity: warning - annotations: - summary: "Container killed (instance {{ $labels.instance }})" - description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerCpuUsage - expr: (sum(rate(container_cpu_usage_seconds_total{name=~".+"}[3m])) BY (instance, name) * 100) > 3000 - for: 2m - labels: - severity: warning - annotations: - summary: "Container CPU usage (instance {{ $labels.instance }})" - description: "Container CPU usage is above 300%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerMemoryUsage - expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / - 
sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 90 - for: 10m - labels: - severity: warning - annotations: - summary: Container Memory usage {{ $labels.name }} on Host {{ $labels.host_name}} - description: "Container Memory usage VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerVolumeUsage - expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80 - for: 1m - labels: - severity: warning - annotations: - summary: "Container Volume usage (instance {{ $labels.instance }})" - description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerVolumeIoUsage - expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "Container Volume IO usage (instance {{ $labels.instance }})" - description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: ContainerHighThrottleRate - expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Container high throttle rate (instance {{ $labels.instance }})" - description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/ceph.rules b/environments/kolla/files/overlays/prometheus/ceph.rules deleted file mode 100644 index 70b133588..000000000 --- a/environments/kolla/files/overlays/prometheus/ceph.rules +++ /dev/null @@ -1,340 +0,0 @@ -# Official set of upstream alerts https://github.com/ceph/ceph/blob/octopus/monitoring/prometheus/alerts/ceph_default_alerts.yml - - - -groups: -- name: cluster health - rules: - - alert: health error - expr: ceph_health_status == 2 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.2.1 - annotations: - description: > - Ceph in 
HEALTH_ERROR state for more than 5 minutes. - Please check "ceph health detail" for more information. - - - alert: health warn - expr: ceph_health_status == 1 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.2.2 - annotations: - description: > - Ceph has been in HEALTH_WARN for more than 15 minutes. - Please check "ceph health detail" for more information. - -- name: mon - rules: - - alert: low monitor quorum count - expr: sum(ceph_mon_quorum_status) < 3 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.3.1 - annotations: - description: | - Monitor count in quorum is below three. - - Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active. - - The following monitors are down: - {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - -- name: osd - rules: - - alert: 10% OSDs down - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.1 - annotations: - description: | - {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%). - - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - - alert: OSD down - expr: count(ceph_osd_up == 0) > 0 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.2 - annotations: - description: | - {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }} - {{ $value }} OSD{{ $s }} down for more than 15 minutes. 
- - {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down. - - The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - - - alert: OSDs near full - expr: | - ( - ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 100 > 90 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.3 - annotations: - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is - dangerously full: {{ $value | humanize }}% - - - alert: flapping OSD - expr: | - ( - rate(ceph_osd_up[5m]) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 60 > 1 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.4.4 - annotations: - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was - marked down and back up at {{ $value | humanize }} times once a - minute for 5 minutes. - - - alert: Ceph OSD Reweighted needed - expr: ceph_osd_weight < 1 - for: 1h - labels: - severity: warning - annotations: - description: > - Ceph OSD reweighted (OSD: {{ $labels.ceph_daemon }}) - - -# # alert on high deviation from average PG count -# - alert: high pg count deviation -# expr: | -# abs( -# ( -# (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) -# ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) -# ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 -# for: 5m -# labels: -# severity: warning -# type: ceph_default -# oid: 1.3.6.1.4.1.50495.15.1.2.4.5 -# annotations: -# description: > -# OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates -# by more than 30% from average PG count. 
- -- name: pgs - rules: - - alert: pgs inactive - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.1 - annotations: - description: > - {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. - Inactive placement groups aren't able to serve read/write - requests. - - - alert: pgs unclean - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 - for: 15m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.2 - annotations: - description: > - {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}. - Unclean PGs haven't been able to completely recover from a - previous failure. - - - alert: pgs down - expr: ceph_pg_down > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.3 - annotations: - description: > - Ceph PG down (Pool ID {{ $labels.pool_id }}) - - - alert: pgs incomplete - expr: ceph_pg_incomplete > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.4 - annotations: - description: > - PG incomplete (Pool ID {{ $labels.pool_id }}) - - - alert: pg inconsistant - expr: ceph_pg_inconsistent > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.5 - annotations: - description: > - pg inconsistant (Pool ID {{ $labels.pool_id }}) - - - alert: pg ActivationLong - expr: ceph_pg_activating > 0 - for: 2m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.6 - annotations: - description: > - pg activation too long (Pool ID {{ $labels.pool_id }}) - - - alert: pg backfill full - expr: ceph_pg_backfill_toofull > 0 - for: 2m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.7 - annotations: - 
description: > - pg backfill full (Pool ID {{ $labels.pool_id }}) - - - alert: CephPgUnavailable - expr: ceph_pg_total - ceph_pg_active > 0 - for: 1m - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.7.8 - annotations: - description: pg unavailable (Pool ID {{ $labels.pool_id }}) - -- name: nodes - rules: - - alert: root volume full - expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 - for: 5m - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.1 - annotations: - description: "Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free." - - # alert on nic packet errors and drops rates > 100 packet/s - #- alert: network packets dropped - # expr: irate(node_network_receive_drop_total{device!~"lo|breth2-ovs"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|breth2-ovs"}[5m]) > 100 - # labels: - # severity: warning - # type: ceph_default - # oid: 1.3.6.1.4.1.50495.15.1.2.8.2 - # annotations: - # description: > - # Node {{ $labels.instance }} experiences packet drop > 100 - # packet/s on interface {{ $labels.device }}. - - - alert: network packet errors - expr: | - irate(node_network_receive_errs_total{device!="lo"}[5m]) + - irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.3 - annotations: - description: > - Node {{ $labels.instance }} experiences packet errors > 1 - packet/s on interface {{ $labels.device }}. 
- - - alert: storage filling up - expr: | - predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) * - on(instance) group_left(nodename) node_uname_info < 0 - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.8.4 - annotations: - description: > - Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} - will be full in less than 5 days assuming the average fill-up - rate of the past 48 hours. - -- name: pools - rules: - - alert: pool full - expr: | - ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) - * on(pool_id) group_right ceph_pool_metadata * 100 > 90 - labels: - severity: critical - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.9.1 - annotations: - description: "Pool {{ $labels.name }} at {{ $value | humanize }}% capacity." - - - alert: pool filling up - expr: | - ( - predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5) - >= ceph_pool_stored + ceph_pool_max_avail - ) * on(pool_id) group_left(name) ceph_pool_metadata - labels: - severity: warning - type: ceph_default - oid: 1.3.6.1.4.1.50495.15.1.2.9.2 - annotations: - description: > - Pool {{ $labels.name }} will be full in less than 5 days - assuming the average fill-up rate of the past 48 hours. - -- name: healthchecks - rules: - - alert: Slow OSD Ops - expr: ceph_healthcheck_slow_ops > 0 - for: 30s - labels: - severity: warning - type: ceph_default - annotations: - description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" - -- name: ceph exporter - rules: - - alert: CephMgrExporterDown - expr: up{job="ceph_mgr_exporter"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: > - the Ceph-Manager-Exporter is down. 
- message: CEPH target down for more than 1m, please check diff --git a/environments/kolla/files/overlays/prometheus/elasticsearch.rules b/environments/kolla/files/overlays/prometheus/elasticsearch.rules deleted file mode 100644 index 4c0dbcacc..000000000 --- a/environments/kolla/files/overlays/prometheus/elasticsearch.rules +++ /dev/null @@ -1,99 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Elasticsearch - rules: - - - alert: ElasticsearchHeapUsageTooHigh - expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 - for: 2m - labels: - severity: critical - annotations: - summary: "Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})" - description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchHeapUsageWarning - expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 85 - for: 2m - labels: - severity: warning - annotations: - summary: "Elasticsearch Heap Usage warning (instance {{ $labels.instance }})" - description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchDiskOutOfSpace - expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch disk out of space (instance {{ $labels.instance }})" - description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchDiskSpaceLow - expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 - for: 2m - labels: - severity: warning - annotations: - summary: "Elasticsearch disk space low (instance {{ $labels.instance }})" - description: "The disk usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - 
- alert: ElasticsearchClusterRed - expr: elasticsearch_cluster_health_status{color="red"} == 1 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch Cluster Red (instance {{ $labels.instance }})" - description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchClusterYellow - expr: elasticsearch_cluster_health_status{color="yellow"} == 1 - for: 0m - labels: - severity: warning - annotations: - summary: "Elasticsearch Cluster Yellow (instance {{ $labels.instance }})" - description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchRelocatingShardsTooLong - expr: elasticsearch_cluster_health_relocating_shards > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch relocating shards too long (instance {{ $labels.instance }})" - description: "Elasticsearch has been relocating shards for 15min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchInitializingShardsTooLong - expr: elasticsearch_cluster_health_initializing_shards > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch initializing shards too long (instance {{ $labels.instance }})" - description: "Elasticsearch has been initializing shards for 15 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchUnassignedShards - expr: elasticsearch_cluster_health_unassigned_shards > 0 - for: 0m - labels: - severity: critical - annotations: - summary: "Elasticsearch unassigned shards (instance {{ $labels.instance }})" - description: "Elasticsearch has unassigned shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ElasticsearchPendingTasks - expr: elasticsearch_cluster_health_number_of_pending_tasks > 0 - for: 15m - labels: - severity: warning - annotations: - summary: "Elasticsearch pending tasks (instance {{ $labels.instance }})" - description: "Elasticsearch 
has pending tasks. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/haproxy.rules b/environments/kolla/files/overlays/prometheus/haproxy.rules deleted file mode 100644 index c95db6fde..000000000 --- a/environments/kolla/files/overlays/prometheus/haproxy.rules +++ /dev/null @@ -1,76 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: HAProxy - rules: - - - alert: HaproxyDown - expr: haproxy_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy down (instance {{ $labels.instance }})" - description: "HAProxy down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyBackendDown - expr: haproxy_backend_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy backend down (instance {{ $labels.instance }})" - description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyServerDown - expr: haproxy_server_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "HAProxy server down (instance {{ $labels.instance }})" - description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HaproxyServerResponseErrors - expr: sum(rate(haproxy_server_response_errors_total[1m])) BY (server) / sum(rate(haproxy_server_http_responses_total[1m])) - BY (server) * 100 > 5 - for: 1m - labels: - severity: critical - annotations: - description: HAProxy server response errors ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyServerConnectionErrors - expr: sum(rate(haproxy_server_connection_errors_total[1m])) BY (server) > 100 - for: 2m - labels: - severity: critical - annotations: - description: HAProxy server connection errors ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyBackendMaxActiveSession - expr: 
((sum(avg_over_time(haproxy_backend_max_sessions[2m])) BY (haproxy_backend_sessions_total) / sum(avg_over_time(haproxy_backend_limit_sessions[2m])) BY (haproxy_backend_sessions_total)) - * 100) > 80 - for: 2m - labels: - severity: warning - annotations: - description: HAProxy backend max active session ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyHttpSlowingDown - expr: avg(haproxy_backend_http_total_time_average_seconds) BY (haproxy_backend_sessions_total) > 7 - for: 5m - labels: - severity: warning - annotations: - description: HAProxy HTTP slowing down ({{ $labels.backend }} on instance {{ $labels.server }}) - - - alert: HaproxyServerHealthcheckFailure - expr: increase(haproxy_server_check_failures_total[5m]) > 3 - for: 5m - labels: - severity: warning - annotations: - description: HAProxy server healthcheck failure ({{ $labels.server }} for backend {{ $labels.backend }}) diff --git a/environments/kolla/files/overlays/prometheus/mysql.rules b/environments/kolla/files/overlays/prometheus/mysql.rules deleted file mode 100644 index 63d39ebee..000000000 --- a/environments/kolla/files/overlays/prometheus/mysql.rules +++ /dev/null @@ -1,89 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Mysql - rules: - - - alert: MysqlDown - expr: mysql_up == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL down (instance {{ $labels.instance }})" - description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlTooManyConnections - expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL too many connections (instance {{ $labels.instance }})" - description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value 
}}\n LABELS: {{ $labels }}" - - - alert: MysqlHighThreadsRunning - expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL high threads running (instance {{ $labels.instance }})" - description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveIoThreadNotRunning - expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})" - description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveSqlThreadNotRunning - expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})" - description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlaveReplicationLag - expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL Slave replication lag (instance {{ $labels.instance }})" - description: "MysqL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlSlowQueries - expr: rate(mysql_global_status_slow_queries[5m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: "MySQL slow queries (instance {{ $labels.instance }})" - description: "MySQL server is having some slow 
queries.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: MysqlRestarted - expr: mysql_global_status_uptime < 60 - for: 1m - labels: - severity: warning - annotations: - summary: "MySQL restarted (instance {{ $labels.instance }})" - description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: mariadb_innodb_replication_fallen_behind - expr: (mysql_global_variables_innodb_replication_delay > 30) and ON(instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60 * 2) > 0) - for: 10m - labels: - severity: warning - annotations: - description: MySQL innodb replication is lagging on host {{ $labels.host_name }} - - diff --git a/environments/kolla/files/overlays/prometheus/openstack.rules b/environments/kolla/files/overlays/prometheus/openstack.rules deleted file mode 100644 index 942f710df..000000000 --- a/environments/kolla/files/overlays/prometheus/openstack.rules +++ /dev/null @@ -1,15 +0,0 @@ - - -groups: -- name: OpenStack - rules: - - alert: OpenStackServiceDown - expr: (sum({__name__=~"openstack.+_state", job="openstack_exporter"} == 0) by (hostname, service)) - for: 1m - labels: - severity: alert - annotations: - summary: "{{ $labels.service }} at {{ $labels.instance }} is down" - description: "OpenStack service {{ $labels.service }} at {{ $labels.instance }} is down" - - diff --git a/environments/kolla/files/overlays/prometheus/prometheus.rules b/environments/kolla/files/overlays/prometheus/prometheus.rules deleted file mode 100644 index d446894b2..000000000 --- a/environments/kolla/files/overlays/prometheus/prometheus.rules +++ /dev/null @@ -1,225 +0,0 @@ -# Taken from https://awesome-prometheus-alerts.grep.to/rules - - - -groups: -- name: Prometheus - rules: - - - alert: PrometheusTargetMissing - expr: up == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Prometheus target missing (instance {{ 
$labels.instance }})" - description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAllTargetsMissing - expr: count by (job) (up) == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus all targets missing (instance {{ $labels.instance }})" - description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})" - description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus too many restarts (instance {{ $labels.instance }})" - description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})" - description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})" - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - # - alert: PrometheusAlertmanagerE2eDeadManSwitch - # expr: vector(1) - # for: 1m - # labels: - # severity: critical - # annotations: - # summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})" - # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})" - description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})" - description: "Prometheus rule evaluation took more time than the scheduled interval. 
I indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus notifications backlog (instance {{ $labels.instance }})" - description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})" - description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus target empty (instance {{ $labels.instance }})" - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9",interval="1m0s"} > 61 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus target scraping slow (instance {{ $labels.instance }})" - description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus large scrape (instance {{ $labels.instance }})" - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTargetScrapeDuplicate - expr: 
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})" - description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbReloadFailures - expr: 
increase(prometheus_tsdb_reloads_failures_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})" - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - diff --git a/environments/kolla/files/overlays/prometheus/rabbitmq.rules b/environments/kolla/files/overlays/prometheus/rabbitmq.rules deleted file mode 100644 index 978145d11..000000000 --- a/environments/kolla/files/overlays/prometheus/rabbitmq.rules +++ /dev/null @@ -1,69 +0,0 @@ - - -groups: -- name: rabbitmq.rules - rules: - - alert: RabbitmqNodeDown - expr: sum(rabbitmq_build_info{host_name!=""}) < 3 - for: 30m - labels: - severity: critical - annotations: - description: Rabbitmq node down on {{ $labels.host_name }} - - alert: RabbitmqConsumersLowUtilization - expr: rabbitmq_queue_consumer_utilisation < 0.4 - for: 5m - labels: - severity: warning - annotations: - description: RabbitMQ consumers message consumption speed is low on {{ $labels.host_name }} - - alert: RabbitmqNodeNotDistributed - expr: erlang_vm_dist_node_state < 3 - for: 5m - labels: - severity: critical - annotations: - description: Rabbitmq node 
not distributed on node {{ $labels.host_name }} - - alert: RabbitmqMemoryHigh - expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes - * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq memory too high on {{ $labels.host_name }} - - alert: RabbitmqFileDescriptorsUsage - expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq file descriptors usage on {{ $labels.host_name }} - - alert: RabbitmqTooMuchUnack - expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000 - for: 1m - labels: - severity: warning - annotations: - description: Rabbitmq too much unack on {{ $labels.host_name }} - - alert: RabbitmqTooMuchConnections - expr: rabbitmq_connections > 1000 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq too much connections on {{ $labels.host_name }} - - alert: RabbitmqNoQueueConsumer - expr: rabbitmq_queue_consumers < 1 - for: 1m - labels: - severity: warning - annotations: - description: Rabbitmq no queue consumer on {{ $labels.host_name }} - - alert: RabbitmqUnroutableMessages - expr: increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0 - for: 2m - labels: - severity: warning - annotations: - description: Rabbitmq unroutable messages on {{ $labels.host_name }} diff --git a/environments/kolla/files/overlays/prometheus/system.rules b/environments/kolla/files/overlays/prometheus/system.rules deleted file mode 100644 index 65514d0c8..000000000 --- a/environments/kolla/files/overlays/prometheus/system.rules +++ /dev/null @@ -1,79 +0,0 @@ - - -groups: -- name: Node - rules: - - - alert: LowDiskSpace - expr: ( ( node_filesystem_free_bytes - node_filesystem_avail_bytes ) / node_filesystem_free_bytes ) * 100 >= 80 - for: 1m - labels: - severity: alert - annotations: - summary: "Prometheus 
exporter at {{ $labels.instance }} reports low disk space" - description: "{{ $labels.device }} is {{ $value }}% full." - - - alert: LowMemory - expr: ( ( node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total)) / (node_memory_MemTotal_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total))) * 100 >= 80 - for: 1m - labels: - severity: alert - annotations: - summary: "Prometheus exporter at {{ $labels.instance }} reports low memory" - description: "Memory is {{ $value }}% full." - - - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Host OOM kill detected (instance {{ $labels.instance }})" - description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: Overheating - expr: node_hwmon_temp_celsius >= 95 - for: 1m - labels: - severity: warning - annotations: - summary: "Prometheus exporter at {{ $labels.instance }} reports overheating" - description: "Sensor {{ $labels.chip }} reports {{ $value }} degrees celcius." - - - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_crit_alarm_celsius == 1 - for: 5m - labels: - severity: critical - annotations: - summary: "Host node overtemperature alarm (instance {{ $labels.instance }})" - description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: InstanceDown - expr: up{job="node"} == 0 - for: 1m - labels: - severity: alert - annotations: - summary: "Instance {{$labels.instance}} down" - description: "{{$labels.instance}} has been down for more than 5 minutes." 
- - - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})" - description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - - alert: HostEdacUncorrectableErrorsDetected - expr: node_edac_uncorrectable_errors_total > 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})" - description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" - - diff --git a/playbooks/deploy.yml b/playbooks/deploy.yml index 6aba15859..9c5233a74 100644 --- a/playbooks/deploy.yml +++ b/playbooks/deploy.yml @@ -29,6 +29,7 @@ _nutshell: "{{ nutshell | default(false) | bool }}" _tempest: "{{ tempest | default(false) | bool }}" + _prometheus_alert_status: "{{ prometheus_alert_status | default(false) | bool }}" _ceph_stack: "{{ ceph_stack | default('ceph-ansible') }}" @@ -235,3 +236,12 @@ - not manual_deploy | bool - _tempest | bool changed_when: true + + # Gated by the job variable prometheus_alert_status, normalised above into _prometheus_alert_status (same pattern as _tempest). + - name: Check prometheus alert status + ansible.builtin.command: + cmd: "ssh -i {{ terraform_path }}/.id_rsa.{{ cloud_env }} dragon@{{ manager_host }} /opt/configuration/scripts/check/303-prometheus-alert-status.sh" + when: + - not manual_deploy | bool + - _prometheus_alert_status | bool + changed_when: true diff --git a/scripts/check/303-prometheus-alert-status.sh b/scripts/check/303-prometheus-alert-status.sh new file mode 100755 index 000000000..3f5664920 --- /dev/null +++ b/scripts/check/303-prometheus-alert-status.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -x +set -e + +echo +echo "# Checking for active
prometheus alerts" +echo + +osism apply prometheus-alert-status