From 0be7d5b1c808d6e08924caa9090b0a21f2b76137 Mon Sep 17 00:00:00 2001 From: Steven <64777084+sfawcett123@users.noreply.github.com> Date: Tue, 10 Nov 2020 14:17:33 +0000 Subject: [PATCH] Added templating to slack alerts (#363) * Added templating to slack alerts * Corrected Messages --- monitoring/alertmanager/alertmanager.yml.tmpl | 17 -------------- monitoring/alertmanager/manifest.yml | 5 ---- monitoring/prometheus/alert.rules | 11 ++++----- terraform/monitoring/modules.tf | 13 ++++++----- terraform/monitoring/testAlert.sh | 23 +++++++++++++++++++ 5 files changed, 34 insertions(+), 35 deletions(-) delete mode 100644 monitoring/alertmanager/alertmanager.yml.tmpl delete mode 100644 monitoring/alertmanager/manifest.yml create mode 100755 terraform/monitoring/testAlert.sh diff --git a/monitoring/alertmanager/alertmanager.yml.tmpl b/monitoring/alertmanager/alertmanager.yml.tmpl deleted file mode 100644 index c4b3bc307..000000000 --- a/monitoring/alertmanager/alertmanager.yml.tmpl +++ /dev/null @@ -1,17 +0,0 @@ -global: - resolve_timeout: 1m - slack_api_url: '${slack_url}' - -route: - receiver: 'slack-notifications' - -receivers: -- name: 'slack-notifications' - slack_configs: - - channel: '${slack_channel}' - send_resolved: true - title: "{{ range .Alerts }}{{ .Annotations.summary }}\n{{ end }}" - title_link: 'https://alertmanager-dev-get-into-teaching.london.cloudapps.digital/#/alerts' - pretext: 'An alert has been raised by AlertManager' - text: 'https://dfedigital.atlassian.net/wiki/spaces/GGIT/pages/2120941584/Alerts/{{ .GroupLabels.app }}/{{ .GroupLabels.alertname }}' - icon_url: 'https://beta-getintoteaching.education.gov.uk/packs/media/images/getintoteachinglogo-be08cb9f14ade16ed1967b5a655c7e13.svg' diff --git a/monitoring/alertmanager/manifest.yml b/monitoring/alertmanager/manifest.yml deleted file mode 100644 index 8680df7e8..000000000 --- a/monitoring/alertmanager/manifest.yml +++ /dev/null @@ -1,5 +0,0 @@ -applications: - - name: teaching-vacancies-alertmanager-staging - buildpacks: - - binary_buildpack - command: ./alertmanager --web.listen-address=:${PORT} --config.file=alertmanager.yml diff --git a/monitoring/prometheus/alert.rules b/monitoring/prometheus/alert.rules index 6807c9a1d..9ca2376eb 100644 --- a/monitoring/prometheus/alert.rules +++ b/monitoring/prometheus/alert.rules @@ -25,12 +25,6 @@ groups: severity: medium annotations: summary: Alert when memory utilization is over 70%. - - alert: StevesHighCpuTest - expr: 'max(cpu{app="get-into-teaching-app-dev"}) > 0' - labels: - severity: medium - annotations: - summary: Alert when max CPU utilization is over 70%. - name: TTA rules: - alert: TooManyRequests @@ -64,9 +58,12 @@ groups: sum(increase(http_requests_received_total{controller=~".+",action=~".+",code=~"429"}[1m])) > 0 labels: - severity: high + severity: medium annotations: summary: Alert when any client hits a rate limit. + runbook: https://dfedigital.atlassian.net/wiki/spaces/GGIT/pages/2152497153/Rate+Limit + dashboard: https://grafana-prod-get-into-teaching.london.cloudapps.digital/d/28EURzZGz/get-into-teaching-api?viewPanel=60&orgId=1&var-App=get-into-teaching-api-prod + description: The API has recieved too many requests, please read the runbook for advice on what action should be taken. - alert: FailedJobs expr: 'sum(increase(api_hangfire_jobs{state="failed"}[1m])) > 0' labels: diff --git a/terraform/monitoring/modules.tf b/terraform/monitoring/modules.tf index 0b7219ac9..df3a0bc98 100644 --- a/terraform/monitoring/modules.tf +++ b/terraform/monitoring/modules.tf @@ -21,7 +21,7 @@ locals { } module "prometheus" { - source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/prometheus?ref=monitoring-terraform-0_13-tv" + source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/prometheus" paas_prometheus_exporter_endpoint = module.paas_prometheus_exporter.endpoint monitoring_space_id = data.cloudfoundry_space.space.id monitoring_instance_name = local.monitoring_org_name @@ -34,13 +34,13 @@ module "prometheus" { } module "influx" { - source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/influxdb?ref=monitoring-terraform-0_13-tv" + source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/influxdb" monitoring_space_id = data.cloudfoundry_space.space.id monitoring_instance_name = local.monitoring_org_name } module "paas_prometheus_exporter" { - source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/paas_prometheus_exporter?ref=monitoring-terraform-0_13-tv" + source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/paas_prometheus_exporter" monitoring_space_id = data.cloudfoundry_space.space.id monitoring_instance_name = local.monitoring_org_name paas_username = var.paas_exporter_username @@ -49,7 +49,7 @@ module "paas_prometheus_exporter" { module "grafana" { - source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/grafana?ref=monitoring-terraform-0_13-tv" + source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/grafana" monitoring_space_id = data.cloudfoundry_space.space.id monitoring_instance_name = "${var.environment}-${var.grafana["name"]}" prometheus_endpoint = module.prometheus.endpoint @@ -64,8 +64,9 @@ module "grafana" { } module "alertmanager" { - source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/alertmanager?ref=monitoring-terraform-0_13-tv" + source = "git::https://github.com/DFE-Digital/bat-platform-building-blocks.git//terraform/modules/alertmanager?ref=monitoring/alertmanager/templates" monitoring_space_id = data.cloudfoundry_space.space.id monitoring_instance_name = "${var.environment}-${var.alertmanager["name"]}" - config = templatefile(var.alertmanager["config"], local.template_variable_map) + slack_url = var.alertmanager_slack_url + slack_channel = var.alertmanager_slack_channel } diff --git a/terraform/monitoring/testAlert.sh b/terraform/monitoring/testAlert.sh new file mode 100755 index 000000000..282fbdcdf --- /dev/null +++ b/terraform/monitoring/testAlert.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +name=$RANDOM +name=STEVE +url='https://alertmanager-dev-get-into-teaching.london.cloudapps.digital:/api/v1/alerts' + +echo "firing up alert $name" + +# change url o +curl -XPOST $url -d "[{ + \"status\": \"firing\", + \"labels\": { + \"severity\":\"medium\" + }, + \"annotations\": { + \"summary\": \"This is a test of the alerting system on development, this test can be ignored.\", + \"runbook\": \"https://dfedigital.atlassian.net/wiki/spaces/GGIT/pages/2152595459/Test+Page\", + \"dashboard\": \"https://grafana-dev-get-into-teaching.london.cloudapps.digital/d/qZjcqcpGz/csp-violations?orgId=1\", + \"description\": \"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\" + }, + \"generatorURL\": \"http://prometheus.int.example.net/${name}\" +}]" +