From c42bcbefb49e70ce75081ee195b870abe89df6ed Mon Sep 17 00:00:00 2001 From: abhishek-dbz Date: Fri, 7 Nov 2025 21:45:12 +0530 Subject: [PATCH 1/2] docs: add comprehensive troubleshooting section to README - Add troubleshooting section with common issues and solutions - Include cluster connectivity problems and DNS resolution timeouts - Add guidance for alerts/notifications not working - Include memory usage and configuration reload issues - Provide practical examples and commands for debugging This helps users quickly resolve common operational issues without needing to search through multiple documentation sources. Signed-off-by: abhishek-dbz --- README.md | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/README.md b/README.md index 2fdfdd9768..8299b30365 100644 --- a/README.md +++ b/README.md @@ -404,6 +404,91 @@ alerting: If running Alertmanager in high availability mode is not desired, setting `--cluster.listen-address=` prevents Alertmanager from listening to incoming peer requests. +## Troubleshooting + +### Common Issues and Solutions + +#### Cluster peers not connecting + +**Symptoms:** Alertmanager instances cannot discover each other in cluster mode. 
+ +**Solutions:** +- Verify that both UDP and TCP ports are open on `--cluster.listen-address` (default port: 9094) +- Check firewall rules and ensure the clustering port is whitelisted for both protocols +- Verify `--cluster.advertise-address` is set correctly and reachable from other peers +- Use `--cluster.peer` flag to explicitly specify initial peers +- Check logs for DNS resolution errors, especially if using hostnames +- Increase `--cluster.peers-resolve-timeout` if DNS lookups are slow (default: 15s) + +Example of correct cluster setup: +```bash +# Node 1 +./alertmanager --cluster.listen-address=0.0.0.0:9094 \ + --cluster.advertise-address=192.168.1.10:9094 \ + --cluster.peer=192.168.1.11:9094 + +# Node 2 +./alertmanager --cluster.listen-address=0.0.0.0:9094 \ + --cluster.advertise-address=192.168.1.11:9094 \ + --cluster.peer=192.168.1.10:9094 +``` + +#### Alerts not being received + +**Symptoms:** Prometheus is sending alerts but Alertmanager shows no alerts. + +**Solutions:** +- Verify Alertmanager is reachable from Prometheus: `curl http://<alertmanager-host>:9093/-/healthy` +- Check that the Prometheus alerting configuration points to the correct Alertmanager endpoints +- Review Prometheus logs for connection errors to Alertmanager +- Ensure alerts are actually firing in Prometheus: check the `/alerts` page +- Verify that no firewall is blocking traffic between Prometheus and Alertmanager + +#### Notifications not being sent + +**Symptoms:** Alerts appear in Alertmanager UI but notifications are not delivered. + +**Solutions:** +- Check Alertmanager logs for errors related to notification delivery +- Verify receiver configuration in `alertmanager.yml` is correct +- Test receiver credentials and endpoints manually +- Check if alerts are being inhibited or silenced +- Verify routing configuration matches alert labels +- Use `amtool config routes test` to verify routing logic + +#### High memory usage + +**Symptoms:** Alertmanager consuming excessive memory. 
+ +**Solutions:** +- Check for alert storms (a large number of unique alert groups) +- Review `group_by` labels in routing configuration +- Consider coarser grouping (fewer or lower-cardinality `group_by` labels) to reduce alert group count +- Monitor notification log size and configure retention as needed +- Check for a large number of active silences + +#### DNS resolution timeouts + +**Symptoms:** Alertmanager becomes unresponsive, readiness checks fail. + +**Solutions:** +- Increase `--cluster.peers-resolve-timeout` (default: 15s) +- Use IP addresses instead of hostnames in `--cluster.peer` flags +- Check DNS server responsiveness and network connectivity +- Review DNS resolution logs in Alertmanager output +- Consider using a local DNS cache + +#### Configuration reload fails + +**Symptoms:** Configuration changes don't take effect or Alertmanager fails to reload. + +**Solutions:** +- Validate configuration before reload: `amtool check-config alertmanager.yml` +- Check Alertmanager logs for specific configuration errors +- Verify file permissions on the configuration file +- Ensure template files referenced in config exist and are readable +- Send SIGHUP signal manually: `kill -HUP <alertmanager-pid>` + ## Contributing Check the [Prometheus contributing page](https://github.com/prometheus/prometheus/blob/main/CONTRIBUTING.md). From 4fbc391f0b5f5ab9378a69d3c0820e893aba7f8c Mon Sep 17 00:00:00 2001 From: Guido Trotter Date: Fri, 7 Nov 2025 15:46:49 +0000 Subject: [PATCH 2/2] Fix silence_bench_test FAIL (#4709) In 92ecf8b93b6f8fa7cce0fb329de86a1133362765 silence_bench_test.go was left behind since it's not run automatically, and started failing. Fix by passing a new registry when creating Silences. 
Signed-off-by: Guido Trotter Co-authored-by: Guido Trotter Signed-off-by: abhishek-dbz --- silence/silence_bench_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/silence/silence_bench_test.go b/silence/silence_bench_test.go index fbfd22295c..9f3a32c1c4 100644 --- a/silence/silence_bench_test.go +++ b/silence/silence_bench_test.go @@ -50,7 +50,7 @@ func BenchmarkMutes(b *testing.B) { } func benchmarkMutes(b *testing.B, n int) { - silences, err := New(Options{}) + silences, err := New(Options{Metrics: prometheus.NewRegistry()}) require.NoError(b, err) clock := quartz.NewMock(b) @@ -105,7 +105,7 @@ func BenchmarkQuery(b *testing.B) { } func benchmarkQuery(b *testing.B, numSilences int) { - s, err := New(Options{}) + s, err := New(Options{Metrics: prometheus.NewRegistry()}) require.NoError(b, err) clock := quartz.NewMock(b) @@ -199,7 +199,7 @@ func BenchmarkQueryParallel(b *testing.B) { } func benchmarkQueryParallel(b *testing.B, numSilences, numGoroutines int) { - s, err := New(Options{}) + s, err := New(Options{Metrics: prometheus.NewRegistry()}) require.NoError(b, err) clock := quartz.NewMock(b) @@ -278,7 +278,7 @@ func BenchmarkQueryWithConcurrentAdds(b *testing.B) { } func benchmarkQueryWithConcurrentAdds(b *testing.B, initialSilences int, addRatio float64) { - s, err := New(Options{}) + s, err := New(Options{Metrics: prometheus.NewRegistry()}) require.NoError(b, err) clock := quartz.NewMock(b) @@ -375,7 +375,7 @@ func BenchmarkMutesParallel(b *testing.B) { } func benchmarkMutesParallel(b *testing.B, numSilences, numGoroutines int) { - silences, err := New(Options{}) + silences, err := New(Options{Metrics: prometheus.NewRegistry()}) require.NoError(b, err) clock := quartz.NewMock(b)