Skip to content

Commit 21f60cf

Browse files
memberlist WriteTo: Track dropped packets properly (#611)
- Do not increment `packets_sent_errors_total` for dropped packets; count them in a new `packets_dropped_total` metric instead, so that they can be isolated.
- Log these drops at debug level instead of warn. The messages are nearly all the same, so they do not need to be at "warn" level.
1 parent d3f80b0 commit 21f60cf

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

kv/memberlist/tcp_transport.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ type TCPTransport struct {
123123
sentPackets prometheus.Counter
124124
sentPacketsBytes prometheus.Counter
125125
sentPacketsErrors prometheus.Counter
126+
droppedPackets prometheus.Counter
126127
unknownConnections prometheus.Counter
127128
}
128129

@@ -463,8 +464,9 @@ func (t *TCPTransport) WriteTo(b []byte, addr string) (time.Time, error) {
463464
// If this blocks for too long (as configured), abort and log an error.
464465
select {
465466
case <-time.After(t.cfg.AcquireWriterTimeout):
466-
level.Warn(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr)
467-
t.sentPacketsErrors.Inc()
467+
// Dropped packets are not an issue, the memberlist protocol will retry later.
468+
level.Debug(t.logger).Log("msg", "WriteTo failed to acquire a writer. Dropping message", "timeout", t.cfg.AcquireWriterTimeout, "addr", addr)
469+
t.droppedPackets.Inc()
468470
// WriteTo is used to send "UDP" packets. Since we use TCP, we can detect more errors,
469471
// but memberlist library doesn't seem to cope with that very well. That is why we return nil instead.
470472
return time.Now(), nil
@@ -681,6 +683,13 @@ func (t *TCPTransport) registerMetrics(registerer prometheus.Registerer) {
681683
Help: "Number of errors when receiving memberlist packets",
682684
})
683685

686+
t.droppedPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{
687+
Namespace: t.cfg.MetricsNamespace,
688+
Subsystem: subsystem,
689+
Name: "packets_dropped_total",
690+
Help: "Number of dropped memberlist packets. These packets were not sent due to timeout waiting for a writer.",
691+
})
692+
684693
t.sentPackets = promauto.With(registerer).NewCounter(prometheus.CounterOpts{
685694
Namespace: t.cfg.MetricsNamespace,
686695
Subsystem: subsystem,

0 commit comments

Comments
 (0)