Skip to content

Commit 86315ff

Browse files
authored
Log and report kafka metrics after producing them (#155)
Currently we report kafka metrics in .Produce(). However, there are scenarios where kafka delivery fails and we don't record those failures. This change reports metrics based on producer delivery events instead.
1 parent e902968 commit 86315ff

File tree

1 file changed

+29
-15
lines changed

1 file changed

+29
-15
lines changed

datastore/kafka/kafka.go

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@ type Producer struct {
2727

2828
// Metrics stores metrics reported from this package
2929
type Metrics struct {
30-
produceCount adapter.Counter
31-
byteTotal adapter.Counter
32-
errorCount adapter.Counter
30+
produceCount adapter.Counter
31+
bytesTotal adapter.Counter
32+
produceAckCount adapter.Counter
33+
bytesAckTotal adapter.Counter
34+
errorCount adapter.Counter
3335
}
3436

3537
var (
@@ -57,9 +59,7 @@ func NewProducer(config *kafka.ConfigMap, namespace string, reliableAckWorkers i
5759
reliableAck: reliableAckWorkers > 0,
5860
}
5961

60-
for i := 0; i < reliableAckWorkers; i++ {
61-
go producer.handleProducerEvents(ackChan)
62-
}
62+
go producer.handleProducerEvents(ackChan)
6363
producer.logger.ActivityLog("kafka_registered", logrus.LogInfo{"namespace": namespace})
6464
return producer, nil
6565
}
@@ -74,21 +74,18 @@ func (p *Producer) Produce(entry *telemetry.Record) {
7474
Key: []byte(entry.Vin),
7575
Headers: headersFromRecord(entry),
7676
Timestamp: time.Now(),
77+
Opaque: entry,
7778
}
7879

7980
// Note: confluent kafka supports the concept of one channel per connection, so we could add those here and get rid of reliableAckWorkers
8081
// ex.: https://github.com/confluentinc/confluent-kafka-go/blob/master/examples/producer_custom_channel_example/producer_custom_channel_example.go#L79
81-
if p.reliableAck {
82-
msg.Opaque = entry
83-
}
8482
entry.ProduceTime = time.Now()
8583
if err := p.kafkaProducer.Produce(msg, nil); err != nil {
8684
p.logError(err)
8785
return
8886
}
89-
9087
metricsRegistry.produceCount.Inc(map[string]string{"record_type": entry.TxType})
91-
metricsRegistry.byteTotal.Add(int64(entry.Length()), map[string]string{"record_type": entry.TxType})
88+
metricsRegistry.bytesTotal.Add(int64(entry.Length()), map[string]string{"record_type": entry.TxType})
9289
}
9390

9491
// ReportError to airbrake and logger
@@ -113,9 +110,14 @@ func (p *Producer) handleProducerEvents(ackChan chan (*telemetry.Record)) {
113110
case kafka.Error:
114111
p.logError(fmt.Errorf("producer_error %v", ev))
115112
case *kafka.Message:
116-
record, ok := ev.Opaque.(*telemetry.Record)
117-
if ok {
118-
ackChan <- record
113+
entry, ok := ev.Opaque.(*telemetry.Record)
114+
if !ok {
115+
continue
116+
}
117+
metricsRegistry.produceAckCount.Inc(map[string]string{"record_type": entry.TxType})
118+
metricsRegistry.bytesAckTotal.Add(int64(entry.Length()), map[string]string{"record_type": entry.TxType})
119+
if p.reliableAck {
120+
ackChan <- entry
119121
}
120122
default:
121123
p.logger.ActivityLog("kafka_event_ignored", logrus.LogInfo{"event": ev.String()})
@@ -139,12 +141,24 @@ func registerMetrics(metricsCollector metrics.MetricCollector) {
139141
Labels: []string{"record_type"},
140142
})
141143

142-
metricsRegistry.byteTotal = metricsCollector.RegisterCounter(adapter.CollectorOptions{
144+
metricsRegistry.bytesTotal = metricsCollector.RegisterCounter(adapter.CollectorOptions{
143145
Name: "kafka_produce_total_bytes",
144146
Help: "The number of bytes produced to Kafka.",
145147
Labels: []string{"record_type"},
146148
})
147149

150+
metricsRegistry.produceAckCount = metricsCollector.RegisterCounter(adapter.CollectorOptions{
151+
Name: "kafka_produce_ack_total",
152+
Help: "The number of records produced to Kafka for which we got an ACK.",
153+
Labels: []string{"record_type"},
154+
})
155+
156+
metricsRegistry.bytesAckTotal = metricsCollector.RegisterCounter(adapter.CollectorOptions{
157+
Name: "kafka_produce_ack_total_bytes",
158+
Help: "The number of bytes produced to Kafka for which we got an ACK.",
159+
Labels: []string{"record_type"},
160+
})
161+
148162
metricsRegistry.errorCount = metricsCollector.RegisterCounter(adapter.CollectorOptions{
149163
Name: "kafka_err",
150164
Help: "The number of errors while producing to Kafka.",

0 commit comments

Comments
 (0)