diff --git a/core/consensus/component.go b/core/consensus/component.go index 91c6ff13e..0fcf35d9f 100644 --- a/core/consensus/component.go +++ b/core/consensus/component.go @@ -445,15 +445,31 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error) } // Instrument consensus instance. - var decided bool + var ( + decided bool + nodes = len(c.peers) + ) + decideCallback := func(qcommit []qbft.Msg[core.Duty, [32]byte]) { + round := qcommit[0].Round() decided = true - decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(qcommit[0].Round())) + decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(round)) inst.decidedAtCh <- time.Now() + + leaderIndex := leader(duty, round, nodes) + leaderName := c.peers[leaderIndex].Name + log.Debug(ctx, "QBFT consensus decided", + z.Str("duty", duty.Type.String()), + z.U64("slot", duty.Slot), + z.I64("round", round), + z.I64("leader_index", leaderIndex), + z.Str("leader_name", leaderName)) + + decidedLeaderGauge.WithLabelValues(duty.Type.String()).Set(float64(leaderIndex)) } // Create a new qbft definition for this instance. - def := newDefinition(len(c.peers), c.subscribers, roundTimer, decideCallback) + def := newDefinition(nodes, c.subscribers, roundTimer, decideCallback) // Create a new transport that handles sending and receiving for this instance. t := transport{ @@ -479,7 +495,7 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error) } // Run the algo, blocking until the context is cancelled. - err = qbft.Run[core.Duty, [32]byte](ctx, def, qt, duty, peerIdx, inst.hashCh) + err = qbft.Run(ctx, def, qt, duty, peerIdx, inst.hashCh) if err != nil && !isContextErr(err) { consensusError.Inc() return err // Only return non-context errors. diff --git a/core/consensus/metrics.go b/core/consensus/metrics.go index 0c9b98648..8a6eee996 100644 --- a/core/consensus/metrics.go +++ b/core/consensus/metrics.go @@ -9,12 +9,21 @@ import ( ) var ( + // Using gauge since the value changes slowly, once per slot. decidedRoundsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "core", Subsystem: "consensus", Name: "decided_rounds", Help: "Number of rounds it took to decide consensus instances by duty and timer type.", - }, []string{"duty", "timer"}) // Using gauge since the value changes slowly, once per slot. + }, []string{"duty", "timer"}) + + // Using gauge since the value changes slowly, once per slot. + decidedLeaderGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "core", + Subsystem: "consensus", + Name: "decided_leader_index", + Help: "Leader node index of the decision round by duty.", + }, []string{"duty"}) consensusDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "core", diff --git a/docs/metrics.md b/docs/metrics.md index 4bcfcbe96..aa8e0e771 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -43,6 +43,7 @@ when storing metrics from multiple nodes or clusters in one Prometheus instance. | `core_bcast_recast_errors_total` | Counter | The total count of failed recasted registrations by source; `pregen` vs `downstream` | `source` | | `core_bcast_recast_registration_total` | Counter | The total number of unique validator registration stored in recaster per pubkey | `pubkey` | | `core_bcast_recast_total` | Counter | The total count of recasted registrations by source; `pregen` vs `downstream` | `source` | +| `core_consensus_decided_leader_index` | Gauge | Leader node index of the decision round by duty. | `duty` | | `core_consensus_decided_rounds` | Gauge | Number of rounds it took to decide consensus instances by duty and timer type. | `duty, timer` | | `core_consensus_duration_seconds` | Histogram | Duration of a consensus instance in seconds by duty and timer type. | `duty, timer` | | `core_consensus_error_total` | Counter | Total count of consensus errors | |