From 01a48daadb82a92fdea0cecc103816758a33a636 Mon Sep 17 00:00:00 2001 From: Andrei Smirnov Date: Tue, 15 Oct 2024 10:01:52 +0300 Subject: [PATCH] core/consensus: logging leader index --- core/consensus/component.go | 31 ++++++++++++++++++++----------- core/consensus/metrics.go | 18 ++++++++++-------- docs/metrics.md | 2 +- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/core/consensus/component.go b/core/consensus/component.go index 19b27bf68..63715f921 100644 --- a/core/consensus/component.go +++ b/core/consensus/component.go @@ -445,23 +445,32 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error) } // Instrument consensus instance. - var decided bool + var ( + decided bool + nodes = len(c.peers) + ) + decideCallback := func(qcommit []qbft.Msg[core.Duty, [32]byte]) { + round := qcommit[0].Round() decided = true - decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(qcommit[0].Round())) + decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(round)) inst.decidedAtCh <- time.Now() - } - // Create a new qbft definition for this instance. - def := newDefinition(len(c.peers), c.subscribers, roundTimer, decideCallback) + leaderIndex := leader(duty, round, nodes) + leaderName := c.peers[leaderIndex].Name + log.Debug(ctx, "QBFT consensus decided", + z.Str("duty", duty.Type.String()), + z.U64("slot", duty.Slot), + z.I64("round", round), + z.I64("leader_index", leaderIndex), + z.Str("leader_name", leaderName)) - if duty.Type == core.DutyProposer { - leaderIndex := leader(duty, 0, len(c.peers)) - proposeLeaderGauge.Set(float64(leaderIndex)) - - log.Debug(ctx, "QBFT consensus leader index", z.I64("index", leaderIndex)) + decidedLeaderGauge.Set(float64(leaderIndex)) } + // Create a new qbft definition for this instance. + def := newDefinition(nodes, c.subscribers, roundTimer, decideCallback) + // Create a new transport that handles sending and receiving for this instance. t := transport{ component: c, @@ -486,7 +495,7 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error) } // Run the algo, blocking until the context is cancelled. - err = qbft.Run[core.Duty, [32]byte](ctx, def, qt, duty, peerIdx, inst.hashCh) + err = qbft.Run(ctx, def, qt, duty, peerIdx, inst.hashCh) if err != nil && !isContextErr(err) { consensusError.Inc() return err // Only return non-context errors. diff --git a/core/consensus/metrics.go b/core/consensus/metrics.go index 21afe5313..3228f9a9e 100644 --- a/core/consensus/metrics.go +++ b/core/consensus/metrics.go @@ -9,12 +9,21 @@ import ( ) var ( + // Using gauge since the value changes slowly, once per slot. decidedRoundsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "core", Subsystem: "consensus", Name: "decided_rounds", Help: "Number of rounds it took to decide consensus instances by duty and timer type.", - }, []string{"duty", "timer"}) // Using gauge since the value changes slowly, once per slot. + }, []string{"duty", "timer"}) + + // Using gauge since the value changes slowly, once per slot. + decidedLeaderGauge = promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "core", + Subsystem: "consensus", + Name: "decided_leader_index", + Help: "Leader node index of the decision round.", + }) consensusDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "core", @@ -37,11 +46,4 @@ var ( Name: "error_total", Help: "Total count of consensus errors", }) - - proposeLeaderGauge = promauto.NewGauge(prometheus.GaugeOpts{ - Namespace: "core", - Subsystem: "consensus", - Name: "propose_leader_index", - Help: "Index of leader node proposing a block for the first round", - }) ) diff --git a/docs/metrics.md b/docs/metrics.md index 983f680fd..211a6f571 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -43,10 +43,10 @@ when storing metrics from multiple nodes or clusters in one Prometheus instance. | `core_bcast_recast_errors_total` | Counter | The total count of failed recasted registrations by source; `pregen` vs `downstream` | `source` | | `core_bcast_recast_registration_total` | Counter | The total number of unique validator registration stored in recaster per pubkey | `pubkey` | | `core_bcast_recast_total` | Counter | The total count of recasted registrations by source; `pregen` vs `downstream` | `source` | +| `core_consensus_decided_leader_index` | Gauge | Leader node index of the decision round. | | | `core_consensus_decided_rounds` | Gauge | Number of rounds it took to decide consensus instances by duty and timer type. | `duty, timer` | | `core_consensus_duration_seconds` | Histogram | Duration of a consensus instance in seconds by duty and timer type. | `duty, timer` | | `core_consensus_error_total` | Counter | Total count of consensus errors | | -| `core_consensus_propose_leader_index` | Gauge | Index of leader node proposing a block for the first round | | | `core_consensus_timeout_total` | Counter | Total count of consensus timeouts by duty and timer type. | `duty, timer` | | `core_parsigdb_exit_total` | Counter | Total number of partially signed voluntary exits per public key | `pubkey` | | `core_scheduler_current_epoch` | Gauge | The current epoch | |