Skip to content

Commit

Permalink
core/consensus: logging leader index (#3334)
Browse files Browse the repository at this point in the history
Added the `core_consensus_decided_leader_index` gauge to reflect the leader index of the QBFT decision round. Also added a debug log message with the essential decision round data (duty, slot, round, leader index, and leader name).

category: feature
ticket: none
  • Loading branch information
pinebit authored Oct 16, 2024
1 parent 9acf569 commit 774902f
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 5 deletions.
24 changes: 20 additions & 4 deletions core/consensus/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,15 +445,31 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error)
}

// Instrument consensus instance.
var decided bool
var (
decided bool
nodes = len(c.peers)
)

decideCallback := func(qcommit []qbft.Msg[core.Duty, [32]byte]) {
round := qcommit[0].Round()
decided = true
decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(qcommit[0].Round()))
decidedRoundsGauge.WithLabelValues(duty.Type.String(), string(roundTimer.Type())).Set(float64(round))
inst.decidedAtCh <- time.Now()

leaderIndex := leader(duty, round, nodes)
leaderName := c.peers[leaderIndex].Name
log.Debug(ctx, "QBFT consensus decided",
z.Str("duty", duty.Type.String()),
z.U64("slot", duty.Slot),
z.I64("round", round),
z.I64("leader_index", leaderIndex),
z.Str("leader_name", leaderName))

decidedLeaderGauge.WithLabelValues(duty.Type.String()).Set(float64(leaderIndex))
}

// Create a new qbft definition for this instance.
def := newDefinition(len(c.peers), c.subscribers, roundTimer, decideCallback)
def := newDefinition(nodes, c.subscribers, roundTimer, decideCallback)

// Create a new transport that handles sending and receiving for this instance.
t := transport{
Expand All @@ -479,7 +495,7 @@ func (c *Component) runInstance(ctx context.Context, duty core.Duty) (err error)
}

// Run the algo, blocking until the context is cancelled.
err = qbft.Run[core.Duty, [32]byte](ctx, def, qt, duty, peerIdx, inst.hashCh)
err = qbft.Run(ctx, def, qt, duty, peerIdx, inst.hashCh)
if err != nil && !isContextErr(err) {
consensusError.Inc()
return err // Only return non-context errors.
Expand Down
11 changes: 10 additions & 1 deletion core/consensus/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,21 @@ import (
)

var (
// Using gauge since the value changes slowly, once per slot.
decidedRoundsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "core",
Subsystem: "consensus",
Name: "decided_rounds",
Help: "Number of rounds it took to decide consensus instances by duty and timer type.",
}, []string{"duty", "timer"}) // Using gauge since the value changes slowly, once per slot.
}, []string{"duty", "timer"})

// Using gauge since the value changes slowly, once per slot.
decidedLeaderGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "core",
Subsystem: "consensus",
Name: "decided_leader_index",
Help: "Leader node index of the decision round by duty.",
}, []string{"duty"})

consensusDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "core",
Expand Down
1 change: 1 addition & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ when storing metrics from multiple nodes or clusters in one Prometheus instance.
| `core_bcast_recast_errors_total` | Counter | The total count of failed recasted registrations by source; `pregen` vs `downstream` | `source` |
| `core_bcast_recast_registration_total` | Counter | The total number of unique validator registration stored in recaster per pubkey | `pubkey` |
| `core_bcast_recast_total` | Counter | The total count of recasted registrations by source; `pregen` vs `downstream` | `source` |
| `core_consensus_decided_leader_index` | Gauge | Leader node index of the decision round by duty. | `duty` |
| `core_consensus_decided_rounds` | Gauge | Number of rounds it took to decide consensus instances by duty and timer type. | `duty, timer` |
| `core_consensus_duration_seconds` | Histogram | Duration of a consensus instance in seconds by duty and timer type. | `duty, timer` |
| `core_consensus_error_total` | Counter | Total count of consensus errors | |
Expand Down

0 comments on commit 774902f

Please sign in to comment.