Skip to content

Commit

Permalink
Better logs/metrics when divergent vertex execution is detected (#830)
Browse files Browse the repository at this point in the history
  • Loading branch information
LukasGasior1 authored Feb 22, 2024
2 parents 978f9ec + 5ff3efe commit 5ccf5e1
Show file tree
Hide file tree
Showing 9 changed files with 368 additions and 175 deletions.
4 changes: 3 additions & 1 deletion common/src/main/java/com/radixdlt/monitoring/Metrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ public record Bft(
Summary leaderTransactionBytesIncludedInProposal,
Summary leaderTransactionBytesIncludedInProposalAndPreviousVertices,
Summary numSignaturesInCertificate,
Counter divergentVertexExecutions) {
LabelledCounter<DivergentVertexExecution> divergentVertexExecutions) {

public record SuccessfullyProcessedVote(boolean isTimeout, VoteProcessingResult result) {}

Expand Down Expand Up @@ -234,6 +234,8 @@ public record Sync(

public record VertexStore(
Gauge size, Counter forks, Counter rebuilds, Counter indirectParents) {}

public record DivergentVertexExecution(int numDistinctExecutionResults) {}
}

public record BerkeleyDb(AddressBook addressBook, SafetyState safetyState) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/* Copyright 2021 Radix Publishing Ltd incorporated in Jersey (Channel Islands).
*
* Licensed under the Radix License, Version 1.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at:
*
* radixfoundation.org/licenses/LICENSE-v1
*
* The Licensor hereby grants permission for the Canonical version of the Work to be
* published, distributed and used under or by reference to the Licensor’s trademark
* Radix ® and use of any unregistered trade names, logos or get-up.
*
* The Licensor provides the Work (and each Contributor provides its Contributions) on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
* including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
* MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
*
* Whilst the Work is capable of being deployed, used and adopted (instantiated) to create
* a distributed ledger it is your responsibility to test and validate the code, together
* with all logic and performance of that code under all foreseeable scenarios.
*
* The Licensor does not make or purport to make and hereby excludes liability for all
* and any representation, warranty or undertaking in any form whatsoever, whether express
* or implied, to any entity or person, including any representation, warranty or
* undertaking, as to the functionality security use, value or other characteristics of
* any distributed ledger nor in respect the functioning or value of any tokens which may
* be created stored or transferred using the Work. The Licensor does not warrant that the
* Work or any use of the Work complies with any law or regulation in any territory where
* it may be implemented or used or that it will be appropriate for any specific purpose.
*
* Neither the licensor nor any current or former employees, officers, directors, partners,
* trustees, representatives, agents, advisors, contractors, or volunteers of the Licensor
* shall be liable for any direct or indirect, special, incidental, consequential or other
* losses of any kind, in tort, contract or otherwise (including but not limited to loss
* of revenue, income or profits, or loss of use or data, or loss of reputation, or loss
* of any economic or other opportunity of whatsoever nature or howsoever arising), arising
* out of or in connection with (without limitation of any use, misuse, of any ledger system
* or use made or its functionality or any performance or operation of any code or protocol
* caused by bugs or programming or logic errors or otherwise);
*
* A. any offer, purchase, holding, use, sale, exchange or transmission of any
* cryptographic keys, tokens or assets created, exchanged, stored or arising from any
* interaction with the Work;
*
* B. any failure in a transmission or loss of any token or assets keys or other digital
* artefacts due to errors in transmission;
*
* C. bugs, hacks, logic errors or faults in the Work or any communication;
*
* D. system software or apparatus including but not limited to losses caused by errors
* in holding or transmitting tokens by any third-party;
*
* E. breaches or failure of security including hacker attacks, loss or disclosure of
* password, loss of private key, unauthorised use or misuse of such passwords or keys;
*
* F. any losses including loss of anticipated savings or other benefits resulting from
* use of the Work or any changes to the Work (however implemented).
*
* You are solely responsible for; testing, validating and evaluation of all operation
* logic, functionality, security and appropriateness of using the Work for any commercial
* or non-commercial purpose and for any reproduction or redistribution by You of the
* Work. You assume all risks associated with Your use of the Work and the exercise of
* permissions under this License.
*/

package com.radixdlt.consensus;

import com.google.common.hash.HashCode;
import com.google.common.util.concurrent.RateLimiter;
import com.radixdlt.consensus.bft.BFTValidatorId;
import com.radixdlt.consensus.bft.BFTValidatorSet;
import com.radixdlt.consensus.bft.Round;
import com.radixdlt.monitoring.Metrics;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
* A utility for detecting divergent vertex execution, i.e. a situation where two (or more)
* validators from the validator set have voted for the same vertexId, but their resultant ledger
* headers were different.
*/
@SuppressWarnings("UnstableApiUsage")
public final class DivergentVertexExecutionDetector {
private static final Logger log = LogManager.getLogger();

private final RateLimiter logRatelimiter = RateLimiter.create(0.05); // At most one log every 20s

private final Metrics metrics;
private final BFTValidatorSet validatorSet;

// vertexId -> ledgerHeader -> validators who voted for (vertexId, ledgerHeader)
private final Map<HashCode, Map<LedgerHeader, Set<BFTValidatorId>>> votesByVertexId =
new HashMap<>();

public DivergentVertexExecutionDetector(Metrics metrics, BFTValidatorSet validatorSet) {
this.metrics = metrics;
this.validatorSet = validatorSet;
}

/**
* Processes a received vote. No additional filtering is applied, the caller should ensure that
* only relevant votes are being processed (e.g. only votes from a single round).
*/
public void processVote(Vote vote) {
final var ledgerHeadersByVertexId =
votesByVertexId.computeIfAbsent(
vote.getVoteData().getProposed().getVertexId(), unused -> new HashMap<>());
final var authorsByLedgerHeader =
ledgerHeadersByVertexId.computeIfAbsent(
vote.getVoteData().getProposed().getLedgerHeader(), unused -> new HashSet<>());
authorsByLedgerHeader.add(vote.getAuthor());
}

public void summarizeAfterRoundAndReset(Round round) {
// Divergent executions are the ones that have more than one resultant header
// for the same vertexId.
final StringBuilder logBuilder = new StringBuilder();
votesByVertexId.entrySet().stream()
.filter(e -> e.getValue().size() > 1)
.forEach(
e -> {
final var vertexId = e.getKey();
final var distinctResults = e.getValue();

metrics
.bft()
.divergentVertexExecutions()
.label(new Metrics.Bft.DivergentVertexExecution(distinctResults.size()))
.inc();

logBuilder.append(
String.format(
"In round %s validators have voted for vertex %s but they've computed %s"
+ " distinct results:\n",
round, vertexId, distinctResults.size()));
final var totalStakeDec = new BigDecimal(validatorSet.getTotalPower().toBigInt());
for (var result : distinctResults.entrySet()) {
final var stakeVoted =
result.getValue().stream()
.map(v -> new BigDecimal(validatorSet.getPower(v).toBigInt()))
.reduce(BigDecimal.ZERO, BigDecimal::add);
final var stakeVotedProportion =
stakeVoted.divide(totalStakeDec, 4, RoundingMode.HALF_UP);
// Let's list the actual validators if they represent less than 10% stake
final var validatorsDetails =
stakeVotedProportion.compareTo(BigDecimal.valueOf(0.1)) < 0
? " ("
+ result.getValue().stream()
.map(BFTValidatorId::toString)
.collect(Collectors.joining(","))
+ ")"
: "";
logBuilder.append(
String.format(
" * %s validator(s)%s representing %s stake computed %s\n",
result.getValue().size(),
validatorsDetails,
stakeVotedProportion,
result.getKey()));
}
});

if (logBuilder.length() > 0 && logRatelimiter.tryAcquire()) {
logBuilder.append(
"This is informational only, this node is unaffected unless other error messages"
+ " follow.\n");
log.info(logBuilder.toString());
}

// Reset this detector to its initial (empty) state
this.votesByVertexId.clear();
}
}
13 changes: 12 additions & 1 deletion core/src/main/java/com/radixdlt/consensus/NextEpoch.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,17 @@ public int hashCode() {

@Override
public String toString() {
return "NextEpoch{" + "validators=" + validators + ", epoch=" + epoch + '}';
final StringBuilder builder = new StringBuilder("NextEpoch{epoch=");
builder.append(epoch);
builder.append(", stake_by_validator_id=[");
validators.forEach(
v -> {
builder.append(v.getValidatorId());
builder.append("->");
builder.append(v.getPower());
builder.append(",");
});
builder.append("]}");
return builder.toString();
}
}
110 changes: 1 addition & 109 deletions core/src/main/java/com/radixdlt/consensus/PendingVotes.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.common.hash.HashCode;
import com.google.common.util.concurrent.RateLimiter;
import com.radixdlt.SecurityCritical;
import com.radixdlt.SecurityCritical.SecurityKind;
import com.radixdlt.consensus.bft.BFTValidatorId;
Expand All @@ -79,13 +78,10 @@
import com.radixdlt.crypto.ECDSASecp256k1Signature;
import com.radixdlt.crypto.Hasher;
import com.radixdlt.environment.EventDispatcher;
import com.radixdlt.monitoring.Metrics;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
* Manages pending votes for various vertices.
Expand All @@ -97,120 +93,20 @@
@NotThreadSafe
@SecurityCritical({SecurityKind.SIG_VERIFY, SecurityKind.GENERAL})
public final class PendingVotes {
private static final Logger log = LogManager.getLogger();

private final BFTValidatorId self;
private final Map<HashCode, ValidationState> voteState = Maps.newHashMap();
private final Map<HashCode, ValidationState> timeoutVoteState = Maps.newHashMap();
private final Map<BFTValidatorId, PreviousVote> previousVotes = Maps.newHashMap();
private final Hasher hasher;
private final EventDispatcher<ConsensusByzantineEvent> doubleVoteEventDispatcher;
private final BFTValidatorSet validatorSet;
private final Metrics metrics;
private final RateLimiter divergentVertexExecutionLogRateLimiter = RateLimiter.create(0.1);

public PendingVotes(
BFTValidatorId self,
Hasher hasher,
EventDispatcher<ConsensusByzantineEvent> doubleVoteEventDispatcher,
BFTValidatorSet validatorSet,
Metrics metrics) {
this.self = self;
BFTValidatorSet validatorSet) {
this.hasher = Objects.requireNonNull(hasher);
this.doubleVoteEventDispatcher = Objects.requireNonNull(doubleVoteEventDispatcher);
this.validatorSet = Objects.requireNonNull(validatorSet);
this.metrics = metrics;
}

private void checkForDivergentVertexExecution(Vote incomingVote) {
final var voteVertexId = incomingVote.getVoteData().getProposed().getVertexId();
final var voteLedgerHeader = incomingVote.getVoteData().getProposed().getLedgerHeader();
for (var otherVote : this.previousVotes.entrySet()) {
final var otherVertexId = otherVote.getValue().proposedHeader().getVertexId();
final var otherLedgerHeader = otherVote.getValue().proposedHeader().getLedgerHeader();
if (voteVertexId.equals(otherVertexId) && !voteLedgerHeader.equals(otherLedgerHeader)) {
if (!voteLedgerHeader
.nextProtocolVersion()
.equals(otherLedgerHeader.nextProtocolVersion())) {
logDivergentProtocolUpdateVote(incomingVote, otherVote.getKey(), otherVote.getValue());
} else {
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"Divergent vertex execution detected! An incoming vote (from {}) for vertex {}"
+ " claims the following resultant ledger header: {}, while validator {} thinks"
+ " that the resultant ledger header is {}. [this_vote={}, other_vote={}]",
incomingVote.getAuthor(),
voteVertexId,
voteLedgerHeader,
otherVote.getKey(),
otherLedgerHeader,
incomingVote,
otherVote);
}
}

// Regardless of the reason and whether the divergence applies to this node,
// we're bumping the metrics.
this.metrics.bft().divergentVertexExecutions().inc();
}
}
}

private void logDivergentProtocolUpdateVote(
Vote incomingVote, BFTValidatorId otherVoteAuthor, PreviousVote otherVote) {
// Protocol update-related divergence is most likely caused by some nodes running an
// outdated version. This is somewhat expected, so we're not going to log these occurrences
// unless they apply to this node.
final var isIncomingVoteFromSelf = self.equals(incomingVote.getAuthor());
final var isExistingVoteFromSelf = self.equals(otherVoteAuthor);
if (isIncomingVoteFromSelf || isExistingVoteFromSelf) {
final var selfProposedHeader =
isIncomingVoteFromSelf
? incomingVote.getVoteData().getProposed()
: otherVote.proposedHeader();
final var otherProposedHeader =
isIncomingVoteFromSelf
? otherVote.proposedHeader()
: incomingVote.getVoteData().getProposed();
final var selfNextVersion = selfProposedHeader.getLedgerHeader().nextProtocolVersion();
final var otherNextVersion = otherProposedHeader.getLedgerHeader().nextProtocolVersion();
final var otherAuthor = isIncomingVoteFromSelf ? otherVoteAuthor : incomingVote.getAuthor();
if (selfNextVersion.isPresent() && otherNextVersion.isEmpty()) {
// We're enacting, they don't: just a debug log.
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.debug(
"""
Received a BFT vote from {} that doesn't enact a protocol update,
while we're enacting {}.
This indicates that they're likely running an outdated node version.
This node is unaffected, unless other error messages follow.""",
otherAuthor,
selfNextVersion);
} else if (selfNextVersion.isEmpty() && otherNextVersion.isPresent()) {
// We're not enacting, but they are: we're potentially outdated, log an early warning
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"""
Received a BFT vote from {} that enacts a protocol update {}, but this node doesn't enact it.
This node might be running an outdated version (but it's not certain).""",
otherAuthor,
otherNextVersion);
}
} else if (selfNextVersion.isPresent() && otherNextVersion.isPresent()) {
// We're enacting a different version, one of us is likely outdated.
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"Received a BFT vote from {} that enacts a protocol update {}, but this node enacts"
+ " {}. It is likely that one of the nodes (this node or {}) is running an"
+ " outdated version.",
otherAuthor,
otherNextVersion,
selfNextVersion,
otherAuthor);
}
}
} /* else: no-op; we don't care about other cases */
} /* else: no-op; we don't care if the vote doesn't affect this node */
}

/**
Expand All @@ -230,10 +126,6 @@ public VoteProcessingResult insertVote(Vote vote) {
return VoteProcessingResult.rejected(VoteRejectedReason.INVALID_AUTHOR);
}

// This doesn't do anything, other than logging and bumping the metrics,
// when divergent execution is detected (which should hopefully never happen).
checkForDivergentVertexExecution(vote);

if (!replacePreviousVote(author, vote, voteDataHash)) {
return VoteProcessingResult.rejected(VoteRejectedReason.DUPLICATE_VOTE);
}
Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/com/radixdlt/consensus/bft/BFTBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,7 @@ public BFTEventProcessor build() {
if (!validatorSet.containsValidator(self)) {
return EmptyBFTEventProcessor.INSTANCE;
}
final PendingVotes pendingVotes =
new PendingVotes(self, hasher, doubleVoteDispatcher, validatorSet, metrics);
final PendingVotes pendingVotes = new PendingVotes(hasher, doubleVoteDispatcher, validatorSet);

/* Setting up the following BFT event processing pipeline:
ObsoleteEventsFilter (filters out obsolete events for past rounds)
Expand All @@ -227,6 +226,7 @@ public BFTEventProcessor build() {
roundQuorumResolutionDispatcher,
timeoutQuorumDelayedResolutionDispatcher,
metrics,
new DivergentVertexExecutionDetector(metrics, validatorSet),
pendingVotes,
roundUpdate,
timeoutQuorumResolutionDelayMs);
Expand Down
Loading

0 comments on commit 5ccf5e1

Please sign in to comment.