From c357abb57b82992bcc33c509f0e19c8eba274d05 Mon Sep 17 00:00:00 2001 From: Ed Hennis Date: Thu, 5 Sep 2024 15:10:41 -0400 Subject: [PATCH] Improve logging related to ledger acquisition --- src/xrpld/app/ledger/InboundLedger.h | 20 ++++ src/xrpld/app/ledger/detail/InboundLedger.cpp | 9 +- .../app/ledger/detail/InboundLedgers.cpp | 100 ++++++++++++++++-- src/xrpld/app/ledger/detail/LedgerMaster.cpp | 5 +- .../app/ledger/detail/TimeoutCounter.cpp | 8 ++ 5 files changed, 132 insertions(+), 10 deletions(-) diff --git a/src/xrpld/app/ledger/InboundLedger.h b/src/xrpld/app/ledger/InboundLedger.h index 13f603e79d0..f0c692b0791 100644 --- a/src/xrpld/app/ledger/InboundLedger.h +++ b/src/xrpld/app/ledger/InboundLedger.h @@ -196,6 +196,26 @@ class InboundLedger final : public TimeoutCounter, std::unique_ptr mPeerSet; }; +inline std::string +to_string(InboundLedger::Reason reason) +{ + using enum InboundLedger::Reason; + switch (reason) + { + case HISTORY: + return "HISTORY"; + case SHARD: + return "SHARD"; + case GENERIC: + return "GENERIC"; + case CONSENSUS: + return "CONSENSUS"; + default: + assert(false); + return "unknown"; + } +} + } // namespace ripple #endif diff --git a/src/xrpld/app/ledger/detail/InboundLedger.cpp b/src/xrpld/app/ledger/detail/InboundLedger.cpp index 7d59e258797..e8d6d5a9f53 100644 --- a/src/xrpld/app/ledger/detail/InboundLedger.cpp +++ b/src/xrpld/app/ledger/detail/InboundLedger.cpp @@ -390,7 +390,14 @@ InboundLedger::onTimer(bool wasProgress, ScopedLockType&) if (!wasProgress) { - checkLocal(); + if (checkLocal()) + { + // Done. Something else (probably consensus) built the ledger + // locally while waiting for data (or possibly before requesting) + assert(isDone()); + JLOG(journal_.info()) << "Finished while waiting " << hash_; + return; + } mByHash = true; diff --git a/src/xrpld/app/ledger/detail/InboundLedgers.cpp b/src/xrpld/app/ledger/detail/InboundLedgers.cpp index 542cdad5ffc..540bfd5502a 100644 --- a/src/xrpld/app/ledger/detail/InboundLedgers.cpp +++ b/src/xrpld/app/ledger/detail/InboundLedgers.cpp @@ -74,11 +74,90 @@ class InboundLedgersImp : public InboundLedgers auto doAcquire = [&, seq, reason]() -> std::shared_ptr { assert(hash.isNonZero()); - // probably not the right rule - if (app_.getOPs().isNeedNetworkLedger() && - (reason != InboundLedger::Reason::GENERIC) && - (reason != InboundLedger::Reason::CONSENSUS)) + bool const shouldAcquire = [&]() { + if (!app_.getOPs().isNeedNetworkLedger()) + return true; + if (reason == InboundLedger::Reason::GENERIC) + return true; + if (reason == InboundLedger::Reason::CONSENSUS) + return true; + return false; + }(); + assert( + shouldAcquire == + !(app_.getOPs().isNeedNetworkLedger() && + (reason != InboundLedger::Reason::GENERIC) && + (reason != InboundLedger::Reason::CONSENSUS))); + + std::stringstream ss; + ss << "InboundLedger::acquire: " + << "Request: " << to_string(hash) << ", " << seq + << " NeedNetworkLedger: " + << (app_.getOPs().isNeedNetworkLedger() ? "yes" : "no") + << " Reason: " << to_string(reason) + << " Should acquire: " << (shouldAcquire ? "true." : "false."); + + /* Acquiring ledgers is somewhat expensive. It requires lots of + * computation and network communication. Avoid it when it's not + * appropriate. Every validation from a peer for a ledger that + * we do not have locally results in a call to this function: even + * if we are moments away from validating the same ledger. + */ + bool const shouldBroadcast = [&]() { + // If the node is not in "full" state, it needs to sync to + // the network, and doesn't have the necessary tx's and + // ledger entries to build the ledger. + bool const isFull = app_.getOPs().isFull(); + // If everything else is ok, don't try to acquire the ledger + // if the requested seq is in the near future relative to + // the validated ledger. If the requested ledger is between + // 1 and 19 inclusive ledgers ahead of the valid ledger this + // node has not built it yet, but it's possible/likely it + // has the tx's necessary to build it and get caught up. + // Plus it might not become validated. On the other hand, if + // it's more than 20 in the future, this node should request + // it so that it can jump ahead and get caught up. + LedgerIndex const validSeq = + app_.getLedgerMaster().getValidLedgerIndex(); + constexpr std::size_t lagLeeway = 20; + bool const nearFuture = + (seq > validSeq) && (seq < validSeq + lagLeeway); + // If everything else is ok, don't try to acquire the ledger + // if the request is related to consensus. (Note that + // consensus calls usually pass a seq of 0, so nearFuture + // will be false other than on a brand new network.) + bool const consensus = + reason == InboundLedger::Reason::CONSENSUS; + ss << " Evaluating whether to broadcast requests to peers" + << ". full: " << (isFull ? "true" : "false") + << ". ledger sequence " << seq + << ". Valid sequence: " << validSeq + << ". Lag leeway: " << lagLeeway + << ". request for near future ledger: " + << (nearFuture ? "true" : "false") + << ". Consensus: " << (consensus ? "true" : "false"); + + // If the node is not synced, send requests. + if (!isFull) + return true; + // If the ledger is in the near future, do NOT send requests. + // This node is probably about to build it. + if (nearFuture) + return false; + // If the request is because of consensus, do NOT send requests. + // This node is probably about to build it. + if (consensus) + return false; + return true; + }(); + ss << ". Would broadcast to peers? " + << (shouldBroadcast ? "true." : "false."); + + if (!shouldAcquire) + { + JLOG(j_.debug()) << "Abort(rule): " << ss.str(); return {}; + } bool isNew = true; std::shared_ptr inbound; @@ -86,6 +165,7 @@ class InboundLedgersImp : public InboundLedgers ScopedLockType sl(mLock); if (stopping_) { + JLOG(j_.debug()) << "Abort(stopping): " << ss.str(); return {}; } @@ -109,23 +189,29 @@ class InboundLedgersImp : public InboundLedgers ++mCounter; } } + ss << " IsNew: " << (isNew ? "true" : "false"); if (inbound->isFailed()) + { + JLOG(j_.debug()) << "Abort(failed): " << ss.str(); return {}; + } if (!isNew) inbound->update(seq); if (!inbound->isComplete()) + { + JLOG(j_.debug()) << "InProgress: " << ss.str(); return {}; + } + JLOG(j_.debug()) << "Complete: " << ss.str(); return inbound->getLedger(); }; using namespace std::chrono_literals; - std::shared_ptr ledger = perf::measureDurationAndLog( + return perf::measureDurationAndLog( doAcquire, "InboundLedgersImp::acquire", 500ms, j_); - - return ledger; } void diff --git a/src/xrpld/app/ledger/detail/LedgerMaster.cpp b/src/xrpld/app/ledger/detail/LedgerMaster.cpp index d1eeabeb619..af472afc1de 100644 --- a/src/xrpld/app/ledger/detail/LedgerMaster.cpp +++ b/src/xrpld/app/ledger/detail/LedgerMaster.cpp @@ -1046,8 +1046,9 @@ LedgerMaster::checkAccept(std::shared_ptr const& ledger) } JLOG(m_journal.info()) << "Advancing accepted ledger to " - << ledger->info().seq << " with >= " << minVal - << " validations"; + << ledger->info().seq << " (" + << to_short_string(ledger->info().hash) + << ") with >= " << minVal << " validations"; ledger->setValidated(); ledger->setFull(); diff --git a/src/xrpld/app/ledger/detail/TimeoutCounter.cpp b/src/xrpld/app/ledger/detail/TimeoutCounter.cpp index 9853990e725..c9b9eb39b0f 100644 --- a/src/xrpld/app/ledger/detail/TimeoutCounter.cpp +++ b/src/xrpld/app/ledger/detail/TimeoutCounter.cpp @@ -52,6 +52,8 @@ TimeoutCounter::setTimer(ScopedLockType& sl) { if (isDone()) return; + JLOG(journal_.debug()) << "Setting timer for " << timerInterval_.count() + << "ms"; timer_.expires_after(timerInterval_); timer_.async_wait( [wptr = pmDowncast()](boost::system::error_code const& ec) { @@ -60,6 +62,12 @@ TimeoutCounter::setTimer(ScopedLockType& sl) if (auto ptr = wptr.lock()) { + JLOG(ptr->journal_.debug()) + << "timer: ec: " << ec << " (operation_aborted: " + << boost::asio::error::operation_aborted << " - " + << (ec == boost::asio::error::operation_aborted ? "aborted" + : "other") + << ")"; ScopedLockType sl(ptr->mtx_); ptr->queueJob(sl); }