approval-voting: Add no shows debug information (paritytech#4726)

Add some debug logs to be able to identify the validators and parachains that have most no-shows, this metric is valuable because it will help us identify validators and parachains that regularly have this problem. From the validator_index we can then query the on-chain information and identify the exact validator that is causing the no-shows. --------- Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>
KILTprotocol · Jun 10, 2024 · 2869fd6 · 2869fd6
1 parent 497d64e
commit 2869fd6
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 18 deletions.
diff --git a/polkadot/node/core/approval-voting/src/approval_checking.rs b/polkadot/node/core/approval-voting/src/approval_checking.rs
@@ -23,6 +23,7 @@ use polkadot_primitives::ValidatorIndex;
 use crate::{
 	persisted_entries::{ApprovalEntry, CandidateEntry, TrancheEntry},
 	time::Tick,
+	MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE,
 };
 
 /// Result of counting the necessary tranches needed for approving a block.
@@ -32,6 +33,7 @@ pub struct TranchesToApproveResult {
 	pub required_tranches: RequiredTranches,
 	/// The total number of no_shows at the moment we are doing the counting.
 	pub total_observed_no_shows: usize,
+	pub no_show_validators: Vec<ValidatorIndex>,
 }
 
 /// The required tranches of assignments needed to determine whether a candidate is approved.
@@ -188,6 +190,8 @@ struct State {
 	/// The last tick at which a considered assignment was received.
 	last_assignment_tick: Option<Tick>,
 	total_observed_no_shows: usize,
+	// The validator's index that are no-shows.
+	no_show_validators: Vec<ValidatorIndex>,
 }
 
 impl State {
@@ -203,6 +207,7 @@ impl State {
 			return TranchesToApproveResult {
 				required_tranches: RequiredTranches::All,
 				total_observed_no_shows: self.total_observed_no_shows,
+				no_show_validators: self.no_show_validators.clone(),
 			}
 		}
 
@@ -217,6 +222,7 @@ impl State {
 					last_assignment_tick: self.last_assignment_tick,
 				},
 				total_observed_no_shows: self.total_observed_no_shows,
+				no_show_validators: self.no_show_validators.clone(),
 			}
 		}
 
@@ -234,6 +240,7 @@ impl State {
 					clock_drift,
 				},
 				total_observed_no_shows: self.total_observed_no_shows,
+				no_show_validators: self.no_show_validators.clone(),
 			}
 		} else {
 			TranchesToApproveResult {
@@ -244,6 +251,7 @@ impl State {
 					clock_drift,
 				},
 				total_observed_no_shows: self.total_observed_no_shows,
+				no_show_validators: self.no_show_validators.clone(),
 			}
 		}
 	}
@@ -253,11 +261,12 @@ impl State {
 	}
 
 	fn advance(
-		&self,
+		mut self,
 		new_assignments: usize,
 		new_no_shows: usize,
 		next_no_show: Option<Tick>,
 		last_assignment_tick: Option<Tick>,
+		no_show_validators: Vec<ValidatorIndex>,
 	) -> State {
 		let new_covered = if self.depth == 0 {
 			new_assignments
@@ -290,6 +299,17 @@ impl State {
 			(self.depth, covering, uncovered)
 		};
 
+		// Make sure we don't store too many no-show validators, since this metric
+		// is valuable if there are just a few of them to identify the problematic
+		// validators.
+		// If there are a lot then we've got bigger problems and no need to make this
+		// array unnecessarily large.
+		if self.no_show_validators.len() + no_show_validators.len() <
+			MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE
+		{
+			self.no_show_validators.extend(no_show_validators);
+		}
+
 		State {
 			assignments,
 			depth,
@@ -299,6 +319,7 @@ impl State {
 			next_no_show,
 			last_assignment_tick,
 			total_observed_no_shows: self.total_observed_no_shows + new_no_shows,
+			no_show_validators: self.no_show_validators,
 		}
 	}
 }
@@ -354,8 +375,9 @@ fn count_no_shows(
 	block_tick: Tick,
 	no_show_duration: Tick,
 	drifted_tick_now: Tick,
-) -> (usize, Option<u64>) {
+) -> (usize, Option<u64>, Vec<ValidatorIndex>) {
 	let mut next_no_show = None;
+	let mut no_show_validators = Vec::new();
 	let no_shows = assignments
 		.iter()
 		.map(|(v_index, tick)| {
@@ -379,12 +401,14 @@ fn count_no_shows(
 				// the clock drift will be removed again to do the comparison above.
 				next_no_show = super::min_prefer_some(next_no_show, Some(no_show_at + clock_drift));
 			}
-
+			if is_no_show {
+				no_show_validators.push(*v_index);
+			}
 			is_no_show
 		})
 		.count();
 
-	(no_shows, next_no_show)
+	(no_shows, next_no_show, no_show_validators)
 }
 
 /// Determine the amount of tranches of assignments needed to determine approval of a candidate.
@@ -408,6 +432,7 @@ pub fn tranches_to_approve(
 		next_no_show: None,
 		last_assignment_tick: None,
 		total_observed_no_shows: 0,
+		no_show_validators: Vec::new(),
 	};
 
 	// The `ApprovalEntry` doesn't have any data for empty tranches. We still want to iterate over
@@ -446,7 +471,7 @@ pub fn tranches_to_approve(
 			//
 			// While we count the no-shows, we also determine the next possible no-show we might
 			// see within this tranche.
-			let (no_shows, next_no_show) = count_no_shows(
+			let (no_shows, next_no_show, no_show_validators) = count_no_shows(
 				assignments,
 				approvals,
 				clock_drift,
@@ -455,7 +480,7 @@ pub fn tranches_to_approve(
 				drifted_tick_now,
 			);
 
-			let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick);
+			let s = s.advance(n_assignments, no_shows, next_no_show, last_assignment_tick, no_show_validators);
 			let output = s.output(tranche, needed_approvals, n_validators, no_show_duration);
 
 			*state = match output.required_tranches {
@@ -1186,7 +1211,7 @@ mod tests {
 			approvals.set(v_index, true);
 		}
 
-		let (no_shows, next_no_show) = count_no_shows(
+		let (no_shows, next_no_show, _) = count_no_shows(
 			&test.assignments,
 			&approvals,
 			test.clock_drift,
@@ -1390,6 +1415,7 @@ mod tests {
 			next_no_show: None,
 			last_assignment_tick: None,
 			total_observed_no_shows: 0,
+			no_show_validators: Default::default(),
 		};
 
 		assert_eq!(
@@ -1414,6 +1440,7 @@ mod tests {
 			next_no_show: None,
 			last_assignment_tick: None,
 			total_observed_no_shows: 0,
+			no_show_validators: Default::default(),
 		};
 
 		assert_eq!(

diff --git a/polkadot/node/core/approval-voting/src/import.rs b/polkadot/node/core/approval-voting/src/import.rs
@@ -662,6 +662,7 @@ pub(crate) mod tests {
 			per_block_assignments_gathering_times: LruMap::new(ByLength::new(
 				MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
 			)),
+			no_show_stats: Default::default(),
 		}
 	}
 

diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs
@@ -691,6 +691,7 @@ struct ApprovalStatus {
 	tranche_now: DelayTranche,
 	block_tick: Tick,
 	last_no_shows: usize,
+	no_show_validators: Vec<ValidatorIndex>,
 }
 
 #[derive(Copy, Clone)]
@@ -832,6 +833,51 @@ struct State {
 	// tranches we trigger and why.
 	per_block_assignments_gathering_times:
 		LruMap<BlockNumber, HashMap<(Hash, CandidateHash), AssignmentGatheringRecord>>,
+	no_show_stats: NoShowStats,
+}
+
+// Regularly dump the no-show stats at this block number frequency.
+const NO_SHOW_DUMP_FREQUENCY: BlockNumber = 50;
+// The maximum number of validators we record no-shows for, per candidate.
+pub(crate) const MAX_RECORDED_NO_SHOW_VALIDATORS_PER_CANDIDATE: usize = 20;
+
+// No show stats per validator and per parachain.
+// This is valuable information when we have to debug live network issue, because
+// it gives information if things are going wrong only for some validators or just
+// for some parachains.
+#[derive(Debug, Clone, PartialEq, Eq, Default)]
+struct NoShowStats {
+	per_validator_no_show: HashMap<SessionIndex, HashMap<ValidatorIndex, usize>>,
+	per_parachain_no_show: HashMap<u32, usize>,
+	last_dumped_block_number: BlockNumber,
+}
+
+impl NoShowStats {
+	// Print the no-show stats if NO_SHOW_DUMP_FREQUENCY blocks have passed since the last
+	// print.
+	fn maybe_print(&mut self, current_block_number: BlockNumber) {
+		if self.last_dumped_block_number > current_block_number ||
+			current_block_number - self.last_dumped_block_number < NO_SHOW_DUMP_FREQUENCY
+		{
+			return
+		}
+		if self.per_parachain_no_show.is_empty() && self.per_validator_no_show.is_empty() {
+			return
+		}
+
+		gum::debug!(
+			target: LOG_TARGET,
+			"Validators with no_show {:?} and parachains with no_shows {:?} since {:}",
+			self.per_validator_no_show,
+			self.per_parachain_no_show,
+			self.last_dumped_block_number
+		);
+
+		self.last_dumped_block_number = current_block_number;
+
+		self.per_validator_no_show.clear();
+		self.per_parachain_no_show.clear();
+	}
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -887,21 +933,25 @@ impl State {
 		);
 
 		if let Some(approval_entry) = candidate_entry.approval_entry(&block_hash) {
-			let TranchesToApproveResult { required_tranches, total_observed_no_shows } =
-				approval_checking::tranches_to_approve(
-					approval_entry,
-					candidate_entry.approvals(),
-					tranche_now,
-					block_tick,
-					no_show_duration,
-					session_info.needed_approvals as _,
-				);
+			let TranchesToApproveResult {
+				required_tranches,
+				total_observed_no_shows,
+				no_show_validators,
+			} = approval_checking::tranches_to_approve(
+				approval_entry,
+				candidate_entry.approvals(),
+				tranche_now,
+				block_tick,
+				no_show_duration,
+				session_info.needed_approvals as _,
+			);
 
 			let status = ApprovalStatus {
 				required_tranches,
 				block_tick,
 				tranche_now,
 				last_no_shows: total_observed_no_shows,
+				no_show_validators,
 			};
 
 			Some((approval_entry, status))
@@ -1044,6 +1094,26 @@ impl State {
 			},
 		}
 	}
+
+	fn record_no_shows(
+		&mut self,
+		session_index: SessionIndex,
+		para_id: u32,
+		no_show_validators: &Vec<ValidatorIndex>,
+	) {
+		if !no_show_validators.is_empty() {
+			*self.no_show_stats.per_parachain_no_show.entry(para_id.into()).or_default() += 1;
+		}
+		for validator_index in no_show_validators {
+			*self
+				.no_show_stats
+				.per_validator_no_show
+				.entry(session_index)
+				.or_default()
+				.entry(*validator_index)
+				.or_default() += 1;
+		}
+	}
 }
 
 #[derive(Debug, Clone)]
@@ -1096,6 +1166,7 @@ where
 		per_block_assignments_gathering_times: LruMap::new(ByLength::new(
 			MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
 		)),
+		no_show_stats: NoShowStats::default(),
 	};
 
 	// `None` on start-up. Gets initialized/updated on leaf update
@@ -1740,6 +1811,8 @@ async fn handle_from_overseer<Context>(
 								"Imported new block.",
 							);
 
+							state.no_show_stats.maybe_print(block_batch.block_number);
+
 							for (c_hash, c_entry) in block_batch.imported_candidates {
 								metrics.on_candidate_imported();
 
@@ -2904,7 +2977,8 @@ where
 	let mut actions = Vec::new();
 	let block_hash = block_entry.block_hash();
 	let block_number = block_entry.block_number();
-
+	let session_index = block_entry.session();
+	let para_id = candidate_entry.candidate_receipt().descriptor().para_id;
 	let tick_now = state.clock.tick_now();
 
 	let (is_approved, status) = if let Some((approval_entry, status)) = state
@@ -3000,7 +3074,9 @@ where
 		if is_approved {
 			approval_entry.mark_approved();
 		}
-
+		if newly_approved {
+			state.record_no_shows(session_index, para_id.into(), &status.no_show_validators);
+		}
 		actions.extend(schedule_wakeup_action(
 			&approval_entry,
 			block_hash,

diff --git a/polkadot/node/core/approval-voting/src/tests.rs b/polkadot/node/core/approval-voting/src/tests.rs
@@ -5068,6 +5068,7 @@ fn test_gathering_assignments_statements() {
 		per_block_assignments_gathering_times: LruMap::new(ByLength::new(
 			MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
 		)),
+		no_show_stats: NoShowStats::default(),
 	};
 
 	for i in 0..200i32 {
@@ -5162,6 +5163,7 @@ fn test_observe_assignment_gathering_status() {
 		per_block_assignments_gathering_times: LruMap::new(ByLength::new(
 			MAX_BLOCKS_WITH_ASSIGNMENT_TIMESTAMPS,
 		)),
+		no_show_stats: NoShowStats::default(),
 	};
 
 	let metrics_inner = MetricsInner {