Skip to content

Commit

Permalink
storage controller: improve consistency_check_api (#10363)
Browse files Browse the repository at this point in the history
## Problem

Limitations found while using this to investigate
#10234:
- If we hit a node consistency issue, we drop out and don't check shards
for consistency
- The messages printed after a shard consistency issue are huge, and
grafana appears to drop them.

## Summary of changes

- Defer node consistency errors until the end of the function, so that
we always proceed to check shards for consistency
- Print out smaller log lines that just point out the diffs between
expected and persistent state
  • Loading branch information
jcsp authored Jan 13, 2025
1 parent de199d7 commit 12053cf
Showing 1 changed file with 54 additions and 5 deletions.
59 changes: 54 additions & 5 deletions storage_controller/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5256,7 +5256,8 @@ impl Service {
expect_nodes.sort_by_key(|n| n.node_id);
nodes.sort_by_key(|n| n.node_id);

if nodes != expect_nodes {
// Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error
let node_result = if nodes != expect_nodes {
tracing::error!("Consistency check failed on nodes.");
tracing::error!(
"Nodes in memory: {}",
Expand All @@ -5268,10 +5269,12 @@ impl Service {
serde_json::to_string(&nodes)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);
return Err(ApiError::InternalServerError(anyhow::anyhow!(
Err(ApiError::InternalServerError(anyhow::anyhow!(
"Node consistency failure"
)));
}
)))
} else {
Ok(())
};

let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
persistent_shards
Expand All @@ -5281,6 +5284,7 @@ impl Service {

if persistent_shards != expect_shards {
tracing::error!("Consistency check failed on shards.");

tracing::error!(
"Shards in memory: {}",
serde_json::to_string(&expect_shards)
Expand All @@ -5291,12 +5295,57 @@ impl Service {
serde_json::to_string(&persistent_shards)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);

// The total dump log lines above are useful in testing but in the field grafana will
// usually just drop them because they're so large. So we also do some explicit logging
// of just the diffs.
let persistent_shards = persistent_shards
.into_iter()
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
.collect::<HashMap<_, _>>();
let expect_shards = expect_shards
.into_iter()
.map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
.collect::<HashMap<_, _>>();
for (tenant_shard_id, persistent_tsp) in &persistent_shards {
match expect_shards.get(tenant_shard_id) {
None => {
tracing::error!(
"Shard {} found in database but not in memory",
tenant_shard_id
);
}
Some(expect_tsp) => {
if expect_tsp != persistent_tsp {
tracing::error!(
"Shard {} is inconsistent. In memory: {}, database has: {}",
tenant_shard_id,
serde_json::to_string(expect_tsp).unwrap(),
serde_json::to_string(&persistent_tsp).unwrap()
);
}
}
}
}

// Having already logged any differences, log any shards that simply aren't present in the database
for (tenant_shard_id, memory_tsp) in &expect_shards {
if !persistent_shards.contains_key(tenant_shard_id) {
tracing::error!(
"Shard {} found in memory but not in database: {}",
tenant_shard_id,
serde_json::to_string(memory_tsp)
.map_err(|e| ApiError::InternalServerError(e.into()))?
);
}
}

return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard consistency failure"
)));
}

Ok(())
node_result
}

/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
Expand Down

1 comment on commit 12053cf

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7433 tests run: 7057 passed, 1 failed, 375 skipped (full report)


Failures on Postgres 16

  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted]"
Flaky tests (2)

Postgres 17

Postgres 16

  • test_physical_replication_config_mismatch_max_locks_per_transaction: release-arm64

Code coverage* (full report)

  • functions: 32.7% (8049 of 24642 functions)
  • lines: 47.7% (66858 of 140073 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
12053cf at 2025-01-13T13:49:26.191Z :recycle:

Please sign in to comment.