Commit b8bbaaf

storage controller: fix heatmaps getting disabled during shard split (#8197)
## Problem

At the start of `do_tenant_shard_split`, we drop any secondary locations for the parent shards. The reconciler uses the presence of secondary locations as a condition for enabling heatmaps. On the pageserver, child shards inherit their configuration from parents, but the storage controller assumes the child's ObservedState is the same as the parent's config from the prepare phase. The result is that some child shards end up with an inaccurate ObservedState, and until something next migrates or restarts, those tenant shards aren't uploading heatmaps, so their secondary locations download everything that was resident at the moment of the split (including ancestor layers, which are often cleaned up shortly after the split).

Closes: #8189

## Summary of changes

- Use `PlacementPolicy` to control enablement of heatmap upload, rather than the literal presence of secondaries in `IntentState`: this way we avoid switching heatmaps off during a shard split.
- test: during the tenant split test, assert that the child shards have heatmap uploads enabled.
1 parent e1a06b4 commit b8bbaaf
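
The heart of the change is in `attached_location_conf` (see the reconciler.rs diff below): heatmap enablement is now derived from the desired `PlacementPolicy` instead of the live secondary set. Here is a minimal, self-contained sketch of why that matters; the `PlacementPolicy` enum below is a local stand-in for `pageserver_api::controller_api::PlacementPolicy`, reduced to the variants this commit touches:

```rust
// Standalone sketch, not part of the commit.
#[allow(dead_code)]
enum PlacementPolicy {
    Attached(usize), // attached somewhere, with this many secondaries desired
    Secondary,
    Detached,
}

// Old condition: reads the *current* intent, whose secondaries are dropped
// at the start of do_tenant_shard_split.
fn heatmaps_enabled_old(intent_secondaries: &[u64]) -> bool {
    !intent_secondaries.is_empty()
}

// New condition: reads the *desired* placement, which a split leaves
// untouched (mirrors the match added to attached_location_conf).
fn heatmaps_enabled_new(policy: &PlacementPolicy) -> bool {
    match policy {
        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
            false
        }
        PlacementPolicy::Attached(_) => true,
    }
}

fn main() {
    // During a shard split the parent's secondary locations are dropped...
    let secondaries_during_split: Vec<u64> = Vec::new();
    // ...so the old check switches heatmap uploads off:
    assert!(!heatmaps_enabled_old(&secondaries_during_split));
    // ...while the policy still says "one secondary desired", keeping them on:
    assert!(heatmaps_enabled_new(&PlacementPolicy::Attached(1)));
    println!("policy-based condition keeps heatmaps enabled during a split");
}
```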

4 files changed: +23, -10


storage_controller/src/reconciler.rs (11 additions, 2 deletions)
```diff
@@ -1,6 +1,7 @@
 use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -29,6 +30,7 @@ pub(super) struct Reconciler {
     /// of a tenant's state from when we spawned a reconcile task.
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
+    pub(crate) placement_policy: PlacementPolicy,
     pub(crate) generation: Option<Generation>,
     pub(crate) intent: TargetState,

@@ -641,7 +643,7 @@ impl Reconciler {
             generation,
             &self.shard,
             &self.config,
-            !self.intent.secondary.is_empty(),
+            &self.placement_policy,
         );
         match self.observed.locations.get(&node.get_id()) {
             Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
@@ -801,8 +803,15 @@ pub(crate) fn attached_location_conf(
     generation: Generation,
     shard: &ShardIdentity,
     config: &TenantConfig,
-    has_secondaries: bool,
+    policy: &PlacementPolicy,
 ) -> LocationConfig {
+    let has_secondaries = match policy {
+        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
+            false
+        }
+        PlacementPolicy::Attached(_) => true,
+    };
+
     LocationConfig {
         mode: LocationConfigMode::AttachedSingle,
         generation: generation.into(),
```

storage_controller/src/service.rs (2 additions, 2 deletions)
```diff
@@ -1390,7 +1390,7 @@ impl Service {
                     tenant_shard.generation.unwrap(),
                     &tenant_shard.shard,
                     &tenant_shard.config,
-                    false,
+                    &PlacementPolicy::Attached(0),
                 )),
             },
         )]);
@@ -3321,7 +3321,7 @@ impl Service {
                     generation,
                     &child_shard,
                     &config,
-                    matches!(policy, PlacementPolicy::Attached(n) if n > 0),
+                    &policy,
                 )),
             },
         );
```
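
Worth noting about these two call sites (a hedged aside, not from the commit): both substitutions preserve the previous behavior. The first passed a literal `false` and now passes `&PlacementPolicy::Attached(0)`, which the new match also maps to `false`; the second's old `matches!(policy, PlacementPolicy::Attached(n) if n > 0)` computes exactly the truth table of the new match. In terms of the earlier sketch:

```rust
// Continuing the earlier sketch: both substitutions are behavior-preserving.
assert!(!heatmaps_enabled_new(&PlacementPolicy::Attached(0))); // was the literal `false`
assert!(heatmaps_enabled_new(&PlacementPolicy::Attached(2))); // was matches!(.., n if n > 0)
```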

storage_controller/src/tenant_shard.rs (3 additions, 6 deletions)
```diff
@@ -908,12 +908,8 @@ impl TenantShard {
             .generation
             .expect("Attempted to enter attached state without a generation");

-        let wanted_conf = attached_location_conf(
-            generation,
-            &self.shard,
-            &self.config,
-            !self.intent.secondary.is_empty(),
-        );
+        let wanted_conf =
+            attached_location_conf(generation, &self.shard, &self.config, &self.policy);
         match self.observed.locations.get(&node_id) {
             Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
             Some(_) | None => {
@@ -1099,6 +1095,7 @@ impl TenantShard {
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
+            placement_policy: self.policy.clone(),
             generation: self.generation,
             intent: reconciler_intent,
             detach,
```

test_runner/regress/test_sharding.py (7 additions)
```diff
@@ -542,6 +542,13 @@ def check_effective_tenant_config():
         for k, v in non_default_tenant_config.items():
             assert config.effective_config[k] == v

+        # Check that heatmap uploads remain enabled after shard split
+        # (https://github.com/neondatabase/neon/issues/8189)
+        assert (
+            config.effective_config["heatmap_period"]
+            and config.effective_config["heatmap_period"] != "0s"
+        )
+
     # Validate pageserver state: expect every child shard to have an attached and secondary location
     (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
     assert sum(attached.values()) == split_shard_count
```
