Skip to content

Commit

Permalink
feat(pageserver): add compaction_upper_limit config (#10550)
Browse files Browse the repository at this point in the history
## Problem

Follow-up of the incident, we should not use the same bound on
lower/upper limit of compaction files. This patch adds an upper bound
limit, which is set to 50 for now.

## Summary of changes

Add `compaction_upper_limit`.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
  • Loading branch information
skyzh and problame authored Jan 28, 2025
1 parent b735df6 commit 983e18e
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 11 deletions.
5 changes: 5 additions & 0 deletions control_plane/src/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,11 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_upper_limit: settings
.remove("compaction_upper_limit")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_upper_limit' as an integer")?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
Expand Down
12 changes: 12 additions & 0 deletions libs/pageserver_api/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,11 @@ pub struct TenantConfigToml {
pub compaction_period: Duration,
/// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
/// Controls the amount of L0 included in a single compaction iteration.
/// The unit is `checkpoint_distance`, i.e., a size.
/// We add L0s to the set of layers to compact until their cumulative
/// size exceeds `compaction_upper_limit * checkpoint_distance`.
pub compaction_upper_limit: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
Expand Down Expand Up @@ -523,6 +528,12 @@ pub mod tenant_conf_defaults {

pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;

// This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on
// most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
// calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;

pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;

Expand Down Expand Up @@ -563,6 +574,7 @@ impl Default for TenantConfigToml {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT,
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
Expand Down
8 changes: 8 additions & 0 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,8 @@ pub struct TenantConfigPatch {
pub compaction_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_upper_limit: FieldPatch<usize>,
// defer parsing compaction_algorithm, like eviction_policy
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
Expand Down Expand Up @@ -522,6 +524,7 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
pub compaction_upper_limit: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub l0_flush_delay_threshold: Option<usize>,
Expand Down Expand Up @@ -559,6 +562,7 @@ impl TenantConfig {
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
Expand Down Expand Up @@ -594,6 +598,9 @@ impl TenantConfig {
.apply(&mut compaction_target_size);
patch.compaction_period.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch
.l0_flush_delay_threshold
Expand Down Expand Up @@ -653,6 +660,7 @@ impl TenantConfig {
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
Expand Down
2 changes: 2 additions & 0 deletions pageserver/src/http/openapi_spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,8 @@ components:
type: string
compaction_threshold:
type: string
compaction_upper_limit:
type: string
image_creation_threshold:
type: integer
walreceiver_connect_timeout:
Expand Down
8 changes: 8 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3816,6 +3816,13 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}

pub fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_upper_limit
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
}

pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
Expand Down Expand Up @@ -5469,6 +5476,7 @@ pub(crate) mod harness {
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
Expand Down
13 changes: 13 additions & 0 deletions pageserver/src/tenant/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_threshold: Option<usize>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_upper_limit: Option<usize>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
Expand Down Expand Up @@ -401,6 +405,9 @@ impl TenantConfOpt {
compaction_threshold: self
.compaction_threshold
.unwrap_or(global_conf.compaction_threshold),
compaction_upper_limit: self
.compaction_upper_limit
.unwrap_or(global_conf.compaction_upper_limit),
compaction_algorithm: self
.compaction_algorithm
.as_ref()
Expand Down Expand Up @@ -478,6 +485,7 @@ impl TenantConfOpt {
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_upper_limit,
mut compaction_algorithm,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
Expand Down Expand Up @@ -519,6 +527,9 @@ impl TenantConfOpt {
.map(|v| humantime::parse_duration(&v))?
.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch
.compaction_upper_limit
.apply(&mut compaction_upper_limit);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch
.l0_flush_delay_threshold
Expand Down Expand Up @@ -596,6 +607,7 @@ impl TenantConfOpt {
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_upper_limit,
compaction_algorithm,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
Expand Down Expand Up @@ -657,6 +669,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
compaction_target_size: value.compaction_target_size,
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,
compaction_upper_limit: value.compaction_upper_limit,
l0_flush_delay_threshold: value.l0_flush_delay_threshold,
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
l0_flush_wait_upload: value.l0_flush_wait_upload,
Expand Down
8 changes: 8 additions & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2181,6 +2181,14 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}

fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.compaction_upper_limit
.unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
}

fn get_l0_flush_delay_threshold(&self) -> Option<usize> {
// Disable L0 flushes by default. This and compaction needs further tuning.
const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3
Expand Down
13 changes: 2 additions & 11 deletions pageserver/src/tenant/timeline/compaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use pageserver_api::config::tenant_conf_defaults::{
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
};
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;

use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpace;
Expand Down Expand Up @@ -1117,14 +1115,7 @@ impl Timeline {
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
// work in this function to only operate on this much delta data at once.
//
// Take the max of the configured value & the default, so that tests that configure tiny values
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
// still let them compact a full stack of L0s in one go.
let delta_size_limit = std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
) as u64
let delta_size_limit = self.get_compaction_upper_limit() as u64
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

let mut fully_compacted = true;
Expand Down
1 change: 1 addition & 0 deletions test_runner/regress/test_attach_tenant_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
fully_custom_config = {
"compaction_period": "1h",
"compaction_threshold": 13,
"compaction_upper_limit": 100,
"l0_flush_delay_threshold": 25,
"l0_flush_stall_threshold": 42,
"l0_flush_wait_upload": True,
Expand Down

1 comment on commit 983e18e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7565 tests run: 7203 passed, 1 failed, 361 skipped (full report)


Failures on Postgres 16

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_compaction_l0_memory[release-pg16-github-actions-selfhosted]"
Flaky tests (9)

Postgres 17

Postgres 15

Postgres 14

Code coverage* (full report)

  • functions: 33.5% (8512 of 25428 functions)
  • lines: 49.2% (71541 of 145443 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
983e18e at 2025-01-29T01:52:02.403Z :recycle:

Please sign in to comment.