diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 4263bacce8d1..2daed9038688 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -229,8 +229,13 @@ jobs: ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' # run pageserver tests with different settings - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for get_vectored_concurrent_io in sequential sidecar-task; do + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 @@ -314,6 +319,7 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4fc81dccaae0..b1230879d34d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -892,14 +892,14 @@ jobs: run: | for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }} for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + neondatabase/compute-node-${version}:${{ needs.tag.outputs.build-tag }} docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ diff --git a/Cargo.lock b/Cargo.lock index 02b02a09c1d2..1f090a27e496 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6774,7 +6774,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "futures", "nix 0.26.4", @@ -7004,12 +7004,9 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ - "async-stream", "async-trait", - "axum", "base64 0.22.1", "bytes", - "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", "http-body-util", @@ -7021,7 +7018,6 @@ dependencies = [ "prost", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", - "socket2", "tokio", "tokio-rustls 0.26.0", "tokio-stream", @@ -7373,7 +7369,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = 
"git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "bytes", "io-uring", @@ -7582,7 +7578,6 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-util", - "tonic", "tonic-build", "tracing", "utils", @@ -7991,8 +7986,6 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "axum", - "axum-core", "base64 0.13.1", "base64 0.21.1", "base64ct", @@ -8073,7 +8066,6 @@ dependencies = [ "toml_edit", "tonic", "tower 0.4.13", - "tower 0.5.2", "tracing", "tracing-core", "url", diff --git a/Cargo.toml b/Cargo.toml index a4e601bb589c..6e1e2888952a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -187,7 +187,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} +tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]} tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["request-id", "trace"] } tower-service = "0.3.3" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e2e402f8c611..8d5d20266abc 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -360,6 +360,8 @@ COPY compute/patches/pgvector.patch /pgvector.patch RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ + wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ + echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index 3e1ffcaaaf2c..da41c86140b9 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -1,8 +1,24 @@ +diff --git a/Makefile b/Makefile +index 7a4b88c..56678af 100644 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0 + + MODULE_big = vector + DATA = $(wildcard sql/*--*--*.sql) +-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql ++# This change is needed to install different per-version SQL files ++# like pgvector--0.8.0.sql and pgvector--0.7.4.sql ++# The corresponding file is downloaded during the Docker image build process ++DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql + OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o + HEADERS = src/halfvec.h src/sparsevec.h src/vector.h + diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index dcfb2bd..d5189ee 100644 +index b667478..fc1897c 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c -@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) +@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); @@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index df81b44f2d94..b33b2877b3ea 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -423,6 +423,21 @@ impl PageServerNode { .map(|x| x.parse::<bool>()) .transpose() .context("Failed to parse 'rel_size_v2_enabled' as bool")?, + gc_compaction_enabled: settings + .remove("gc_compaction_enabled") + .map(|x| x.parse::<bool>()) + .transpose() + .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_initial_threshold_kb: settings + .remove("gc_compaction_initial_threshold_kb") + .map(|x| x.parse::<u64>()) + .transpose() + .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?, + gc_compaction_ratio_percent: settings + .remove("gc_compaction_ratio_percent") + .map(|x| x.parse::<u64>()) + .transpose() + .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") } diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 94df22938f03..ded59b0005f7 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -9,8 +9,11 @@ LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) for d in ${LIST} do [ -d "${d}" ] || continue - psql -c "select 1" >/dev/null || break - make -C "${d}" 
installcheck || FAILED="${d} ${FAILED}" + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7fb7a9d54e3f..4982c6233d1d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -120,6 +120,7 @@ pub struct ConfigToml { pub no_sync: Option<bool>, pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, + pub get_vectored_concurrent_io: GetVectoredConcurrentIo, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum GetVectoredConcurrentIo { + /// The read path is fully sequential: layers are visited + /// one after the other and IOs are issued and waited upon + /// from the same task that traverses the layers. + Sequential, + /// The read path still traverses layers sequentially, and + /// index blocks will be read into the PS PageCache from + /// that task, with waiting. + /// But data IOs are dispatched and waited upon from a sidecar + /// task so that the traversing task can continue to traverse + /// layers while the IOs are in flight. + /// If the PS PageCache miss rate is low, this improves + /// throughput dramatically. + SidecarTask, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -305,6 +325,16 @@ pub struct TenantConfigToml { /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into /// `index_part.json`, and it cannot be reversed. pub rel_size_v2_enabled: Option<bool>, + + // gc-compaction related configs + /// Enable automatic gc-compaction trigger on this tenant. + pub gc_compaction_enabled: bool, + /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, + /// gc-compaction will be triggered. + pub gc_compaction_initial_threshold_kb: u64, + /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) + /// is above this ratio, gc-compaction will be triggered. + pub gc_compaction_ratio_percent: u64, } pub mod defaults { @@ -454,6 +484,11 @@ impl Default for ConfigToml { execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, }) }, + get_vectored_concurrent_io: if !cfg!(test) { + GetVectoredConcurrentIo::Sequential + } else { + GetVectoredConcurrentIo::SidecarTask + }, } } }
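Review note: given the `#[serde(tag = "mode", rename_all = "kebab-case")]` representation declared above, the new knob round-trips as an internally tagged value; in `pageserver.toml` that would be a `[get_vectored_concurrent_io]` table with a `mode` key. A minimal sketch, assuming the `toml` crate:

```rust
// Sketch: deserializing the internally tagged enum declared above.
// In pageserver.toml this corresponds to:
//   [get_vectored_concurrent_io]
//   mode = "sidecar-task"
use pageserver_api::config::GetVectoredConcurrentIo;

fn parse_mode(s: &str) -> GetVectoredConcurrentIo {
    toml::from_str(s).expect("mode must be \"sequential\" or \"sidecar-task\"")
}

fn example() {
    let io = parse_mode(r#"mode = "sidecar-task""#);
    assert!(matches!(io, GetVectoredConcurrentIo::SidecarTask));
}
```

@@ -498,6 +533,9 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image layers should be created. 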
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000; + pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } impl Default for TenantConfigToml { @@ -543,6 +581,9 @@ impl Default for TenantConfigToml { timeline_offloading: false, wal_receiver_protocol_override: None, rel_size_v2_enabled: None, + gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, + gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1538134c96d1..fd4879087fc4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -499,6 +499,12 @@ pub struct TenantConfigPatch { pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub rel_size_v2_enabled: FieldPatch<bool>, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_enabled: FieldPatch<bool>, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_initial_threshold_kb: FieldPatch<u64>, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_ratio_percent: FieldPatch<u64>, } /// An alternative representation of `pageserver::tenant::TenantConf` with @@ -531,6 +537,9 @@ pub struct TenantConfig { pub timeline_offloading: Option<bool>, pub wal_receiver_protocol_override: Option<PostgresClientProtocol>, pub rel_size_v2_enabled: Option<bool>, + pub gc_compaction_enabled: Option<bool>, + pub gc_compaction_initial_threshold_kb: Option<u64>, + pub gc_compaction_ratio_percent: Option<u64>, } impl TenantConfig { @@ -561,6 +570,9 @@ impl TenantConfig { mut timeline_offloading, mut wal_receiver_protocol_override, mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -606,6 +618,15 @@ impl TenantConfig { .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Self { checkpoint_distance, @@ -633,6 +654,9 @@ impl TenantConfig { timeline_offloading, wal_receiver_protocol_override, rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, } } } diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index b3e326bfd0f4..a1bcec92290a 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -2,6 +2,7 @@ use std::{fmt::Display, str::FromStr}; +/// For types `V` that implement [`FromStr`]. pub fn var<V>(varname: &str) -> Option<V> where V: FromStr, @@ -10,7 +11,9 @@ where match std::env::var(varname) { Ok(s) => Some( s.parse() - .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .map_err(|e| { + format!("failed to parse env var {varname} using FromStr::parse: {e:#}") + }) .unwrap(), ), Err(std::env::VarError::NotPresent) => None, @@ -19,3 +22,24 @@ where } } }
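Review note: these env helpers are what the new `NEON_PAGESERVER_UNIT_TEST_*` matrix variables from the CI workflow flow through. A hypothetical call site for the `FromStr`-based helper above:

```rust
// Hypothetical call site: the CI matrix in this PR exports
// NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO, and test setup can
// read it back. None means the variable was not set at all; a set-but-bad
// value panics with the "failed to parse env var ... using FromStr::parse"
// message produced above.
fn concurrent_io_setting_for_tests() -> Option<String> {
    utils::env::var("NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO")
}
```

+ +/// For types `V` that implement [`serde::de::DeserializeOwned`]. 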
+pub fn var_serde_json_string<V>(varname: &str) -> Option<V> +where + V: serde::de::DeserializeOwned, +{ + match std::env::var(varname) { + Ok(s) => Some({ + let value = serde_json::Value::String(s); + serde_json::from_value(value) + .map_err(|e| { + format!("failed to parse env var {varname} as a serde_json json string: {e:#}") + }) + .unwrap() + }), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 701ba2d42cb9..272c6ebb26a4 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -11,31 +11,55 @@ use tracing::*; /// Declare a failpoint that can use the `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. +/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its pause when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. +/// The macro evaluates to a Result in that case, where Ok(()) is the case +/// where the failpoint was not paused, and Err(()) is the case where the cancellation +/// token fired while evaluating the failpoint. +/// +/// Remember to unpause the failpoint in the test; until that happens, one of the +/// limited number of spawn_blocking thread pool threads is leaked. #[macro_export] macro_rules! pausable_failpoint { - ($name:literal) => { + ($name:literal) => {{ + if cfg!(feature = "testing") { + let cancel = ::tokio_util::sync::CancellationToken::new(); + let _ = $crate::pausable_failpoint!($name, &cancel); + } + }}; + ($name:literal, $cancel:expr) => {{ if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); + let failpoint_fut = ::tokio::task::spawn_blocking({ + let current = ::tracing::Span::current(); move || { let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); + ::tracing::info!("at failpoint {}", $name); + ::fail::fail_point!($name); + } + }); + let cancel_fut = async move { + $cancel.cancelled().await; + }; + ::tokio::select! { + res = failpoint_fut => { + res.expect("spawn_blocking"); + // continue with execution + Ok(()) + }, + _ = cancel_fut => { + Err(()) } - }) - .await - .expect("spawn_blocking"); - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) } + } else { + Ok(()) } - }; + }}; } +pub use pausable_failpoint; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 16ec563fa71d..0a1ed816210c 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -64,6 +64,12 @@ pub struct GateGuard { gate: Arc<GateInner>, } +impl GateGuard { + pub fn try_clone(&self) -> Result<Self, GateError> { + Gate::enter_impl(self.gate.clone()) + } +} + impl Drop for GateGuard { fn drop(&mut self) { if self.gate.closing.load(Ordering::Relaxed) {
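Review note: `try_clone()` lets a connection-scoped guard fan out per-request guards, which is how the page service uses it later in this diff. A sketch of that usage pattern (the caller names are hypothetical):

```rust
use utils::sync::gate::{GateError, GateGuard};

// Sketch: hold one GateGuard per connection and derive per-request guards
// from it, so that Gate::close() waits for all of them. try_clone() refuses
// (returns GateError::GateClosed) once close() has begun.
fn spawn_request_task(conn_guard: &GateGuard) -> Result<(), GateError> {
    let req_guard = conn_guard.try_clone()?;
    tokio::spawn(async move {
        // ... handle one request while holding req_guard ...
        drop(req_guard); // releases the gate permit
    });
    Ok(())
}
```

@@ -107,11 +113,11 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. 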
pub fn enter(&self) -> Result<GateGuard, GateError> { - let permit = self - .inner - .sem - .try_acquire() - .map_err(|_| GateError::GateClosed)?; + Self::enter_impl(self.inner.clone()) + } + + fn enter_impl(gate: Arc<GateInner>) -> Result<GateGuard, GateError> { + let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?; // we now have the permit, let's disable the normal raii functionality and leave // "returning" the permit to our GateGuard::drop. @@ -122,7 +128,7 @@ impl Gate { Ok(GateGuard { span_at_enter: tracing::Span::current(), - gate: self.inner.clone(), + gate, }) } @@ -252,4 +258,39 @@ mod tests { // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); } + + #[tokio::test(start_paused = true)] + async fn clone_gate_guard() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); + + let guard1 = gate.enter().expect("gate isn't closed"); + + let guard2 = guard1.try_clone().expect("gate isn't closed"); + + let mut close_fut = std::pin::pin!(gate.close()); + + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + // we polled close_fut once, that should prevent all later enters and clones + gate.enter().unwrap_err(); + guard1.try_clone().unwrap_err(); + guard2.try_clone().unwrap_err(); + + // guard2 keeps gate open even if guard1 is closed + drop(guard1); + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + drop(guard2); + + // now that the last guard is dropped, closing should complete + close_fut.await; + + // entering is still forbidden + gate.enter().expect_err("enter should still fail"); + } } diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 09c4afb18a59..cb0ef4b00d59 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -17,7 +17,6 @@ postgres_ffi.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } -tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index c2f9125b2107..51bf7e44ab79 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -45,7 +45,7 @@ pub mod proto { #![allow(clippy::derive_partial_eq_without_eq)] // The generated ValueMeta has a `len` method generated for its `len` field. #![allow(clippy::len_without_is_empty)] - tonic::include_proto!("interpreted_wal"); + include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs"))); } #[derive(Copy, Clone, Serialize, Deserialize)] diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 9f3984f1bd0a..a60efc756740 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -13,7 +13,7 @@ use rand::prelude::*; use tokio::task::JoinSet; use tracing::info; -use std::collections::HashSet; +use std::collections::{HashSet, VecDeque}; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin;
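Review note: with `--queue-depth` greater than 1, each benchmark client keeps a window of GetPage requests in flight instead of running in request/response lockstep. A sketch of the pattern, with `Client` and `make_random_request` as hypothetical stand-ins for the diff's actual client and request construction; the `getpage_send`/`getpage_recv` split is the API this diff introduces:

```rust
use std::collections::VecDeque;
use std::time::Instant;

// Sketch of the windowed pipeline the rewritten loop below implements.
async fn run_pipelined(client: &mut Client, queue_depth: usize) -> anyhow::Result<()> {
    let mut inflight: VecDeque<Instant> = VecDeque::new();
    loop {
        // fill the window: issue requests without waiting for responses
        while inflight.len() < queue_depth {
            client.getpage_send(make_random_request()).await?; // hypothetical request ctor
            inflight.push_back(Instant::now());
        }
        // drain the oldest response; latency is still measured per request
        let sent_at = inflight.pop_front().expect("window is non-empty");
        client.getpage_recv().await?;
        let _latency = sent_at.elapsed();
    }
}
```

@@ -63,6 +63,10 @@ pub(crate) struct Args { #[clap(long)] set_io_mode: Option<IoMode>, + /// Queue depth generated in each client. 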
+ #[clap(long, default_value = "1")] + queue_depth: NonZeroUsize, + targets: Option>, } @@ -298,6 +302,7 @@ async fn main_impl( start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; + let mut inflight = VecDeque::new(); while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -311,31 +316,37 @@ async fn main_impl( ticks_processed = periods_passed_until_now; } - let start = Instant::now(); - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid: 0, - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, }, - not_modified_since: r.timeline_lsn, - }, - rel: rel_tag, - blkno: block_no, - } - }; - client.getpage(req).await.unwrap(); + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage_send(req).await.unwrap(); + inflight.push_back(start); + } + + let start = inflight.pop_front().unwrap(); + client.getpage_recv().await.unwrap(); let end = Instant::now(); live_stats.request_done(); ticks_processed += 1; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index e1b5676f464b..a6087920fd60 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,6 +25,7 @@ use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -123,6 +124,13 @@ where full_backup, replica, ctx, + io_concurrency: IoConcurrency::spawn_from_conf( + timeline.conf, + timeline + .gate + .enter() + .map_err(|e| BasebackupError::Server(e.into()))?, + ), }; basebackup .send_tarball() @@ -144,6 +152,7 @@ where full_backup: bool, replica: bool, ctx: &'a RequestContext, + io_concurrency: IoConcurrency, } /// A sink that accepts SLRU blocks ordered by key and forwards @@ -303,7 +312,7 @@ where for part in slru_partitions.parts { let blocks = self .timeline - .get_vectored(part, self.lsn, self.ctx) + .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; @@ -358,7 +367,7 @@ where let start_time = Instant::now(); let aux_files = self .timeline - .list_aux_files(self.lsn, self.ctx) + .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let aux_scan_time = start_time.elapsed(); @@ -422,7 +431,7 @@ where } let repl_origins = self .timeline - .get_replorigins(self.lsn, self.ctx) + 
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let n_origins = repl_origins.len(); @@ -489,7 +498,13 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .get_rel_page_at_lsn( + src, + blknum, + Version::Lsn(self.lsn), + self.ctx, + self.io_concurrency.clone(), + ) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 921c6a509218..576472850589 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -135,6 +135,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); + info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1651db8500c4..ce480c70a08d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -191,6 +191,8 @@ pub struct PageServerConf { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, + + pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, } /// Token for authentication to safekeepers @@ -352,6 +354,7 @@ impl PageServerConf { no_sync, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, } = config_toml; let mut conf = PageServerConf { @@ -396,6 +399,7 @@ impl PageServerConf { import_pgdata_aws_endpoint_url, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 33b2d0458855..5452719bcd54 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -84,6 +84,7 @@ use crate::tenant::remote_timeline_client::list_remote_tenant_shards; use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; @@ -2938,8 +2939,15 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + state.conf, + timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; + let files = timeline + .list_aux_files(body.lsn, &ctx, io_concurrency) + .await?; json_response(StatusCode::OK, files) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4758aaf2306f..02467cb6f705 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ 
-126,73 +126,6 @@ pub(crate) static INITDB_RUN_TIME: Lazy<Histogram> = Lazy::new(|| { .expect("failed to define metric") }); -// Metrics collected on operations on the storage repository. -#[derive( - Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, -)] -pub(crate) enum GetKind { - Singular, - Vectored, -} - -pub(crate) struct ReconstructTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), - ) - .expect("failed to define a metric"); - - ReconstructTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } -}); - -impl ReconstructTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) struct ReconstructDataTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -impl ReconstructDataTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_get_reconstruct_data_seconds", - "Time spent in get_reconstruct_value_data", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), - ) - .expect("failed to define a metric"); - - ReconstructDataTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } -}); - pub(crate) struct GetVectoredLatency { map: EnumMap<TaskKind, Option<Histogram>>, } @@ -2550,12 +2483,19 @@ impl StorageTimeMetricsTimer { } } - /// Record the time from creation to now. - pub fn stop_and_record(self) { - let duration = self.start.elapsed().as_secs_f64(); - self.metrics.timeline_sum.inc_by(duration); + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.start.elapsed() + } + + /// Record the time from creation to now and return it. + pub fn stop_and_record(self) -> Duration { + let duration = self.elapsed(); + let seconds = duration.as_secs_f64(); + self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); - self.metrics.global_histogram.observe(duration); + self.metrics.global_histogram.observe(seconds); + duration } /// Turns this timer into a timer, which will always record -- usually this means recording @@ -2575,6 +2515,14 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { } } +impl AlwaysRecordingStorageTimeMetricsTimer { + /// Returns the elapsed duration of the timer. + #[allow(unused)] + pub fn elapsed(&self) -> Duration { + self.0.as_ref().expect("not dropped yet").elapsed() + } +} +
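Review note: the timer change here is behavioral, not just cosmetic: `stop_and_record()` now returns the `Duration` it recorded. A sketch of a call site under assumed names (`start_timer` on the owning metrics object and `flush_frozen_layer` are stand-ins, not part of this diff):

```rust
use std::time::Duration;

// Sketch of the new contract: a call site can reuse the recorded measurement
// for logging instead of timing the same region twice.
async fn flush_with_timing(metrics: &StorageTimeMetrics) -> Duration {
    let timer = metrics.start_timer(); // assumed constructor for the timer
    flush_frozen_layer().await; // hypothetical timed operation
    timer.stop_and_record() // records sum/count/histogram and returns the Duration
}
```

/// Timing facilities for a globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 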
#[derive(Clone, Debug)] @@ -3919,7 +3867,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { }); // Custom - Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b14a44f9e3aa..e5063b7fc217 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -39,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, @@ -61,6 +62,7 @@ use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -90,6 +92,7 @@ pub struct Listener { pub struct Connections { cancel: CancellationToken, tasks: tokio::task::JoinSet, + gate: Gate, } pub fn spawn( @@ -110,6 +113,7 @@ pub fn spawn( let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "libpq listener", libpq_listener_main( + conf, tenant_manager, pg_auth, tcp_listener, @@ -134,11 +138,16 @@ impl Listener { } impl Connections { pub(crate) async fn shutdown(self) { - let Self { cancel, mut tasks } = self; + let Self { + cancel, + mut tasks, + gate, + } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { Self::handle_connection_completion(res); } + gate.close().await; } fn handle_connection_completion(res: Result, tokio::task::JoinError>) { @@ -158,7 +167,9 @@ impl Connections { /// Returns Ok(()) upon cancellation via `cancel`, returning the set of /// open connections. /// +#[allow(clippy::too_many_arguments)] pub async fn libpq_listener_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, listener: tokio::net::TcpListener, @@ -168,9 +179,15 @@ pub async fn libpq_listener_main( listener_cancel: CancellationToken, ) -> Connections { let connections_cancel = CancellationToken::new(); + let connections_gate = Gate::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); loop { + let gate_guard = match connections_gate.enter() { + Ok(guard) => guard, + Err(_) => break, + }; + let accepted = tokio::select! 
{ biased; _ = listener_cancel.cancelled() => break, @@ -190,6 +207,7 @@ pub async fn libpq_listener_main( let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); connection_handler_tasks.spawn(page_service_conn_main( + conf, tenant_manager.clone(), local_auth, socket, @@ -197,6 +215,7 @@ pub async fn libpq_listener_main( pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), + gate_guard, )); } Err(err) => { @@ -211,13 +230,16 @@ pub async fn libpq_listener_main( Connections { cancel: connections_cancel, tasks: connection_handler_tasks, + gate: connections_gate, } } type ConnectionHandlerResult = anyhow::Result<()>; #[instrument(skip_all, fields(peer_addr))] +#[allow(clippy::too_many_arguments)] async fn page_service_conn_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, @@ -225,6 +247,7 @@ async fn page_service_conn_main( pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) @@ -274,11 +297,13 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. let mut conn_handler = PageServerHandler::new( + conf, tenant_manager, auth, pipelining_config, connection_ctx, cancel.clone(), + gate_guard, ); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; @@ -310,6 +335,7 @@ async fn page_service_conn_main( } struct PageServerHandler { + conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -325,6 +351,8 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + + gate_guard: GateGuard, } struct TimelineHandles { @@ -634,19 +662,23 @@ impl BatchedFeMessage { impl PageServerHandler { pub fn new( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> Self { PageServerHandler { + conf, auth, claims: None, connection_ctx, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + gate_guard, } } @@ -1015,6 +1047,7 @@ impl PageServerHandler { &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, + io_concurrency: IoConcurrency, cancel: &CancellationToken, protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, @@ -1084,6 +1117,7 @@ impl PageServerHandler { &*shard.upgrade()?, effective_request_lsn, pages, + io_concurrency, ctx, ) .instrument(span.clone()) @@ -1288,6 +1322,17 @@ impl PageServerHandler { } } + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + match self.gate_guard.try_clone() { + Ok(guard) => guard, + Err(_) => { + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown); + } + }, + ); + let pgb_reader = pgb .split() .context("implementation error: split pgb into reader and writer")?; @@ -1309,6 +1354,7 @@ impl PageServerHandler { request_span, pipelining_config, protocol_version, + io_concurrency, &ctx, ) .await @@ -1322,6 +1368,7 @@ impl PageServerHandler { timeline_handles, request_span, protocol_version, + io_concurrency, &ctx, ) .await @@ -1349,6 +1396,7 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, 
request_span: Span, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1383,7 +1431,14 @@ impl PageServerHandler { }; let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx) + .pagesteam_handle_batched_message( + pgb_writer, + msg, + io_concurrency.clone(), + &cancel, + protocol_version, + ctx, + ) .await; match err { Ok(()) => {} @@ -1407,6 +1462,7 @@ impl PageServerHandler { request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1550,6 +1606,7 @@ impl PageServerHandler { self.pagesteam_handle_batched_message( pgb_writer, batch, + io_concurrency.clone(), &cancel, protocol_version, &ctx, @@ -1806,6 +1863,7 @@ impl PageServerHandler { timeline: &Timeline, effective_lsn: Lsn, requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1832,6 +1890,7 @@ impl PageServerHandler { .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, + io_concurrency, ctx, ) .await; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b65fe6cf7ce3..40c657524dff 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -17,6 +17,7 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; @@ -200,6 +201,7 @@ impl Timeline { blknum: BlockNumber, version: Version<'_>, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result { match version { Version::Lsn(effective_lsn) => { @@ -208,6 +210,7 @@ impl Timeline { .get_rel_page_at_lsn_batched( pages.iter().map(|(tag, blknum)| (tag, blknum)), effective_lsn, + io_concurrency.clone(), ctx, ) .await; @@ -246,6 +249,7 @@ impl Timeline { &self, pages: impl ExactSizeIterator, effective_lsn: Lsn, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -309,7 +313,10 @@ impl Timeline { acc.to_keyspace() }; - match self.get_vectored(keyspace, effective_lsn, ctx).await { + match self + .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) + .await + { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); @@ -889,9 +896,15 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .scan( + KeySpace::single(Key::metadata_aux_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); let mut sz = 0; @@ -914,8 +927,9 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result<(), PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await?; + self.list_aux_files_v2(lsn, ctx, io_concurrency).await?; Ok(()) } @@ -923,17 +937,24 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) 
-> Result, PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await + self.list_aux_files_v2(lsn, ctx, io_concurrency).await } pub(crate) async fn get_replorigins( &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .scan( + KeySpace::single(repl_origin_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); for (k, v) in kv { @@ -2432,7 +2453,11 @@ mod tests { ("foo/bar2".to_string(), Bytes::from_static(b"content2")), ]); - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); // Second modification: update one key, remove the other @@ -2444,11 +2469,15 @@ mod tests { let expect_2008 = HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); - let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_2008); // Reading back in time works - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 05a311391cb6..a273ef5d018f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5476,6 +5476,11 @@ pub(crate) mod harness { timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled, + gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: Some( + tenant_conf.gc_compaction_initial_threshold_kb, + ), + gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent), } } } @@ -5709,7 +5714,7 @@ mod tests { use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; use rand::{thread_rng, Rng}; - use storage_layer::PersistentLayerKey; + use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::{CompactOptions, DeltaLayerTestDesc}; @@ -6490,6 +6495,7 @@ mod tests { async fn test_get_vectored() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6554,7 +6560,7 @@ mod tests { .get_vectored_impl( read.clone(), reads_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6601,6 +6607,7 @@ mod tests { let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6635,7 +6642,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - &mut ValuesReconstructState::new(), + &mut 
ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6683,6 +6690,7 @@ mod tests { ) .await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let gap_at_key = current_key.add(100); @@ -6783,7 +6791,7 @@ mod tests { .get_vectored_impl( read.clone(), current_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await?; @@ -6826,6 +6834,7 @@ mod tests { async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_key = start_key.add(1000); @@ -6918,7 +6927,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -7364,6 +7373,7 @@ mod tests { async fn test_metadata_scan() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7417,7 +7427,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await? @@ -7532,6 +7542,7 @@ mod tests { let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut lsn = Lsn(0x08); @@ -7551,7 +7562,10 @@ mod tests { } // we can read everything from the storage - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test1"), Some(&bytes::Bytes::from_static(b"first")) @@ -7567,7 +7581,10 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), Some(&bytes::Bytes::from_static(b"second")) @@ -7578,7 +7595,10 @@ mod tests { .await .unwrap(); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + let files = child + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); } @@ -7587,6 +7607,7 @@ mod tests { async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7606,8 +7627,9 @@ mod tests { keyspace: &KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = 
ValuesReconstructState::new(io_concurrency); let res = tline .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -7655,7 +7677,8 @@ mod tests { if iter % 5 == 0 { let (_, before_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; tline .compact( &cancel, @@ -7669,7 +7692,8 @@ mod tests { ) .await?; let (_, after_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( @@ -7758,6 +7782,7 @@ mod tests { async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -7896,7 +7921,7 @@ mod tests { ); // test vectored scan on parent timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = tline .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7922,7 +7947,7 @@ mod tests { ); // test vectored scan on child timeline - let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); let res = child .get_vectored_impl( KeySpace::single(Key::metadata_key_range()), @@ -7960,7 +7985,9 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let io_concurrency = + IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let mut res = tline .get_vectored_impl( KeySpace::single(key..key.next()), @@ -8061,6 +8088,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -8120,7 +8148,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x40), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8135,6 +8163,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); @@ -8185,7 +8214,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8198,6 
+8227,7 @@ mod tests { async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. @@ -8339,7 +8369,7 @@ mod tests { // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -10052,7 +10082,12 @@ mod tests { let keyspace = KeySpace::single(get_key(0)..get_key(10)); let results = tline - .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .get_vectored( + keyspace, + delta_layer_end_lsn, + IoConcurrency::sequential(), + &ctx, + ) .await .expect("No vectored errors"); for (key, res) in results { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 14d8e9ccd411..3db1445f6e89 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -360,6 +360,15 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub rel_size_v2_enabled: Option<bool>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_enabled: Option<bool>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_initial_threshold_kb: Option<u64>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_ratio_percent: Option<u64>, } impl TenantConfOpt { @@ -429,6 +438,15 @@ impl TenantConfOpt { .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled), + gc_compaction_enabled: self + .gc_compaction_enabled + .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: self + .gc_compaction_initial_threshold_kb + .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), + gc_compaction_ratio_percent: self + .gc_compaction_ratio_percent + .unwrap_or(global_conf.gc_compaction_ratio_percent), } } @@ -459,6 +477,9 @@ impl TenantConfOpt { mut timeline_offloading, mut wal_receiver_protocol_override, mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -528,6 +549,15 @@ impl TenantConfOpt { .wal_receiver_protocol_override .apply(&mut wal_receiver_protocol_override); patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); Ok(Self { checkpoint_distance, @@ -555,6 +585,9 @@ impl TenantConfOpt { timeline_offloading, wal_receiver_protocol_override, rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, }) } } @@ -611,6 +644,9 @@ impl From<TenantConfOpt> for models::TenantConfig { timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, + gc_compaction_enabled: value.gc_compaction_enabled, + gc_compaction_initial_threshold_kb: 
value.gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent: value.gc_compaction_ratio_percent, } } } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 1b6924425c25..a69cce932e6b 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -57,6 +57,7 @@ use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; +use tokio::sync::watch; use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; @@ -67,7 +68,6 @@ use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new @@ -93,7 +93,25 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + /// + /// NB: make sure to notify `watch_l0_deltas` on changes. l0_delta_layers: Vec<Arc<PersistentLayerDesc>>, + + /// Notifies about L0 delta layer changes, sending the current number of L0 layers. + watch_l0_deltas: watch::Sender<usize>, +} + +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: Default::default(), + next_open_layer_at: Default::default(), + frozen_layers: Default::default(), + historic: Default::default(), + l0_delta_layers: Default::default(), + watch_l0_deltas: watch::channel(0).0, + } + } } /// The primary update API for the layer map. @@ -466,6 +484,8 @@ impl LayerMap { if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); } self.historic.insert( @@ -488,6 +508,8 @@ impl LayerMap { let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. @@ -850,6 +872,11 @@ impl LayerMap { &self.l0_delta_layers } + /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers. + pub fn watch_level0_deltas(&self) -> watch::Receiver<usize> { + self.watch_l0_deltas.subscribe() + } + /// debugging function to print out the contents of the layer map #[allow(unused)] pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
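Review note: `watch_level0_deltas()` turns L0 pressure into a push-style signal instead of something callers must poll the layer map for. A sketch of a consumer, with the threshold and scheduling function as assumptions rather than anything in this diff:

```rust
use tokio::sync::watch;

// Sketch: wake up whenever the L0 delta layer count changes and decide
// whether to schedule compaction. The sender side is LayerMap's
// watch_l0_deltas, which send_replace()s the count on insert/remove.
async fn l0_backpressure_loop(mut rx: watch::Receiver<usize>, threshold: usize) {
    while rx.changed().await.is_ok() {
        let l0_count = *rx.borrow();
        if l0_count >= threshold {
            schedule_l0_compaction(); // hypothetical scheduling hook
        }
    }
    // the loop ends when the LayerMap (sender) is dropped
}
```

diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 30b6b07ca360..3824bc8f116d 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -77,11 +77,17 @@ pub struct IndexPart { /// /// None means no aux files have been written to the storage before the point /// when this flag is introduced. + /// + /// This flag is not used any more as all tenants have been transitioned to the new aux file policy. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option<AuxFilePolicy>, #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option<RelSizeMigration>, + + /// The LSN of gc-compaction horizon. 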
Once gc-compaction is finished for all layer files below an LSN, this LSN will be updated. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) l2_lsn: Option, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -116,10 +122,11 @@ impl IndexPart { /// - 9: +gc_blocking /// - 10: +import_pgdata /// - 11: +rel_size_migration - const LATEST_VERSION: usize = 11; + /// - 12: +l2_lsn + const LATEST_VERSION: usize = 12; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -136,6 +143,7 @@ impl IndexPart { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, } } @@ -437,6 +445,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -483,6 +492,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -530,6 +540,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -580,6 +591,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -625,6 +637,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -673,6 +686,7 @@ mod tests { last_aux_file_policy: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -726,6 +740,7 @@ mod tests { last_aux_file_policy: Some(AuxFilePolicy::V2), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -784,6 +799,7 @@ mod tests { last_aux_file_policy: Default::default(), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -843,6 +859,7 @@ mod tests { last_aux_file_policy: Default::default(), import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -907,6 +924,7 @@ mod tests { archived_at: None, import_pgdata: None, rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -984,6 +1002,7 @@ mod tests { idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: None, + l2_lsn: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -1062,6 +1081,87 @@ mod tests { idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), }))), rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn 
v12_l2_lsn_is_parsed() { + let example = r#"{ + "version": 12, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy", + "l2_lsn": "0/16960E8" + }"#; + + let expected = IndexPart { + version: 12, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: Some("0/16960E8".parse::().unwrap()), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3913637ca071..c24d037dde14 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,18 +10,26 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; +use crate::config::PageServerConf; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use 
pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use std::cmp::{Ordering, Reverse}; +use std::cmp::Ordering; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; +use std::future::Future; use std::ops::Range; +use std::pin::Pin; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::{trace, Instrument}; +use utils::sync::gate::GateGuard; use utils::lsn::Lsn; @@ -78,30 +86,151 @@ pub(crate) enum ValueReconstructSituation { Continue, } +/// On disk representation of a value loaded in a buffer +#[derive(Debug)] +pub(crate) enum OnDiskValue { + /// Unencoded [`Value::Image`] + RawImage(Bytes), + /// Encoded [`Value`]. Can deserialize into an image or a WAL record + WalRecordOrImage(Bytes), +} + /// Reconstruct data accumulated for a single key during a vectored get -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default)] pub(crate) struct VectoredValueReconstructState { - pub(crate) records: Vec<(Lsn, NeonWalRecord)>, - pub(crate) img: Option<(Lsn, Bytes)>, + pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, - situation: ValueReconstructSituation, + pub(crate) situation: ValueReconstructSituation, } -impl VectoredValueReconstructState { - fn get_cached_lsn(&self) -> Option { - self.img.as_ref().map(|img| img.0) +#[derive(Debug)] +pub(crate) struct OnDiskValueIoWaiter { + rx: tokio::sync::oneshot::Receiver, +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum OnDiskValueIo { + /// Traversal identified this IO as required to complete the vectored get. + Required { + num_active_ios: Arc, + tx: tokio::sync::oneshot::Sender, + }, + /// Sparse keyspace reads always read all the values for a given key, + /// even though only the first value is needed. + /// + /// This variant represents the unnecessary IOs for those values at lower LSNs + /// that aren't needed, but are currently still being done. + /// + /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO. + /// We added this explicit representation here so that we can drop + /// unnecessary IO results immediately, instead of buffering them in + /// `oneshot` channels inside [`VectoredValueReconstructState`] until + /// [`VectoredValueReconstructState::collect_pending_ios`] gets called. + Unnecessary, +} + +type OnDiskValueIoResult = Result; + +impl OnDiskValueIo { + pub(crate) fn complete(self, res: OnDiskValueIoResult) { + match self { + OnDiskValueIo::Required { num_active_ios, tx } => { + num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release); + let _ = tx.send(res); + } + OnDiskValueIo::Unnecessary => { + // Nobody cared, see variant doc comment. + } + } } } -impl From for ValueReconstructState { - fn from(mut state: VectoredValueReconstructState) -> Self { - // walredo expects the records to be descending in terms of Lsn - state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitCompletionError { + #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")] + IoDropped, +} + +impl OnDiskValueIoWaiter { + pub(crate) async fn wait_completion(self) -> Result { + // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`. 
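+ // An error here means the `OnDiskValueIo` sender was dropped without calling + // `complete()`, which we surface as `IoDropped` (most likely a sidecar task panic).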
+ self.rx.await.map_err(|_| WaitCompletionError::IoDropped) + } +} - ValueReconstructState { - records: state.records, - img: state.img, +impl VectoredValueReconstructState { + /// # Cancel-Safety + /// + /// Technically it is fine to stop polling this future, but the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not to do it, to make reasoning about the system easier. + pub(crate) async fn collect_pending_ios( + self, + ) -> Result<ValueReconstructState, PageReconstructError> { + use utils::bin_ser::BeSer; + + let mut res = Ok(ValueReconstructState::default()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. + // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf. the comment on IoConcurrency::spawn_io. + for (lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + // Force not bailing early by wrapping the code into a closure. + #[allow(clippy::redundant_closure_call)] + let _: () = (|| { + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + res = Err(PageReconstructError::Other(wait_err.into())); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + // TODO: returning IO error here will fail a compute query. + // Probably not what we want, we're not doing `maybe_fatal_err` + // in the IO futures. + // But it's been like that for a long time, not changing it + // as part of concurrent IO. + // => https://github.com/neondatabase/neon/issues/10454 + res = Err(PageReconstructError::Other(err.into())); + } + (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => { + match Value::des(&buf) { + Ok(Value::WalRecord(rec)) => { + ok.records.push((lsn, rec)); + } + Ok(Value::Image(img)) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + Err(err) => { + res = Err(PageReconstructError::Other(err.into())); + } + } + } + } + })(); } + + res } } @@ -109,7 +238,7 @@ impl From<VectoredValueReconstructState> for ValueReconstructState { pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>, + pub(crate) keys: HashMap<Key, VectoredValueReconstructState>, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -119,27 +248,365 @@ pub(crate) struct ValuesReconstructState { // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, + + pub(crate) io_concurrency: IoConcurrency, + num_active_ios: Arc<AtomicUsize>, +} + +/// The level of IO concurrency to be used on the read path +/// +/// The desired end state is that we always do parallel IO.
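+///
+/// For orientation, a simplified sketch of how `spawn_io` (further down) dispatches
+/// on this enum; see the real code for the exact details:
+/// ```ignore
+/// match io_concurrency {
+///     IoConcurrency::Sequential => fut.await, // run the IO in place
+///     IoConcurrency::SidecarTask { ios_tx, .. } => {
+///         // hand the boxed IO future to the sidecar task,
+///         // which polls all submitted futures concurrently
+///         ios_tx.send(Box::pin(fut)).unwrap();
+///     }
+/// }
+/// ```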
+/// This struct and the dispatching in the impl will be removed once +/// we've built enough confidence. +pub(crate) enum IoConcurrency { + Sequential, + SidecarTask { + task_id: usize, + ios_tx: tokio::sync::mpsc::UnboundedSender<IoFuture>, + }, +} + +type IoFuture = Pin<Box<dyn Send + Future<Output = ()>>>; + +pub(crate) enum SelectedIoConcurrency { + Sequential, + SidecarTask(GateGuard), +} + +impl std::fmt::Debug for IoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IoConcurrency::Sequential => write!(f, "Sequential"), + IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"), + } + } +} + +impl std::fmt::Debug for SelectedIoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SelectedIoConcurrency::Sequential => write!(f, "Sequential"), + SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"), + } + } +} + +impl IoConcurrency { + /// Force sequential IO. This is a temporary workaround until we have + /// moved plumbing-through-the-call-stack + /// of IoConcurrency into `RequestContext`. + /// + /// DO NOT USE for new code. + /// + /// Tracking issue: . + pub(crate) fn sequential() -> Self { + Self::spawn(SelectedIoConcurrency::Sequential) + } + + pub(crate) fn spawn_from_conf( + conf: &'static PageServerConf, + gate_guard: GateGuard, + ) -> IoConcurrency { + use pageserver_api::config::GetVectoredConcurrentIo; + let selected = match conf.get_vectored_concurrent_io { + GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, + GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), + }; + Self::spawn(selected) + } + + pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self { + match io_concurrency { + SelectedIoConcurrency::Sequential => IoConcurrency::Sequential, + SelectedIoConcurrency::SidecarTask(gate_guard) => { + let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel(); + static TASK_ID: AtomicUsize = AtomicUsize::new(0); + let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...) + let span = + tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id); + trace!(task_id, "spawning sidecar task"); + tokio::spawn(async move { + trace!("start"); + scopeguard::defer!{ trace!("end") }; + type IosRx = tokio::sync::mpsc::UnboundedReceiver<IoFuture>; + enum State { + Waiting { + // invariant: is_empty(), but we recycle the allocation + empty_futures: FuturesUnordered<IoFuture>, + ios_rx: IosRx, + }, + Executing { + futures: FuturesUnordered<IoFuture>, + ios_rx: IosRx, + }, + ShuttingDown { + futures: FuturesUnordered<IoFuture>, + }, + } + let mut state = State::Waiting { + empty_futures: FuturesUnordered::new(), + ios_rx, + }; + loop { + match state { + State::Waiting { + empty_futures, + mut ios_rx, + } => { + assert!(empty_futures.is_empty()); + tokio::select! { + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + empty_futures.push(fut); + state = State::Executing { futures: empty_futures, ios_rx }; + } else { + state = State::ShuttingDown { futures: empty_futures } + } + } + } + } + State::Executing { + mut futures, + mut ios_rx, + } => { + tokio::select!
{ + res = futures.next() => { + trace!("io future completed"); + assert!(res.is_some()); + if futures.is_empty() { + state = State::Waiting { empty_futures: futures, ios_rx}; + } else { + state = State::Executing { futures, ios_rx }; + } + } + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + futures.push(fut); + state = State::Executing { futures, ios_rx}; + } else { + state = State::ShuttingDown { futures }; + } + } + } + } + State::ShuttingDown { + mut futures, + } => { + trace!("shutting down"); + while let Some(()) = futures.next().await { + trace!("io future completed (shutdown)"); + // drain + } + trace!("shutdown complete"); + break; + } + } + } + drop(gate_guard); // drop it right before we exit + }.instrument(span)); + IoConcurrency::SidecarTask { task_id, ios_tx } + } + } + } + + pub(crate) fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } + + /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. + /// + /// The IO is represented as an opaque future. + /// IO completion must be handled inside the future, e.g., through a oneshot channel. + /// + /// The API seems simple but there are multiple **pitfalls** involving + /// DEADLOCK RISK. + /// + /// First, there are no guarantees about the execution of the IO. + /// It may be `await`ed in-place before this function returns. + /// It may be polled partially by this task and handed off to another task to be finished. + /// It may be polled and then dropped before returning ready. + /// + /// This means that submitted IOs must not be interdependent. + /// Interdependence may be through shared limited resources, e.g., + /// - VirtualFile file descriptor cache slot acquisition + /// - tokio-epoll-uring slot + /// + /// # Why current usage is safe from deadlocks + /// + /// Textbook condition for a deadlock is that _all_ of the following be given + /// - Mutual exclusion + /// - Hold and wait + /// - No preemption + /// - Circular wait + /// + /// The current usage is safe because: + /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now + /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting + /// for acquisition of other resources: + /// - VirtualFile file descriptor cache slot tokio mutex + /// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex) + /// - No preemption: there's no taking-away of acquired locks/resources => given + /// - Circular wait: this is the part of the condition that isn't met: all IO futures + /// first acquire VirtualFile mutex, then tokio-epoll-uring slot. + /// There is no IO future that acquires slot before VirtualFile. + /// Hence there can be no circular waiting. + /// Hence there cannot be a deadlock. + /// + /// This is a very fragile situation and must be revisited whenever any code called from + /// inside the IO futures is changed. + /// + /// We will move away from opaque IO futures towards well-defined IOs at some point in + /// the future when we have shipped this first version of concurrent IO to production + /// and are ready to retire the Sequential mode which runs the futures in place.
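+ ///
+ /// For illustration, a hypothetical submission that reports its result through a
+ /// oneshot channel, the way the read paths in this change use the API
+ /// (`read_the_value` is a stand-in, not a real function):
+ /// ```ignore
+ /// let (tx, rx) = tokio::sync::oneshot::channel();
+ /// io_concurrency.spawn_io(async move {
+ ///     let res = read_the_value().await; // must not depend on other submitted IOs
+ ///     let _ = tx.send(res);
+ /// }).await;
+ /// // ... `rx` is awaited later, e.g. by `collect_pending_ios`.
+ /// ```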
+ /// Right now, while brittle, the opaque IO approach allows us to ship the feature + /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode. + /// + /// Also read the comment in `collect_pending_ios`. + pub(crate) async fn spawn_io<F>(&mut self, fut: F) + where + F: std::future::Future<Output = ()> + Send + 'static, + { + match self { + IoConcurrency::Sequential => fut.await, + IoConcurrency::SidecarTask { ios_tx, .. } => { + let fut = Box::pin(fut); + // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput + // while insignificant for latency. + // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try + // a submission here, but never poll the future. That way, io_uring can make progress while + // the future sits in the ios_tx queue. + match ios_tx.send(fut) { + Ok(()) => {} + Err(_) => { + unreachable!("the io task must have exited, likely it panicked") + } + } + } + } + } + + #[cfg(test)] + pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut<Target = IoConcurrency> { + use std::ops::{Deref, DerefMut}; + use tracing::info; + use utils::sync::gate::Gate; + + // Spawn needs a Gate, give it one. + struct Wrapper { + inner: IoConcurrency, + #[allow(dead_code)] + gate: Box<Gate>, + } + impl Deref for Wrapper { + type Target = IoConcurrency; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + impl DerefMut for Wrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } + } + let gate = Box::new(Gate::default()); + + // The default behavior when running Rust unit tests without any further + // flags is to use the new behavior. + // The CI uses the following environment variable to unit test both old + // and new behavior. + // NB: the Python regression & perf tests take the `else` branch + // below and have their own defaults management. + let selected = { + // The pageserver_api::config type is unsuitable because it's internally tagged. + #[derive(serde::Deserialize)] + #[serde(rename_all = "kebab-case")] + enum TestOverride { + Sequential, + SidecarTask, + } + use once_cell::sync::Lazy; + static TEST_OVERRIDE: Lazy<TestOverride> = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO", + ) + .unwrap_or(TestOverride::SidecarTask) + }); + + match *TEST_OVERRIDE { + TestOverride::Sequential => SelectedIoConcurrency::Sequential, + TestOverride::SidecarTask => { + SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it")) + } + } + }; + + info!(?selected, "get_vectored_concurrent_io test"); + + Wrapper { + inner: Self::spawn(selected), + gate, + } + } +} + +/// Make noise in case the [`ValuesReconstructState`] gets dropped while +/// there are still IOs in flight. +/// Refer to `collect_pending_ios` for why we prefer not to do that. +/// +/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`] +/// gets dropped in a tracing span with more context. +/// We repeat the sidecar task's `task_id` so we can correlate what we emit here with +/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`. +impl Drop for ValuesReconstructState { + fn drop(&mut self) { + let num_active_ios = self + .num_active_ios + .load(std::sync::atomic::Ordering::Acquire); + if num_active_ios == 0 { + return; + } + let sidecar_task_id = match &self.io_concurrency { + IoConcurrency::Sequential => None, + IoConcurrency::SidecarTask { task_id, ..
} => Some(*task_id), + }; + tracing::warn!( + num_active_ios, + ?sidecar_task_id, + backtrace=%std::backtrace::Backtrace::force_capture(), + "dropping ValuesReconstructState while some IOs have not been completed", + ); + } } impl ValuesReconstructState { - pub(crate) fn new() -> Self { + pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, + io_concurrency, + num_active_ios: Arc::new(AtomicUsize::new(0)), } } - /// Associate a key with the error which it encountered and mark it as done - pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { - let previous = self.keys.insert(key, Err(err)); - if let Some(Ok(state)) = previous { - if state.situation == ValueReconstructSituation::Continue { - self.keys_done.add_key(key); - } - } + /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls. + pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + self.io_concurrency.spawn_io(fut).await; } pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { @@ -159,29 +626,6 @@ impl ValuesReconstructState { self.layers_visited } - /// This function is called after reading a keyspace from a layer. - /// It checks if the read path has now moved past the cached Lsn for any keys. - /// - /// Implementation note: We intentionally iterate over the keys for which we've - /// already collected some reconstruct data. This avoids scaling complexity with - /// the size of the search space. - pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { - for (key, value) in self.keys.iter_mut() { - if !keyspace.contains(key) { - continue; - } - - if let Ok(state) = value { - if state.situation != ValueReconstructSituation::Complete - && state.get_cached_lsn() >= Some(advanced_to) - { - state.situation = ValueReconstructSituation::Complete; - self.keys_done.add_key(*key); - } - } - } - } - /// On hitting image layer, we can mark all keys in this range as done, because /// if the image layer does not contain a key, it is deleted/never added. pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { @@ -199,70 +643,42 @@ impl ValuesReconstructState { /// /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in /// `key_done`. - pub(crate) fn update_key( - &mut self, - key: &Key, - lsn: Lsn, - value: Value, - ) -> ValueReconstructSituation { - let state = self - .keys - .entry(*key) - .or_insert(Ok(VectoredValueReconstructState::default())); + // TODO: rename this method & update description. + pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo { + let state = self.keys.entry(*key).or_default(); + let is_sparse_key = key.is_sparse(); - if let Ok(state) = state { - let key_done = match state.situation { - ValueReconstructSituation::Complete => { - if is_sparse_key { - // Sparse keyspace might be visited multiple times because - // we don't track unmapped keyspaces. 
- return ValueReconstructSituation::Complete; - } else { - unreachable!() - } - } - ValueReconstructSituation::Continue => match value { - Value::Image(img) => { - state.img = Some((lsn, img)); - true - } - Value::WalRecord(rec) => { - debug_assert!( - Some(lsn) > state.get_cached_lsn(), - "Attempt to collect a record below cached LSN for walredo: {} < {}", - lsn, - state - .get_cached_lsn() - .expect("Assertion can only fire if a cached lsn is present") - ); - - let will_init = rec.will_init(); - state.records.push((lsn, rec)); - will_init - } - }, - }; - if key_done && state.situation == ValueReconstructSituation::Continue { - state.situation = ValueReconstructSituation::Complete; - if !is_sparse_key { - self.keys_done.add_key(*key); + let required_io = match state.situation { + ValueReconstructSituation::Complete => { + if is_sparse_key { + // Sparse keyspace might be visited multiple times because + // we don't track unmapped keyspaces. + return OnDiskValueIo::Unnecessary; + } else { + unreachable!() } } + ValueReconstructSituation::Continue => { + self.num_active_ios + .fetch_add(1, std::sync::atomic::Ordering::Release); + let (tx, rx) = tokio::sync::oneshot::channel(); + state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx })); + OnDiskValueIo::Required { + tx, + num_active_ios: Arc::clone(&self.num_active_ios), + } + } + }; - state.situation - } else { - ValueReconstructSituation::Complete + if completes && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + if !is_sparse_key { + self.keys_done.add_key(*key); + } } - } - /// Returns the Lsn at which this key is cached if one exists. - /// The read path should go no further than this Lsn for the given key. - pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { - self.keys - .get(key) - .and_then(|k| k.as_ref().ok()) - .and_then(|state| state.get_cached_lsn()) + required_io } /// Returns the key space describing the keys that have @@ -276,12 +692,6 @@ impl ValuesReconstructState { } } -impl Default for ValuesReconstructState { - fn default() -> Self { - Self::new() - } -} - /// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub(crate) enum LayerId { @@ -720,3 +1130,78 @@ impl std::fmt::Debug for RangeDisplayDebug<'_, T> { write!(f, "{}..{}", self.0.start, self.0.end) } } + +#[cfg(test)] +mod tests2 { + use pageserver_api::key::DBDIR_KEY; + use tracing::info; + + use super::*; + use crate::tenant::storage_layer::IoConcurrency; + + /// TODO: currently this test relies on manual visual inspection of the --no-capture output. 
+ /// Should look like so: + /// ```text + /// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise + /// running 1 test + /// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask + /// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0 + /// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start + /// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future + /// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO + /// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: ::drop + /// at ./src/tenant/storage_layer.rs:553:24 + /// 1: core::ptr::drop_in_place + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1 + /// 2: core::mem::drop + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24 + /// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}} + /// at ./src/tenant/storage_layer.rs:1159:9 + /// ... + /// 49: + /// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO + /// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed + /// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end + /// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok + /// + /// ``` + #[tokio::test] + async fn test_io_concurrency_noise() { + crate::tenant::harness::setup_logging(); + + let io_concurrency = IoConcurrency::spawn_for_test(); + match *io_concurrency { + IoConcurrency::Sequential => { + // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode. + return; + } + IoConcurrency::SidecarTask { .. 
} => {} + } + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel(); + let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel(); + let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel(); + + let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true); + reconstruct_state + .spawn_io(async move { + info!("waiting for signal to complete IO"); + io_fut_is_waiting_tx.send(()).unwrap(); + should_complete_io.await.unwrap(); + info!("completing IO"); + io.complete(Ok(OnDiskValue::RawImage(Bytes::new()))); + io_fut_exiting_tx.send(()).unwrap(); + }) + .await; + + io_fut_is_waiting.await.unwrap(); + + // this is what makes the noise + drop(reconstruct_state); + + do_complete_io.send(()).unwrap(); + + io_fut_exiting.await.unwrap(); + } +} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ade1b794c65d..885c50425f00 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -41,13 +41,12 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -60,7 +59,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -77,7 +76,10 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -226,7 +228,7 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - file: VirtualFile, + file: Arc, file_id: FileId, layer_key_range: Range, @@ -795,9 +797,11 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); @@ -842,12 +846,11 @@ impl DeltaLayerInner { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // - // If the key is cached, go no further than the cached Lsn. - // // Currently, the index is visited for each range, but this // can be further optimised to visit the index only once. 
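+ //
+ // With concurrent IO, this method no longer performs the reads itself: it plans
+ // the vectored reads, registers an `OnDiskValueIo` per (key, lsn) via
+ // `update_key`, and executes the reads inside `reconstruct_state.spawn_io`,
+ // which reports each result through the IO's oneshot channel.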
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, @@ -875,17 +878,14 @@ impl DeltaLayerInner { data_end_offset, index_reader, planner, - reconstruct_state, ctx, ) .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; - reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); - Ok(()) } @@ -895,7 +895,6 @@ impl DeltaLayerInner { data_end_offset: u64, index_reader: DiskBtreeReader, mut planner: VectoredReadPlanner, - reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> anyhow::Result> where @@ -922,10 +921,9 @@ impl DeltaLayerInner { assert!(key >= range.start); let outside_lsn_range = !lsn_range.contains(&lsn); - let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); let flag = { - if outside_lsn_range || below_cached_lsn { + if outside_lsn_range { BlobFlag::Ignore } else if blob_ref.will_init() { BlobFlag::ReplaceAll @@ -994,98 +992,78 @@ impl DeltaLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) { - let vectored_blob_reader = VectoredBlobReader::new(&self.file); - let mut ignore_key_with_err = None; - let max_vectored_read_bytes = self .max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can // track when a key is done. for read in reads.into_iter().rev() { - let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) - .await; - - let blobs_buf = match res { - Ok(blobs_buf) => blobs_buf, - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::Other(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - - // We have "lost" the buffer since the lower level IO api - // doesn't return the buffer on error. Allocate a new one. 
- buf = Some(IoBufferMut::with_capacity(buf_size)); - - continue; - } - }; - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter().rev() { - if Some(meta.meta.key) == ignore_key_with_err { - continue; - } - let blob_read = meta.read(&view).await; - let blob_read = match blob_read { - Ok(buf) => buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() { + let io = reconstruct_state.update_key( + &blob_meta.key, + blob_meta.lsn, + blob_meta.will_init, + ); + ios.insert((blob_meta.key, blob_meta.lsn), io); + } - let value = Value::des(&blob_read); - - let value = match value { - Ok(v) => v, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to deserialize blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let buf = IoBufferMut::with_capacity(buf_size); + + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter().rev() { + let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; + + io.complete(Ok(OnDiskValue::WalRecordOrImage( + blob_read.into_bytes(), + ))); + } + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, sender) in ios { + sender.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } } - }; - // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] - // state, no further updates shall be made to it. The call below will - // panic if the invariant is violated. 
- reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); - } - - buf = Some(blobs_buf.buf); + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1224,7 +1202,14 @@ impl DeltaLayerInner { let actionable = if let Some((key, lsn, start_offset)) = prev.take() { let end_offset = offset; - Some((BlobMeta { key, lsn }, start_offset..end_offset)) + Some(( + BlobMeta { + key, + lsn, + will_init: false, + }, + start_offset..end_offset, + )) } else { None }; @@ -1560,7 +1545,9 @@ impl DeltaLayerIterator<'_> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { + if let Some(batch_plan) = + self.planner.handle(key, lsn, offset, blob_ref.will_init()) + { break batch_plan; } } else { @@ -1673,7 +1660,6 @@ pub(crate) mod test { .expect("In memory disk finish should never fail"); let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); let planner = VectoredReadPlanner::new(100); - let mut reconstruct_state = ValuesReconstructState::new(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let keyspace = KeySpace { @@ -1691,7 +1677,6 @@ pub(crate) mod test { disk_offset, reader, planner, - &mut reconstruct_state, &ctx, ) .await @@ -1935,7 +1920,6 @@ pub(crate) mod test { ); let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); - let mut reconstruct_state = ValuesReconstructState::new(); let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; @@ -1945,7 +1929,6 @@ pub(crate) mod test { data_end_offset, index_reader, planner, - &mut reconstruct_state, &ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 0d3c9d5a44ca..c49281dc4535 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,12 +38,11 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; @@ -56,12 +55,13 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::str::FromStr; +use std::sync::Arc; use tokio::sync::OnceCell; use tokio_stream::StreamExt; use tracing::*; @@ -73,7 +73,10 @@ use utils::{ }; use super::layer_name::ImageLayerName; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, 
ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -164,7 +167,7 @@ pub struct ImageLayerInner { key_range: Range, lsn: Lsn, - file: VirtualFile, + file: Arc, file_id: FileId, max_vectored_read_bytes: Option, @@ -391,9 +394,11 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader @@ -439,6 +444,7 @@ impl ImageLayerInner { // the reconstruct state with whatever is found. pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -448,7 +454,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; reconstruct_state.on_image_layer_visited(&self.key_range); @@ -570,6 +576,7 @@ impl ImageLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -580,8 +587,13 @@ impl ImageLayerInner { .0 .into(); - let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice() { + let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true); + ios.insert((blob_meta.key, blob_meta.lsn), io); + } + let buf_size = read.size(); if buf_size > max_vectored_read_bytes { @@ -611,50 +623,51 @@ impl ImageLayerInner { } } - let buf = IoBufferMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; - - match res { - Ok(blobs_buf) => { - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter() { - let img_buf = meta.read(&view).await; - - let img_buf = match img_buf { - Ok(img_buf) => img_buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); - - continue; + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let buf = IoBufferMut::with_capacity(buf_size); + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; + + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter() { + let io: OnDiskValueIo = + ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + let img_buf = meta.read(&view).await; + + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; + + io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes()))); } - }; - reconstruct_state.update_key( - &meta.meta.key, - self.lsn, - Value::Image(img_buf.into_bytes()), - ); - } - } - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - 
reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::from(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, io) in ios { + io.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } } - } - }; + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1069,6 +1082,7 @@ impl ImageLayerIterator<'_> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, + true, ) { break batch_plan; } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2b67f55a17b4..61a0fdea8ca2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,23 +8,22 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; use crate::tenant::timeline::GetVectoredError; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::Result; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; +use utils::{id::TimelineId, lsn::Lsn, vec_map::VecMap}; use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods @@ -36,9 +35,7 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::RwLock; -use super::{ - DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, -}; +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; pub(crate) mod vectored_dio_read; @@ -415,10 +412,8 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - // - // If the key is cached, go no further than the cached Lsn. 
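+ // As with the on-disk layers, the reads themselves now run inside
+ // `reconstruct_state.spawn_io`: each value is delivered through the
+ // `OnDiskValueIo` oneshot obtained from `update_key`, and the layer lock is
+ // released before the IO future is spawned and re-acquired inside it.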
pub(crate) async fn get_values_reconstruct_data( - &self, + self: &Arc, keyspace: KeySpace, end_lsn: Lsn, reconstruct_state: &mut ValuesReconstructState, @@ -435,6 +430,9 @@ impl InMemoryLayer { read: vectored_dio_read::LogicalRead>, } let mut reads: HashMap> = HashMap::new(); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + + let lsn_range = self.start_lsn..end_lsn; for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -442,12 +440,7 @@ impl InMemoryLayer { .range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; - - let slice = vec_map.slice_range(lsn_range); + let slice = vec_map.slice_range(lsn_range.clone()); for (entry_lsn, index_entry) in slice.iter().rev() { let IndexEntryUnpacked { @@ -463,55 +456,59 @@ impl InMemoryLayer { Vec::with_capacity(len as usize), ), }); + + let io = reconstruct_state.update_key(&key, *entry_lsn, will_init); + ios.insert((key, *entry_lsn), io); + if will_init { break; } } } } - - // Execute the reads. - - let f = vectored_dio_read::execute( - &inner.file, - reads - .iter() - .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), - &ctx, - ); - send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 - .await; - - // Process results into the reconstruct state - 'next_key: for (key, value_reads) in reads { - for ValueRead { entry_lsn, read } in value_reads { - match read.into_result().expect("we run execute() above") { - Err(e) => { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; - } - Ok(value_buf) => { - let value = Value::des(&value_buf); - if let Err(e) = value { - reconstruct_state - .on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; - } - - let key_situation = - reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - // TODO: metric to see if we fetched more values than necessary - continue 'next_key; + drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + let read_from = Arc::clone(self); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let inner = read_from.inner.read().await; + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &read_ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; + + for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + let io = ios.remove(&(key, entry_lsn)).expect("sender must exist"); + match read.into_result().expect("we run execute() above") { + Err(e) => { + io.complete(Err(std::io::Error::new( + e.kind(), + "dio vec read failed", + ))); + } + Ok(value_buf) => { + io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into()))); + } } - - // process the next value in the next iteration of the loop } } - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + assert!(ios.is_empty()); + + // Keep layer existent until this IO is done; + // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, + // but it's less obvious for DeltaLayer and ImageLayer. 
So, keep this explicit + // drop for consistency among all three layer types. + drop(inner); + drop(read_from); + }) + .await; Ok(()) } @@ -606,6 +603,7 @@ impl InMemoryLayer { // Write the batch to the file inner.file.write_raw(&raw, ctx).await?; let new_size = inner.file.len(); + let expected_new_len = base_offset .checked_add(raw.len().into_u64()) // write_raw would error if we were to overflow u64. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b06c88e8b2b..2a86885f6b57 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -308,7 +308,7 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let layer = self + let downloaded = self .0 .get_or_maybe_download(true, Some(ctx)) .await @@ -318,11 +318,15 @@ impl Layer { } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; + let this = ResidentLayer { + downloaded: downloaded.clone(), + owner: self.clone(), + }; self.record_access(ctx); - layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + downloaded + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await .map_err(|err| match err { @@ -1768,25 +1772,25 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, - owner: &Arc, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { use LayerKind::*; match self - .get(owner, ctx) + .get(&this.owner.0, ctx) .await .map_err(GetVectoredError::Other)? 
{ Delta(d) => { - d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { - i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index fcb73ad20d59..d93c378ffc44 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -11,7 +11,10 @@ use super::failpoints::{Failpoint, FailpointKind}; use super::*; use crate::{ context::DownloadBehavior, - tenant::{harness::test_img, storage_layer::LayerVisibilityHint}, + tenant::{ + harness::test_img, + storage_layer::{IoConcurrency, LayerVisibilityHint}, + }, }; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; @@ -31,6 +34,7 @@ async fn smoke_test() { let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); @@ -89,7 +93,7 @@ async fn smoke_test() { }; let img_before = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -99,10 +103,13 @@ async fn smoke_test() { ) .await .unwrap(); + data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -121,7 +128,7 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. 
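+ // Note: with concurrent IO, `get_values_reconstruct_data` only queues the reads;
+ // the test awaits them via `collect_pending_ios` below before inspecting the image.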
let img_after = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -135,7 +142,9 @@ async fn smoke_test() { data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 0118a5ce5f9d..3725e2f7fcbc 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -67,10 +67,9 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit( ) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); - pausable_failpoint!( - "initial-size-calculation-permit-pause", - loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation - ); + if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { + pausable_failpoint!("initial-size-calculation-permit-pause"); + } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5f4272fb2b8b..5d348ac4746b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,6 +20,7 @@ use camino::Utf8Path; use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; +use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; @@ -60,6 +61,7 @@ use utils::{ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::OnceLock; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use std::{ @@ -72,8 +74,8 @@ use std::{ collections::btree_map::Entry, ops::{Deref, Range}, }; -use std::{pin::pin, sync::OnceLock}; +use crate::l0_flush::{self, L0FlushGlobalState}; use crate::{ aux_file::AuxFileSizeEstimator, page_service::TenantManagerTypes, @@ -81,7 +83,10 @@ use crate::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, + storage_layer::{ + inmemory_layer::IndexEntry, IoConcurrency, PersistentLayerDesc, + ValueReconstructSituation, + }, }, walingest::WalLagCooldown, walredo, @@ -102,10 +107,6 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; -use crate::{ - l0_flush::{self, L0FlushGlobalState}, - metrics::GetKind, -}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -1005,9 +1006,7 @@ impl Timeline { ranges: vec![key..key.next()], }; - // Initialise the reconstruct state for the key with the cache - // entry returned above. 
- let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) @@ -1050,6 +1049,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> { if !lsn.is_valid() { @@ -1084,7 +1084,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1109,6 +1109,7 @@ impl Timeline { keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: super::storage_layer::IoConcurrency, ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(lsn)); } @@ -1140,7 +1141,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1159,39 +1160,56 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> { - let get_kind = if keyspace.total_raw_size() == 1 { - GetKind::Singular - } else { - GetKind::Vectored + let traversal_res: Result<(), _> = self + .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + if let Err(err) = traversal_res { + // Wait for all the spawned IOs to complete. + // See comments on `spawn_io` inside `storage_layer` for more details. + let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.collect_pending_ios()) + .collect::<FuturesUnordered<_>>(); + while collect_futs.next().await.is_some() {} + return Err(err); }; - let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(get_kind) - .start_timer(); - self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await?; - get_data_timer.stop_and_record(); - - let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME - .for_get_kind(get_kind) - .start_timer(); - let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in std::mem::take(&mut reconstruct_state.keys) { - match res { - Err(err) => { - results.insert(key, Err(err)); - } - Ok(state) => { - let state = ValueReconstructState::from(state); + let futs = FuturesUnordered::new(); + for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + futs.push({ + let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + async move { + assert_eq!(state.situation, ValueReconstructSituation::Complete); + + let converted = match state.collect_pending_ios().await { + Ok(ok) => ok, + Err(err) => { + return (key, Err(err)); + } + }; + + // The walredo module expects the records to be descending in terms of Lsn. + // And we submit the IOs in that order, so there should be no need to sort here.
+ debug_assert!( + converted + .records + .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), + "{converted:?}" + ); - let reconstruct_res = self.reconstruct_value(key, lsn, state).await; - results.insert(key, reconstruct_res); + ( + key, + walredo_self.reconstruct_value(key, lsn, converted).await, + ) } - } + }); } - reconstruct_timer.stop_and_record(); + + let results = futs + .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>() + .await; // For aux file keys (v1 or v2) the vectored read path does not return an error // when they're missing. Instead they are omitted from the resulting btree @@ -2804,12 +2822,10 @@ impl Timeline { "initial size calculation", // NB: don't log errors here, task_mgr will do that. async move { - let cancel = task_mgr::shutdown_token(); self_clone .initial_logical_size_calculation_task( initial_part_end, cancel_wait_for_background_loop_concurrency_limit_semaphore, - cancel, background_ctx, ) .await; @@ -2819,11 +2835,21 @@ impl Timeline { ); } + /// # Cancellation + /// + /// This method is sensitive to `Timeline::cancel`. + /// + /// It is _not_ sensitive to `task_mgr::shutdown_token()`. + /// + /// # Cancel-Safety + /// + /// It does Timeline IO, hence it should be polled to completion; otherwise + /// we could be leaving in-flight IOs behind, which is safe but annoying + /// to reason about. async fn initial_logical_size_calculation_task( self: Arc<Self>, initial_part_end: Lsn, skip_concurrency_limiter: CancellationToken, - cancel: CancellationToken, background_ctx: RequestContext, ) { scopeguard::defer! { @@ -2836,7 +2862,6 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let cancel = task_mgr::shutdown_token(); let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, @@ -2850,9 +2875,6 @@ impl Timeline { _ = self_ref.cancel.cancelled() => { return Err(CalculateLogicalSizeError::Cancelled); } - _ = cancel.cancelled() => { - return Err(CalculateLogicalSizeError::Cancelled); - }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of an end user interaction requested logical size // => break out of the rate limit @@ -2869,6 +2891,14 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; + let io_concurrency = IoConcurrency::spawn_from_conf( + self_ref.conf, + self_ref + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?, + ); + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, @@ -2878,7 +2908,11 @@ impl Timeline { .await?; self_ref - .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .trigger_aux_file_size_computation( + initial_part_end, + background_ctx, + io_concurrency, + ) .await?; // TODO: add aux file size to logical size @@ -2911,22 +2945,18 @@ impl Timeline { ) .expect("10min < 1hour"), ); - tokio::time::sleep(sleep_duration).await; + tokio::select! { + _ = tokio::time::sleep(sleep_duration) => {} + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + } } } } }; - let (calculated_size, metrics_guard) = tokio::select!
{ - res = retrying => { - match res { - ControlFlow::Continue(calculated_size) => calculated_size, - ControlFlow::Break(()) => return, - } - } - _ = cancel.cancelled() => { - return; - } + let (calculated_size, metrics_guard) = match retrying.await { + ControlFlow::Continue(calculated_size) => calculated_size, + ControlFlow::Break(()) => return, }; // we cannot query current_logical_size.current_size() to know the current @@ -2982,9 +3012,6 @@ impl Timeline { receiver } - /// # Cancel-Safety - /// - /// This method is cancellation-safe. #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc<Self>, @@ -3002,32 +3029,13 @@ impl Timeline { .enter() .map_err(|_| CalculateLogicalSizeError::Cancelled)?; - let self_calculation = Arc::clone(self); - - let mut calculation = pin!(async { - let ctx = ctx.attached_child(); - self_calculation - .calculate_logical_size(lsn, cause, &guard, &ctx) - .await - }); - - tokio::select! { - res = &mut calculation => { res } - _ = self.cancel.cancelled() => { - debug!("cancelling logical size calculation for timeline shutdown"); - calculation.await - } - } + self.calculate_logical_size(lsn, cause, &guard, ctx).await } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. async fn calculate_logical_size( &self, up_to_lsn: Lsn, @@ -3040,7 +3048,10 @@ impl Timeline { self.timeline_id, up_to_lsn ); - pausable_failpoint!("timeline-calculate-logical-size-pause"); + if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel) + { + return Err(CalculateLogicalSizeError::Cancelled); + } // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. @@ -3617,6 +3628,12 @@ impl Timeline { return; } + // Break to notify potential waiters as soon as we've flushed the requested LSN. If + // more requests have arrived in the meantime, we'll resume flushing afterwards. + if flushed_to_lsn >= frozen_to_lsn { + break Ok(()); + } + let timer = self.metrics.flush_time_histo.start_timer(); let num_frozen_layers; @@ -4128,6 +4145,7 @@ impl Timeline { /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, /// so that at most one image layer will be produced from this function. + #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_rel_blocks( self: &Arc<Self>, partition: &KeySpace, @@ -4136,6 +4154,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range<Key>, start: Key, + io_concurrency: IoConcurrency, ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> { let mut wrote_keys = false; @@ -4164,7 +4183,12 @@ impl Timeline { || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .get_vectored( + key_request_accum.consume_keyspace(), + lsn, + io_concurrency.clone(), + ctx, + ) .await?; if self.cancel.is_cancelled() { @@ -4243,9 +4267,10 @@ img_range: Range<Key>, mode: ImageLayerCreationMode, start: Key, + io_concurrency: IoConcurrency, ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> { // Metadata keys image layer creation.
- let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) @@ -4462,6 +4487,13 @@ impl Timeline { ))) }); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| CreateImageLayersError::Cancelled)?, + ); + if !compact_metadata { let ImageLayerCreationOutcome { image, @@ -4474,6 +4506,7 @@ impl Timeline { ctx, img_range, start, + io_concurrency, ) .await?; @@ -4492,6 +4525,7 @@ impl Timeline { img_range, mode, start, + io_concurrency, ) .await?; start = next_start_key; @@ -5759,13 +5793,14 @@ impl Timeline { self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result<Vec<(Key, Bytes)>> { let mut all_data = Vec::new(); let guard = self.layers.read().await; for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); - let mut reconstruct_data = ValuesReconstructState::default(); + let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( KeySpace::single(Key::MIN..Key::MAX), @@ -5774,8 +5809,9 @@ impl Timeline { ctx, ) .await?; - for (k, v) in reconstruct_data.keys { - all_data.push((k, v?.img.unwrap().1)); + for (k, v) in std::mem::take(&mut reconstruct_data.keys) { + let v = v.collect_pending_ios().await?; + all_data.push((k, v.img.unwrap().1)); } } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 06a21f6b3c42..57fc415d0674 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,8 +42,8 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; @@ -3170,6 +3170,7 @@ impl TimelineAdaptor { ctx, key_range.clone(), start, + IoConcurrency::sequential(), ) .await?; diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index dfe2352310bd..47fb4a276bc6 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -35,6 +35,7 @@ use crate::virtual_file::{self, VirtualFile}; pub struct BlobMeta { pub key: Key, pub lsn: Lsn, + pub will_init: bool, } /// A view into the vectored blobs read buffer. @@ -310,7 +311,15 @@ pub enum BlobFlag { /// * Iterate over the collected blobs and coalesce them into reads at the end pub struct VectoredReadPlanner { // Track all the blob offsets. Start offsets must be ordered.
- blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>, + // Values in the value tuples are: + // ( + // lsn of the blob, + // start offset of the blob in the underlying file, + // end offset of the blob in the underlying file, + // whether the blob initializes the page image or not; + // see [`pageserver_api::record::NeonWalRecord::will_init`] + // ) + blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>, // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] prev: Option<(Key, Lsn, u64, BlobFlag)>, @@ -371,12 +380,12 @@ impl VectoredReadPlanner { match flag { BlobFlag::None => { let blobs_for_key = self.blobs.entry(key).or_default(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, false)); } BlobFlag::ReplaceAll => { let blobs_for_key = self.blobs.entry(key).or_default(); blobs_for_key.clear(); - blobs_for_key.push((lsn, start_offset, end_offset)); + blobs_for_key.push((lsn, start_offset, end_offset, true)); } BlobFlag::Ignore => {} } @@ -387,11 +396,17 @@ impl VectoredReadPlanner { let mut reads = Vec::new(); for (key, blobs_for_key) in self.blobs { - for (lsn, start_offset, end_offset) in blobs_for_key { + for (lsn, start_offset, end_offset, will_init) in blobs_for_key { let extended = match &mut current_read_builder { - Some(read_builder) => { - read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) - } + Some(read_builder) => read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ), None => VectoredReadExtended::No, }; @@ -399,7 +414,11 @@ impl VectoredReadPlanner { let next_read_builder = ChunkedVectoredReadBuilder::new( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, self.max_read_size, ); @@ -527,7 +546,7 @@ impl<'a> VectoredBlobReader<'a> { pub struct StreamingVectoredReadPlanner { read_builder: Option<ChunkedVectoredReadBuilder>, // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`] - prev: Option<(Key, Lsn, u64)>, + prev: Option<(Key, Lsn, u64, bool)>, /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150, /// we will produce a single batch instead of splitting them.
max_read_size: u64, @@ -550,27 +569,47 @@ impl StreamingVectoredReadPlanner { } } - pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> { + pub fn handle( + &mut self, + key: Key, + lsn: Lsn, + offset: u64, + will_init: bool, + ) -> Option<VectoredRead> { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] - let (prev_key, prev_lsn, prev_offset) = match self.prev { + let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev { None => { - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); return None; } Some(prev) => prev, }; - let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + let res = self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + false, + prev_will_init, + ); - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); res } pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> { - let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { - self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev { + self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + true, + prev_will_init, + ) } else { None }; @@ -587,10 +626,19 @@ impl StreamingVectoredReadPlanner { start_offset: u64, end_offset: u64, is_last_blob_in_read: bool, + will_init: bool, ) -> Option<VectoredRead> { match &mut self.read_builder { Some(read_builder) => { - let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + let extended = read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ); assert_eq!(extended, VectoredReadExtended::Yes); } None => { @@ -598,7 +646,11 @@ impl StreamingVectoredReadPlanner { Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, )) }; } @@ -812,7 +864,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -850,7 +902,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -875,7 +927,7 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 0, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); } { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 2); validate_read(&reads[0],
&[(key, lsn, 0, BlobFlag::None)]); @@ -893,8 +945,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read( @@ -923,6 +975,7 @@ mod tests { let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0), + will_init: false, }; for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ad7bcc07149d..e0283d99e0fb 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -499,7 +499,13 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + ctx, + crate::tenant::storage_layer::IoConcurrency::sequential(), + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1489,6 +1495,7 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; + use crate::tenant::storage_layer::IoConcurrency; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1532,6 +1539,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1599,7 +1607,13 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x20)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 2") @@ -1607,7 +1621,13 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x30)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") @@ -1615,14 +1635,26 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1630,21 +1662,39 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo 
blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1667,14 +1717,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1689,7 +1751,13 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1722,14 +1790,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1") @@ -1750,7 +1830,13 @@ mod tests { for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blk, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE @@ -1758,7 +1844,13 @@ mod tests { } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1500, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1500") @@ -1851,6 +1943,7 @@ mod tests { .await? 
.load() .await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1903,7 +1996,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(lsn), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1931,7 +2030,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1950,7 +2055,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1987,7 +2098,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1fbb651656c8..9d4c22484c8b 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -32,7 +32,6 @@ use tokio::sync::broadcast::error::RecvError; use tokio::time; use tonic::body::{self, empty_body, BoxBody}; use tonic::codegen::Service; -use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; @@ -459,9 +458,10 @@ impl BrokerService for Broker { &self, request: Request>, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let mut publisher = self.registry.register_publisher(remote_addr); let mut stream = request.into_inner(); @@ -484,9 +484,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_key = request .into_inner() .subscription_key @@ -537,9 +538,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> std::result::Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_filter = request.into_inner(); let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); @@ -628,6 +630,9 @@ async fn http1_handler( Ok(resp) } +#[derive(Clone, Copy)] +struct RemoteAddr(SocketAddr); + #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); @@ -687,13 +692,13 @@ async fn main() -> Result<(), Box> { .max_concurrent_streams(None); let storage_broker_server_cloned = 
storage_broker_server.clone(); - let connect_info = stream.connect_info(); + let remote_addr = RemoteAddr(addr); let service_fn_ = async move { service_fn(move |mut req| { // That's what tonic's MakeSvc.call does to pass conninfo to // the request handler (and where its request.remote_addr() // expects to find it). - req.extensions_mut().insert(connect_info.clone()); + req.extensions_mut().insert(remote_addr); // Technically this second clone is not needed, but consumption // by the async block is apparently unavoidable. BTW, error diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 6c22b31e0092..c82c7578d1a4 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -208,6 +208,10 @@ def parse(cls: type[ShardIndex], input: str) -> ShardIndex: shard_count=int(input[2:4], 16), ) + @property + def is_sharded(self) -> bool: + return self.shard_count != 0 + class TenantShardId: def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fa541bad171f..fd7e193778bb 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -126,12 +126,8 @@ def counter(name: str) -> str: "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", "pageserver_page_cache_size_max_bytes", - "pageserver_getpage_reconstruct_seconds_bucket", - "pageserver_getpage_reconstruct_seconds_count", - "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a01cb479842c..af427b92d2fc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -313,6 +313,10 @@ def safe_psql_scalar(self, query: str, log_query: bool = True) -> Any: """ return self.safe_psql(query, log_query=log_query)[0][0] + def show_timeline_id(self) -> TimelineId: + """SHOW neon.timeline_id""" + return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) + class PageserverWalReceiverProtocol(StrEnum): VANILLA = "vanilla" @@ -387,6 +391,7 @@ def __init__( storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, + pageserver_get_vectored_concurrent_io: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -426,6 +431,9 @@ def __init__( self.storage_controller_config: dict[Any, Any] | None = None self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_concurrent_io: str | None = ( + pageserver_get_vectored_concurrent_io + ) self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm @@ -452,6 +460,7 @@ def __init__( self.test_name = test_name self.compatibility_neon_binpath = compatibility_neon_binpath self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.test_may_use_compatibility_snapshot_binaries = False self.version_combination =
combination self.mixdir = self.test_output_dir / "mixdir_neon" if self.version_combination is not None: @@ -463,6 +472,7 @@ def __init__( ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" self.mixdir.mkdir(mode=0o755, exist_ok=True) self._mix_versions() + self.test_may_use_compatibility_snapshot_binaries = True def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -1062,6 +1072,7 @@ def __init__(self, config: NeonEnvBuilder): self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol + self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1115,12 +1126,24 @@ def __init__(self, config: NeonEnvBuilder): # Batching (https://github.com/neondatabase/neon/issues/9377): # enable batching by default in tests and benchmarks. + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + + # Concurrent IO (https://github.com/neondatabase/neon/issues/9378): + # enable concurrent IO by default in tests and benchmarks. # Compat tests are exempt because old versions fail to parse the new config. - if not config.compatibility_neon_binpath: - ps_cfg["page_service_pipelining"] = { - "mode": "pipelined", - "execution": "concurrent-futures", - "max_batch_size": 32, + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io + if config.test_may_use_compatibility_snapshot_binaries: + log.info( + "Forcing use of binary-built-in default to avoid forward-compatibility-related test failures" + ) + get_vectored_concurrent_io = None + if get_vectored_concurrent_io is not None: + ps_cfg["get_vectored_concurrent_io"] = { + "mode": get_vectored_concurrent_io, } if self.pageserver_virtual_file_io_engine is not None: @@ -1457,6 +1480,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1489,6 +1513,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, combination=combination, ) as builder: env = builder.init_start() @@ -1515,6 +1540,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test.
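The fixture plumbing above forwards `PAGESERVER_GET_VECTORED_CONCURRENT_IO` into the pageserver config as `get_vectored_concurrent_io.mode`; `sequential` and `sidecar-task` are the two values the CI matrix exercises. A minimal sketch of what parsing such a knob could look like on the Rust side; the type names here are hypothetical, not the actual pageserver config types:

```rust
/// Hypothetical mirror of the `get_vectored_concurrent_io` knob the fixture sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum GetVectoredConcurrentIo {
    /// Await every value IO inline while visiting layers (the old behavior).
    Sequential,
    /// Submit IOs to a sidecar task and only await them once traversal is done.
    SidecarTask,
}

impl std::str::FromStr for GetVectoredConcurrentIo {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Accept the same spellings the CI matrix and fixture use.
        match s {
            "sequential" => Ok(Self::Sequential),
            "sidecar-task" => Ok(Self::SidecarTask),
            other => Err(format!("unknown get_vectored_concurrent_io mode: {other}")),
        }
    }
}

fn main() {
    // A test run would arrive here via PAGESERVER_GET_VECTORED_CONCURRENT_IO.
    let mode: GetVectoredConcurrentIo = "sidecar-task".parse().unwrap();
    assert_eq!(mode, GetVectoredConcurrentIo::SidecarTask);
}
```

Keeping the unknown-value case an `Err` rather than a panic matters here: old pageserver binaries reject unknown config keys, which is exactly why the compat-test branch above falls back to the binary-built-in default.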
@@ -1557,6 +1583,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, ) as builder: yield builder # Propagate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index f57c0f801f17..3404c16f5515 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -44,6 +44,11 @@ def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_get_vectored_concurrent_io() -> str | None: + return os.getenv("PAGESERVER_GET_VECTORED_CONCURRENT_IO") + + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b34dbddc80cb..b8d47346a3ef 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -177,6 +177,9 @@ def test_fully_custom_config(positive_env: NeonEnv): "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, "rel_size_v2_enabled": True, + "gc_compaction_enabled": True, + "gc_compaction_initial_threshold_kb": 1024000, + "gc_compaction_ratio_percent": 200, } vps_http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a6eaaf6c4c9a..ac44630d308b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -251,6 +251,8 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.test_may_use_compatibility_snapshot_binaries = True + try: neon_env_builder.num_safekeepers = 3 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0ffeeead18b4..a3dffa8f195a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -17,8 +17,6 @@ license.workspace = true [dependencies] ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } -axum = { version = "0.7", features = ["ws"] } -axum-core = { version = "0.4", default-features = false, features = ["tracing"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -46,7 +44,7 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } @@ -87,12 +85,11 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", features = ["tls-roots"] } -tower-9fbad63c4bcf4a8f = { package = "tower", version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } -tower-d8f496e17d97b5cb = { package = "tower", version = "0.5", default-features = false, features = ["log", "make", "util"] } +tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] }
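Taken together, the pageserver hunks above implement one pattern: `ValuesReconstructState` records a pending IO per value, the layer-visiting code submits those IOs without awaiting them inline, and `get_vectored_impl` collects them per key afterwards via `collect_pending_ios`, including on the traversal error path so no IO is left dangling. A minimal sketch of that submit-now-collect-later shape; `SidecarIo` and `spawn_io` here are hypothetical names, not the pageserver's actual types:

```rust
use std::future::Future;
use std::pin::Pin;

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::{mpsc, oneshot};

type BoxedIo = Pin<Box<dyn Future<Output = ()> + Send>>;

/// Hypothetical sidecar: submitted IO futures are driven by a dedicated task,
/// so the layer-visiting code never awaits an individual IO inline.
struct SidecarIo {
    tx: mpsc::UnboundedSender<BoxedIo>,
}

impl SidecarIo {
    fn spawn() -> Self {
        let (tx, mut rx) = mpsc::unbounded_channel::<BoxedIo>();
        tokio::spawn(async move {
            let mut in_flight = FuturesUnordered::new();
            loop {
                tokio::select! {
                    Some(io) = rx.recv() => in_flight.push(io),
                    Some(()) = in_flight.next(), if !in_flight.is_empty() => {}
                    // Both the channel and the in-flight set are exhausted.
                    else => break,
                }
            }
        });
        Self { tx }
    }

    /// Submit an IO and get back a receiver to collect its result later,
    /// analogous in spirit to the `collect_pending_ios` calls in the patch.
    fn spawn_io<T, F>(&self, io: F) -> oneshot::Receiver<T>
    where
        T: Send + 'static,
        F: Future<Output = T> + Send + 'static,
    {
        let (done_tx, done_rx) = oneshot::channel();
        let _ = self.tx.send(Box::pin(async move {
            // Ignore send errors: the collector may have given up on this IO.
            let _ = done_tx.send(io.await);
        }));
        done_rx
    }
}

#[tokio::main]
async fn main() {
    let sidecar = SidecarIo::spawn();
    // Submit two "IOs" up front, then collect both results afterwards.
    let a = sidecar.spawn_io(async { 1u64 });
    let b = sidecar.spawn_io(async { 2u64 });
    assert_eq!(a.await.unwrap() + b.await.unwrap(), 3);
}
```

The drain-on-error behavior in `get_vectored_impl` corresponds to awaiting every outstanding receiver before propagating the error, so no submitted IO outlives the buffers it was handed.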
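The `StreamingVectoredReadPlanner` changes thread `will_init` through its existing bookkeeping, which "lags behind by one blob" so that each blob's end offset is supplied by the next blob's start (or by `handle_range_end`). Distilled to the offset arithmetic alone, with a hypothetical `LagByOne` type:

```rust
/// Hypothetical distillation of the planner's lag-by-one trick: to emit
/// (start, end) ranges from a stream of blob start offsets, hold the previous
/// offset until the next one (or the end of the range) supplies its end.
struct LagByOne {
    prev: Option<u64>,
}

impl LagByOne {
    fn handle(&mut self, offset: u64) -> Option<(u64, u64)> {
        // `replace` stores the new offset and yields the previous one, if any.
        let prev = self.prev.replace(offset)?;
        Some((prev, offset))
    }

    fn finish(&mut self, end: u64) -> Option<(u64, u64)> {
        self.prev.take().map(|prev| (prev, end))
    }
}

fn main() {
    let mut planner = LagByOne { prev: None };
    let mut ranges = Vec::new();
    for offset in [0u64, 128, 512] {
        ranges.extend(planner.handle(offset));
    }
    ranges.extend(planner.finish(1024));
    assert_eq!(ranges, vec![(0, 128), (128, 512), (512, 1024)]);
}
```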
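Finally, the storage_broker hunks replace `Request::remote_addr()`, which depended on tonic's `Connected`-based connect-info plumbing, with a `RemoteAddr` newtype inserted into the request extensions at accept time. A sketch of the same pattern against the tonic 0.12 API; the `peer_addr` helper is illustrative, whereas the patch inlines this lookup in each handler:

```rust
use std::net::SocketAddr;

use tonic::{Request, Status};

/// Newtype so this extension can't collide with any other `SocketAddr` entry.
#[derive(Clone, Copy)]
struct RemoteAddr(SocketAddr);

/// What each handler in the patch does: read the address back out of the
/// request extensions instead of calling `Request::remote_addr()`.
fn peer_addr<T>(req: &Request<T>) -> Result<SocketAddr, Status> {
    req.extensions()
        .get::<RemoteAddr>()
        .map(|&RemoteAddr(addr)| addr)
        .ok_or_else(|| Status::internal("RemoteAddr not inserted at accept time"))
}

fn main() {
    // At accept time the connection handler knows the peer address and stashes
    // it; `extensions_mut().insert` requires `Clone`, which the newtype derives.
    let addr: SocketAddr = "127.0.0.1:50051".parse().unwrap();
    let mut req = Request::new(());
    req.extensions_mut().insert(RemoteAddr(addr));
    assert_eq!(peer_addr(&req).unwrap(), addr);
}
```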