Skip to content

Commit

Permalink
feat(pageserver): separate sparse and dense keyspace (#7503)
Browse files Browse the repository at this point in the history
extracted (and tested) from
#7468, part of
#7462.

The current codebase assumes the keyspace is dense -- which means that
if we have a keyspace of 0x00-0x100, we assume every key (e.g., 0x00,
0x01, 0x02, ...) exists in the storage engine. However, the assumption
does not hold any more in metadata keyspace. The metadata keyspace is
sparse. It is impossible to do per-key check.

Ideally, we should not have the assumption of dense keyspace at all, but
this would incur a lot of refactors. Therefore, we split the keyspaces
we have to dense/sparse and handle them differently in the code for now.
At some point in the future, we should assume all keyspaces are sparse.

## Summary of changes

* Split collect_keyspace to return dense+sparse keyspace.
* Do not allow generating image layers for sparse keyspace (for now --
will fix this next week, we need image layers anyways).
* Generate delta layers for sparse keyspace.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
  • Loading branch information
skyzh authored Apr 30, 2024
1 parent 84b6b95 commit 45c625f
Show file tree
Hide file tree
Showing 8 changed files with 266 additions and 100 deletions.
27 changes: 27 additions & 0 deletions libs/pageserver_api/src/keyspace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>,
}

/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);

/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
Expand Down Expand Up @@ -435,10 +439,33 @@ pub struct KeyPartitioning {
pub parts: Vec<KeySpace>,
}

/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
pub parts: Vec<SparseKeySpace>,
}

impl KeyPartitioning {
pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() }
}

/// Convert a key partitioning to a sparse partition.
pub fn into_sparse(self) -> SparseKeyPartitioning {
SparseKeyPartitioning {
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
}
}
}

impl SparseKeyPartitioning {
/// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
/// cause long/dead loops.
pub fn into_dense(self) -> KeyPartitioning {
KeyPartitioning {
parts: self.parts.into_iter().map(|x| x.0).collect(),
}
}
}

///
Expand Down
14 changes: 13 additions & 1 deletion libs/pageserver_api/src/models/partitioning.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use utils::lsn::Lsn;

use crate::keyspace::SparseKeySpace;

#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,

pub sparse_keys: crate::keyspace::SparseKeySpace,
pub at_lsn: Lsn,
}

Expand Down Expand Up @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("sparse_keys")?;
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
Expand Down Expand Up @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
sparse_keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
Expand All @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
sparse_keys: SparseKeySpace(de.sparse_keys.0),
})
}
}
Expand All @@ -133,6 +139,12 @@ mod tests {
"030000000000000000000000000000000003"
]
],
"sparse_keys": [
[
"620000000000000000000000000000000000",
"620000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;
Expand Down
6 changes: 4 additions & 2 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let keys = timeline
let (dense_ks, sparse_ks) = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;

let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
// This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
// Therefore, we split dense/sparse keys in this API.
let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };

json_response(StatusCode::OK, res)
}
Expand Down
12 changes: 10 additions & 2 deletions pageserver/src/pgdatadir_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
Expand Down Expand Up @@ -730,11 +731,13 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<KeySpace, CollectKeySpaceError> {
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();

Expand Down Expand Up @@ -806,7 +809,12 @@ impl Timeline {
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
result.add_key(AUX_FILES_KEY);
}
Ok(result.to_keyspace())

Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
))
}

/// Get cached size of relation if it not updated after specified LSN
Expand Down
1 change: 1 addition & 0 deletions pageserver/src/tenant/layer_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,7 @@ mod tests {
assert_eq!(lhs, rhs);
}

#[cfg(test)]
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,
Expand Down
24 changes: 21 additions & 3 deletions pageserver/src/tenant/storage_layer/inmemory_layer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -597,14 +597,17 @@ impl InMemoryLayer {
}
}

/// Write this frozen in-memory layer to disk.
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
/// layer will only contain the key range the user specifies, and may return `None`
/// if there are no matching keys.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<ResidentLayer> {
key_range: Option<Range<Key>>,
) -> Result<Option<ResidentLayer>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
Expand All @@ -618,6 +621,21 @@ impl InMemoryLayer {

let end_lsn = *self.end_lsn.get().unwrap();

let keys: Vec<_> = if let Some(key_range) = key_range {
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.map(|(k, m)| (k.to_i128(), m))
.collect()
} else {
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
};

if keys.is_empty() {
return Ok(None);
}

let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
Expand Down Expand Up @@ -649,6 +667,6 @@ impl InMemoryLayer {

// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
Ok(delta_layer)
Ok(Some(delta_layer))
}
}
Loading

1 comment on commit 45c625f

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2930 tests run: 2796 passed, 0 failed, 134 skipped (full report)


Code coverage* (full report)

  • functions: 28.2% (6583 of 23348 functions)
  • lines: 47.0% (46825 of 99700 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
45c625f at 2024-04-30T15:03:22.930Z :recycle:

Please sign in to comment.