Skip to content

Commit

Permalink
pageserver: add per-timeline read amp histogram (#10566)
Browse files Browse the repository at this point in the history
## Problem

We don't have per-timeline observability for read amplification.

Touches neondatabase/cloud#23283.

## Summary of changes

Add a per-timeline `pageserver_layers_per_read` histogram.

NB: per-timeline histograms are expensive, but probably worth it in this
case.
  • Loading branch information
erikgrinaker authored Jan 30, 2025
1 parent 8804d58 commit 6a2afa0
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 0 deletions.
19 changes: 19 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
/// are amortized across the batch, and some layers may not intersect with a given key, each visited
/// layer contributes directly to the observed latency for every read in the batch, which is what we
/// care about.
pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
    // Label set identifying a single timeline: one histogram is exported per distinct
    // (tenant_id, shard_id, timeline_id) combination.
    let label_names = ["tenant_id", "shard_id", "timeline_id"];

    // Low resolution to reduce cardinality.
    let buckets = vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0];

    let registered = register_histogram_vec!(
        "pageserver_layers_per_read",
        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
        &label_names,
        buckets,
    );

    // Registration failing is treated as a programming error, so panic with a fixed message.
    registered.expect("failed to define a metric")
});

pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_per_read_global",
Expand Down Expand Up @@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics {
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub layers_per_read: Histogram,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
pub visible_physical_size_gauge: UIntGauge,
Expand Down Expand Up @@ -2745,6 +2757,10 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let layers_per_read = LAYERS_PER_READ
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
Expand Down Expand Up @@ -2809,6 +2825,7 @@ impl TimelineMetrics {
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
layers_per_read,
standby_horizon_gauge,
resident_physical_size_gauge,
visible_physical_size_gauge,
Expand Down Expand Up @@ -2978,6 +2995,8 @@ impl TimelineMetrics {
}
}

let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);

let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
Expand Down
1 change: 1 addition & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1242,6 +1242,7 @@ impl Timeline {
}

for _ in &results {
self.metrics.layers_per_read.observe(layers_visited as f64);
LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
}
}
Expand Down
3 changes: 3 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def counter(name: str) -> str:
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_layers_per_read_bucket",
"pageserver_layers_per_read_count",
"pageserver_layers_per_read_sum",
"pageserver_visible_physical_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
Expand Down

1 comment on commit 6a2afa0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7565 tests run: 7204 passed, 0 failed, 361 skipped (full report)


Flaky tests (9)

Postgres 17

Postgres 16

Postgres 14

Code coverage* (full report)

  • functions: 33.4% (8511 of 25502 functions)
  • lines: 49.1% (71482 of 145542 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
6a2afa0 at 2025-01-30T14:17:52.091Z :recycle:

Please sign in to comment.