test_runner/performance: add sharded ingest benchmark (#9591)
Adds a Python benchmark for sharded ingestion. It ingests about 7 GB of WAL
(100M rows) into a Safekeeper and fans it out to a configurable number of shards
(1, 8, or 32), each running on its own pageserver. The ingest volume and duration
are recorded.
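
As a rough sanity check of the quoted WAL volume: a two-column integer insert produces a few dozen bytes of WAL per row, so 100M rows lands in the ~7 GB range. A minimal back-of-the-envelope sketch, assuming roughly 70 bytes of WAL per row (an estimate, not something this benchmark measures directly):

    # Rough sanity check of the ~7 GB figure; the per-row WAL size is an assumption.
    ROW_COUNT = 100_000_000
    ASSUMED_WAL_BYTES_PER_ROW = 70  # ballpark for a 2-column int row incl. record headers

    total_gb = ROW_COUNT * ASSUMED_WAL_BYTES_PER_ROW / 1_000_000_000
    print(f"~{total_gb:.1f} GB of WAL")  # ~7.0 GB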
erikgrinaker authored Nov 2, 2024
1 parent 8ac523d commit 0058eb0
Showing 1 changed file with 71 additions and 0 deletions.
test_runner/performance/test_sharded_ingest.py (71 additions, 0 deletions)
@@ -0,0 +1,71 @@
from __future__ import annotations

from contextlib import closing

import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.common_types import Lsn, TenantShardId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    tenant_get_shards,
    wait_for_last_flush_lsn,
)


@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
def test_sharded_ingest(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    shard_count: int,
):
"""
Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper
and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case
(shard_count=1) to the sharded case indicates the overhead of sharding.
"""

ROW_COUNT = 100_000_000 # about 7 GB of WAL

neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_start()

# Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure
# the storage controller doesn't mess with shard placements.
#
# TODO: there should be a way to disable storage controller background reconciliations.
# Currently, disabling reconciliation also disables foreground operations.
tenant_id, timeline_id = env.create_tenant(shard_count=shard_count)

for shard_number in range(0, shard_count):
tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
pageserver_id = shard_number + 1
env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_id)

shards = tenant_get_shards(env, tenant_id)
env.storage_controller.reconcile_until_idle()
assert tenant_get_shards(env, tenant_id) == shards, "shards moved"

# Start the endpoint.
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])

# Ingest data and measure WAL volume and duration.
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
log.info("Ingesting data")
cur.execute("set statement_timeout = 0")
cur.execute("create table huge (i int, j int)")

with zenbenchmark.record_duration("pageserver_ingest"):
with zenbenchmark.record_duration("wal_ingest"):
cur.execute(f"insert into huge values (generate_series(1, {ROW_COUNT}), 0)")

wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)

assert tenant_get_shards(env, tenant_id) == shards, "shards moved"

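The docstring frames the result as a comparison against the shard_count=1 baseline. A hypothetical post-processing sketch of that comparison (the helper name and the durations below are illustrative, not part of the benchmark or its fixtures):

    # Illustrative only: summarize sharding overhead from two measured durations.
    def sharding_overhead(baseline_secs: float, sharded_secs: float) -> float:
        """Relative slowdown of a sharded run versus the shard_count=1 baseline."""
        return sharded_secs / baseline_secs - 1.0

    # e.g. hypothetical wal_ingest durations: 90 s at shard_count=1, 108 s at shard_count=32
    print(f"{sharding_overhead(90.0, 108.0):.0%} overhead")  # 20% overhead
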
1 comment on commit 0058eb0

@github-actions github-actions bot commented on 0058eb0 Nov 2, 2024


5380 tests run: 5155 passed, 3 failed, 222 skipped (full report)


Failures on Postgres 17

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pg_regress[debug-pg17-4] or test_pg_regress[debug-pg17-None] or test_sharding_split_failures[debug-pg17-failure6]"
Flaky tests (17): Postgres 17, Postgres 15

Test coverage report is not available

The comment gets automatically updated with the latest test results
0058eb0 at 2024-11-04T00:39:50.802Z :recycle:
