Skip to content

Commit b418337

Browse files
authored
Merge of #19730
2 parents 709e36a + d59ad67 commit b418337

26 files changed

+530
-26
lines changed

.github/workflows/benchmarks.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: benchmarks
22

33
# Secret access requires allowlist in https://github.com/elastic/oblt-infra/tree/main/conf/resources/repos/apm-server
44

5-
# To call benchmarks workflow programmatically with different params, use the `gh workflow run` command.
5+
# To call benchmarks workflow programmatically with different params, use the scripts in .github/workflows/benchmarks
66

77
on:
88
workflow_dispatch:
@@ -203,6 +203,10 @@ jobs:
203203
if: ${{ env.RUN_STANDALONE == 'true' && failure() }}
204204
run: make cat-apm-server-logs
205205

206+
- name: Cat moxy logs
207+
if: ${{ env.RUN_STANDALONE == 'true' && failure() }}
208+
run: make cat-moxy-logs
209+
206210
- name: Index benchmarks result
207211
run: make index-benchmark-results
208212

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Benchmark Scripts
2+
3+
This directory contains utility scripts for users to run locally to kickstart and analyze APM Server benchmark workflows in different configurations.
4+
5+
The scripts here are used to generate the TBS benchmark docs at https://www.elastic.co/docs/solutions/observability/apm/transaction-sampling#_tail_based_sampling_performance_and_requirements
6+
7+
To generate non-TBS benchmarks, modify the benchmark workflow inputs in `start-workflows.sh` and use the other scripts as is.
8+
9+
## Workflow
10+
11+
1. **Run benchmarks**: Execute `start-workflows.sh` to trigger multiple benchmark workflow runs on GitHub and save run IDs to `${BENCH_BRANCH}.txt`
12+
2. **Wait for completion**: Monitor the workflow runs on GitHub Actions until all benchmarks finish
13+
3. **Download results**: Run `download-artifacts.sh` to fetch all benchmark artifacts using the saved run IDs
14+
4. **Analyze results**: Run `analyze-artifacts.sh` to generate benchstat analysis for each result
15+
5. **Summarize for documentation**: Create a summary table from the benchstat results for the documentation (can be done manually or using an LLM to extract geomean values from each `benchstat.txt` file)
16+
17+
```bash
18+
# Run benchmarks on the tbs-arm-bench-92 branch
19+
BENCH_BRANCH=tbs-arm-bench-92 ./start-workflows.sh
20+
21+
# Wait for workflows to complete on GitHub Actions
22+
23+
# Download all benchmark results using the saved run IDs
24+
BENCH_BRANCH=tbs-arm-bench-92 ./download-artifacts.sh
25+
26+
# Analyze the results by generating benchstat output for each one
27+
./analyze-artifacts.sh
28+
```
29+
30+
## start-workflows.sh
31+
32+
Triggers multiple benchmark workflows on GitHub Actions with different instance configurations.
33+
34+
### Usage
35+
36+
```bash
37+
BENCH_BRANCH=tbs-arm-bench-92 ./start-workflows.sh
38+
```
39+
40+
### Environment Variables
41+
42+
- `BENCH_BRANCH` (required): The git branch/ref to run the benchmarks against
43+
44+
### Description
45+
46+
This script triggers 9 benchmark workflow runs with different configurations:
47+
- Tests 8GB, 16GB, and 32GB instance profiles
48+
- For each instance profile, compares 3 configurations: TBS disabled, TBS enabled on gp3 (3000 IOPS) storage, and TBS enabled on NVMe storage
49+
50+
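The script triggers the configurations sequentially, with a 2-second delay between each, and appends each workflow run ID to `${BENCH_BRANCH}.txt` for later use with `download-artifacts.sh`. Because IDs are appended, remove any existing `${BENCH_BRANCH}.txt` before starting a fresh set of runs.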
51+
52+
The script can be modified to run any other configuration, e.g. non-TBS.
53+
54+
## download-artifacts.sh
55+
56+
Downloads benchmark result artifacts from completed GitHub workflow runs.
57+
58+
### Usage
59+
60+
```bash
61+
# Download all benchmark results (after running start-workflows.sh)
62+
BENCH_BRANCH=tbs-arm-bench-92 ./download-artifacts.sh
63+
```
64+
65+
### Environment Variables
66+
67+
- `BENCH_BRANCH` (required): The git branch/ref name, used to locate the file containing workflow run IDs
68+
69+
### Input
70+
71+
Reads workflow run IDs from `${BENCH_BRANCH}.txt` file (one per line). Each benchmark result is downloaded to a numbered directory (`benchmark-result-1`, `benchmark-result-2`, etc.).
72+
73+
## analyze-artifacts.sh
74+
75+
Generates benchstat analysis for each downloaded benchmark result.
76+
77+
### Usage
78+
79+
```bash
80+
./analyze-artifacts.sh
81+
# Creates benchmark-result-1/benchstat.txt, benchmark-result-2/benchstat.txt, etc.
82+
```
83+
84+
### Description
85+
86+
For each `benchmark-result-*/benchmark-result.txt` file, runs `benchstat` and saves the output to `benchstat.txt` in the same directory.
87+
88+
### Requirements
89+
90+
- `benchstat` must be installed (`go install golang.org/x/perf/cmd/benchstat@latest`)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/bash
#
# Runs benchstat on every benchmark-result-*/benchmark-result.txt found in the
# current directory and writes the output to benchstat.txt next to the input.
# Exits non-zero if benchstat is missing or fails for any result.

# Fail early with an actionable message instead of one "command not found"
# error per result directory.
if ! command -v benchstat >/dev/null 2>&1; then
    echo "Error: benchstat not found in PATH (install with: go install golang.org/x/perf/cmd/benchstat@latest)" >&2
    exit 1
fi

status=0

# Analyze each benchmark result
for dir in benchmark-result-*/; do
    # Drop the trailing slash left by the glob so printed paths have no "//".
    dir="${dir%/}"
    input="$dir/benchmark-result.txt"
    output="$dir/benchstat.txt"

    # Also covers the case where the glob matched nothing and stayed literal.
    [ -f "$input" ] || continue

    echo "=== Analyzing $dir ==="
    # Write to a temporary file first so a failed run never leaves an empty or
    # truncated benchstat.txt that looks like a valid result.
    if benchstat "$input" > "$output.tmp"; then
        mv -f "$output.tmp" "$output"
        echo "Saved to $output"
    else
        rm -f "$output.tmp"
        echo "Error: benchstat failed for $input" >&2
        status=1
    fi
done

exit "$status"
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#
# Downloads the "benchmark-result" artifact of every workflow run listed in
# ${BENCH_BRANCH}.txt (one run ID per line). The run on line N is downloaded
# into the directory benchmark-result-N.
# Exits non-zero if any run could not be downloaded.

# Validate BENCH_BRANCH environment variable
if [ -z "$BENCH_BRANCH" ]; then
    echo "Error: BENCH_BRANCH environment variable is not set or is empty"
    exit 1
fi

# Read workflow run IDs from ${BENCH_BRANCH}.txt
ids_file="${BENCH_BRANCH}.txt"
if [ ! -f "$ids_file" ]; then
    echo "Error: File ${BENCH_BRANCH}.txt not found"
    exit 1
fi

result_num=0
failures=0

# A plain read loop is used instead of mapfile so the script also works with
# bash 3.2 (the default on macOS). The `|| [ -n "$run_id" ]` part handles a
# final line that has no trailing newline.
while IFS= read -r run_id || [ -n "$run_id" ]; do
    # Number by line position, including blank lines, so that result N always
    # corresponds to the N-th entry in the IDs file.
    result_num=$((result_num + 1))

    if [ -z "$run_id" ]; then
        echo "Warning: line $result_num of $ids_file is empty, skipping benchmark-result-$result_num" >&2
        failures=$((failures + 1))
        continue
    fi

    echo "Downloading run $run_id as benchmark-result-$result_num..."
    # stdin is redirected so gh cannot consume the remaining lines of the IDs file.
    if ! gh run download "$run_id" -n benchmark-result -D "benchmark-result-$result_num" < /dev/null; then
        echo "Error: failed to download artifacts for run $run_id" >&2
        failures=$((failures + 1))
    fi
done < "$ids_file"

if [ "$failures" -gt 0 ]; then
    echo "Finished with $failures failed download(s)" >&2
    exit 1
fi

echo "Done!"
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/sh
#
# Triggers the benchmarks workflow once per instance profile / TBS configuration
# and appends the ID of every triggered run to ${BENCH_BRANCH}.txt, one per line,
# in the order the configurations are listed at the bottom of this script.

# Validate BENCH_BRANCH environment variable
if [ -z "$BENCH_BRANCH" ]; then
    echo "Error: BENCH_BRANCH environment variable is not set or is empty. Set it to the branch on which the workflow should run."
    exit 1
fi

# latest_run_id prints the ID of the most recent benchmarks.yml run on
# BENCH_BRANCH.
latest_run_id() {
    gh run list --workflow=benchmarks.yml --branch "$BENCH_BRANCH" --json 'databaseId' -q '.[0].databaseId'
}

# start_benchmark ENABLE_TBS PROFILE
#
# Dispatches one benchmark run and records its run ID.
#   ENABLE_TBS  "true" or "false", passed as the enableTailSampling input
#   PROFILE     tfvars path passed as the profile input
#
# `gh workflow run` returns before the run is visible in `gh run list`, so the
# function polls until the newest run ID differs from the last one seen.
# Looking it up immediately could record the previous run's ID, or nothing.
#
# NOTE: the newest run on the branch is assumed to be the one just dispatched;
# runs started concurrently by someone else on the same branch can be mistaken
# for it.
start_benchmark() {
    enable_tbs=$1
    profile=$2

    if ! gh workflow run benchmarks.yml --ref "$BENCH_BRANCH" \
        -f runStandalone=true \
        -f enableTailSampling="$enable_tbs" \
        -f tailSamplingStorageLimit=0 \
        -f tailSamplingSampleRate=0.1 \
        -f profile="$profile" \
        -f pgoExport=false \
        -f benchmarkAgents=1024 \
        -f benchmarkRun=BenchmarkTraces \
        -f warmupTime=5m
    then
        echo "Error: failed to trigger benchmarks workflow (enableTailSampling=$enable_tbs, profile=$profile)" >&2
        exit 1
    fi

    attempts=0
    while :; do
        # This also provides the 2-second spacing between consecutive dispatches.
        sleep 2
        run_id=$(latest_run_id)
        # "null" is guarded against in case the query yields a JSON null when
        # no run exists yet.
        if [ -n "$run_id" ] && [ "$run_id" != "null" ] && [ "$run_id" != "$previous_run_id" ]; then
            break
        fi
        attempts=$((attempts + 1))
        if [ "$attempts" -ge 30 ]; then
            echo "Error: timed out waiting for the triggered run to appear (enableTailSampling=$enable_tbs, profile=$profile)" >&2
            exit 1
        fi
    done

    previous_run_id=$run_id
    echo "$run_id" >> "${BENCH_BRANCH}.txt"
}

# Remember the newest run that already exists so it is never mistaken for a
# run triggered by this script.
previous_run_id=$(latest_run_id)

# 8GB, TBS disabled
start_benchmark false system-profiles/8GB_ARM-x1zone.tfvars
# 8GB, TBS enabled, gp3 3000IOPS
start_benchmark true system-profiles/8GB_ARM-x1zone.tfvars
# 8GB, TBS enabled, NVMe
start_benchmark true system-profiles/8GB_NVMe_ARM-x1zone.tfvars

# 16GB, TBS disabled
start_benchmark false system-profiles/16GB_ARM-x2zone.tfvars
# 16GB, TBS enabled, gp3 3000IOPS
start_benchmark true system-profiles/16GB_ARM-x2zone.tfvars
# 16GB, TBS enabled, NVMe
start_benchmark true system-profiles/16GB_NVMe_ARM-x2zone.tfvars

# 32GB, TBS disabled
start_benchmark false system-profiles/32GB_ARM-x2zone.tfvars
# 32GB, TBS enabled, gp3 3000IOPS
start_benchmark true system-profiles/32GB_ARM-x2zone.tfvars
# 32GB, TBS enabled, NVMe
start_benchmark true system-profiles/32GB_NVMe_ARM-x2zone.tfvars

systemtest/benchtest/expvar/metrics.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ func run(ctx context.Context, serverURL string, period time.Duration) (<-chan ex
263263
return
264264
case <-ticker.C:
265265
var e expvar
266-
ctxWithTimeout, cancel := context.WithTimeout(ctx, period)
266+
ctxWithTimeout, cancel := context.WithTimeout(ctx, period+5*time.Second)
267267
err := queryExpvar(ctxWithTimeout, &e, serverURL)
268268
cancel()
269269
if err != nil {

systemtest/benchtest/main.go

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,14 @@ import (
3636

3737
"go.elastic.co/apm/v2/stacktrace"
3838
"go.uber.org/zap"
39-
"go.uber.org/zap/zaptest"
4039
"golang.org/x/time/rate"
4140

4241
"github.com/elastic/apm-perf/loadgen"
4342
loadgencfg "github.com/elastic/apm-perf/loadgen/config"
4443
"github.com/elastic/apm-server/systemtest/benchtest/expvar"
4544
)
4645

47-
const waitInactiveTimeout = 60 * time.Second
46+
const waitInactiveTimeout = 90 * time.Second
4847

4948
// BenchmarkFunc is the benchmark function type accepted by Run.
5049
type BenchmarkFunc func(*testing.B, *rate.Limiter)
@@ -56,20 +55,35 @@ type benchmark struct {
5655
f BenchmarkFunc
5756
}
5857

58+
// getLogger returns a logger that does not depend on b.Log because b.Log does not work in testing.Benchmark.
59+
// See https://github.com/golang/go/issues/32066
60+
// Log to stdout to avoid interfering with benchmark output in stderr.
61+
func getLogger() (*zap.Logger, error) {
62+
c := zap.NewDevelopmentConfig()
63+
c.OutputPaths = []string{"stdout"}
64+
return c.Build()
65+
}
66+
5967
func runBenchmark(f BenchmarkFunc) (testing.BenchmarkResult, bool, bool, error) {
68+
logger, err := getLogger()
69+
if err != nil {
70+
return testing.BenchmarkResult{}, false, false, err
71+
}
6072
// Run the benchmark. testing.Benchmark will invoke the function
6173
// multiple times, but only returns the final result.
6274
var failed bool
6375
var skipped bool
6476
var collector *expvar.Collector
77+
var reterr error
6578
result := testing.Benchmark(func(b *testing.B) {
6679
ctx, cancel := context.WithCancel(context.Background())
6780
defer cancel()
6881
var err error
6982
server := loadgencfg.Config.ServerURL.String()
70-
collector, err = expvar.StartNewCollector(ctx, server, 100*time.Millisecond, zaptest.NewLogger(b))
83+
collector, err = expvar.StartNewCollector(ctx, server, 100*time.Millisecond, logger)
7184
if err != nil {
72-
b.Error(err)
85+
reterr = fmt.Errorf("expvar.StartNewCollector error: %w", err)
86+
b.Error(reterr)
7387
failed = b.Failed()
7488
return
7589
}
@@ -93,9 +107,11 @@ func runBenchmark(f BenchmarkFunc) (testing.BenchmarkResult, bool, bool, error)
93107
if !b.Failed() {
94108
watcher, err := collector.WatchMetric(expvar.ActiveEvents, 0)
95109
if err != nil {
96-
b.Error(err)
110+
reterr = fmt.Errorf("collector.WatchMetric error: %w", err)
111+
b.Error(reterr)
97112
} else if status := <-watcher; !status {
98-
b.Error("failed to wait for APM server to be inactive")
113+
reterr = fmt.Errorf("failed to wait for APM server to be inactive")
114+
b.Error(reterr)
99115
}
100116
}
101117
failed = b.Failed()
@@ -104,7 +120,7 @@ func runBenchmark(f BenchmarkFunc) (testing.BenchmarkResult, bool, bool, error)
104120
if result.Extra != nil {
105121
addExpvarMetrics(&result, collector, benchConfig.Detailed)
106122
}
107-
return result, failed, skipped, nil
123+
return result, failed, skipped, reterr
108124
}
109125

110126
func addExpvarMetrics(result *testing.BenchmarkResult, collector *expvar.Collector, detailed bool) {
@@ -234,7 +250,8 @@ func Run(allBenchmarks ...BenchmarkFunc) error {
234250
profileChan := profiles.record(name)
235251
result, failed, skipped, err := runBenchmark(benchmark.f)
236252
if err != nil {
237-
return err
253+
fmt.Fprintf(os.Stderr, "--- FAIL: %s\n", name)
254+
return fmt.Errorf("benchmark %q failed: %w", name, err)
238255
}
239256
if skipped {
240257
continue

systemtest/benchtest/profiles.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ func fetchProfile(urlPath string, duration time.Duration) (*profile.Profile, err
4444
query.Set("seconds", strconv.Itoa(int(duration.Seconds())))
4545
req.URL.RawQuery = query.Encode()
4646

47-
timeout := time.Duration(float64(duration) * 1.5)
48-
ctx := req.Context()
47+
timeout := duration * 3
4948
ctx, cancel := context.WithTimeout(req.Context(), timeout)
5049
defer cancel()
5150
req = req.WithContext(ctx)
@@ -108,7 +107,7 @@ func (p *profiles) recordCPU() error {
108107
}
109108
// We don't need the address in the profile, so discard it to reduce the size.
110109
if err := profile.Aggregate(true, true, true, true, false); err != nil {
111-
return fmt.Errorf("failed to fetch CPU profile: %w", err)
110+
return fmt.Errorf("failed to aggregate CPU profile: %w", err)
112111
}
113112
profile = profile.Compact()
114113
p.cpu = append(p.cpu, profile)

testing/benchmark/Makefile

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
1+
TFVARS_SOURCE ?= terraform.tfvars.example
2+
3+
# Automatically detect architecture based on instance type in TFVARS_SOURCE
4+
# If we need to change this, remember to grep for other instance type checks in the codebase.
5+
# Simply expanded (:=) so the detection pipeline runs once when the Makefile is parsed,
# rather than on every expansion of the *_GOARCH variables that default to it.
# TFVARS_SOURCE must therefore be assigned before this line.
DETECTED_ARCH := $(shell grep -E "standalone_apm_server_instance_size|standalone_moxy_instance_size|worker_instance_type" $(TFVARS_SOURCE) | head -1 | cut -d'=' -f2 | tr -d ' "' | grep -qE '^(a1|t4g|c6g|c7g|m6g|m7g|r6g|r7g|x2gd)' && echo "arm64" || echo "amd64")
6+
17
APMBENCH_PATH ?= ../../systemtest/cmd/apmbench
28
APMBENCH_GOOS ?= linux
3-
APMBENCH_GOARCH ?= amd64
9+
APMBENCH_GOARCH ?= $(DETECTED_ARCH)
410

511
MOXY_GOOS ?= linux
6-
MOXY_GOARCH ?= amd64
12+
MOXY_GOARCH ?= $(DETECTED_ARCH)
713

814
APM_SERVER_GOOS ?= linux
9-
APM_SERVER_GOARCH ?= amd64
10-
11-
TFVARS_SOURCE ?= terraform.tfvars.example
15+
APM_SERVER_GOARCH ?= $(DETECTED_ARCH)
1216

1317
BENCHMARK_WARMUP_TIME ?= 5m
1418
BENCHMARK_AGENTS ?= 64
@@ -137,6 +141,11 @@ endif
137141
cat-apm-server-logs:
138142
@ssh $(SSH_OPTS) -i $(SSH_KEY) $(SSH_USER)@$(APM_SERVER_IP) "cat /var/log/apm-server/*"
139143

144+
.PHONY: cat-moxy-logs
145+
cat-moxy-logs:
146+
$(eval MOXY_IP = $(shell terraform output -raw moxy_ip))
147+
@ssh $(SSH_OPTS) -i $(SSH_KEY) $(SSH_USER)@$(MOXY_IP) "cat moxy.log"
148+
140149
$(SSH_KEY):
141150
@ssh-keygen -t rsa -b 4096 -C "$(USER)@elastic.co" -N "" -f $(SSH_KEY)
142151

testing/benchmark/main.tf

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ provider "aws" {
4444

4545
locals {
4646
name_prefix = "${coalesce(var.user_name, "unknown-user")}-bench"
47+
48+
# Detect if standalone APM server instance type is ARM (Graviton) based
49+
# If we need to change this, remember to grep for other instance type checks in the codebase.
50+
standalone_apm_is_arm = can(regex("^(a1|t4g|c6g|c7g|m6g|m7g|r6g|r7g|x2gd)", var.standalone_apm_server_instance_size))
4751
}
4852

4953
module "vpc" {
@@ -153,8 +157,9 @@ module "standalone_apm_server" {
153157
count = var.run_standalone ? 1 : 0
154158
source = "../infra/terraform/modules/standalone_apm_server"
155159

156-
vpc_id = module.vpc.vpc_id
157-
aws_os = "al2023-ami-2023.*-x86_64"
160+
vpc_id = module.vpc.vpc_id
161+
# Use appropriate AMI pattern based on instance architecture
162+
aws_os = local.standalone_apm_is_arm ? "al2023-ami-2023" : "al2023-ami-2023.*-x86_64"
158163
apm_instance_type = var.standalone_apm_server_instance_size
159164
apm_volume_type = var.standalone_apm_server_volume_type
160165
apm_volume_size = var.apm_server_tail_sampling ? coalesce(var.standalone_apm_server_volume_size, 60) : var.standalone_apm_server_volume_size

0 commit comments

Comments
 (0)