performance: add a new benchmarks workflow to enable PGO builds (#13884)

Add a benchmark workflow mode with automation to collect, preserve, and inject CPU profiles, enabling PGO builds. The new workflow runs on a schedule and raises a dedicated pull request that includes the most recent representative CPU profile, which is inserted as the `default.pgo` file into the main package and automatically used in the build pipeline. The actual schedule and the model for raising pull requests with updated profiles are subject to further revisions. This new workflow mode uses a lightweight output destination, a mock proxy (Moxy) from apm-perf, to better isolate the performance of the APM Server itself.
elastic · Oct 3, 2024 · 5af8cf4 · 5af8cf4
1 parent 1bc9e5e
commit 5af8cf4
Show file tree

Hide file tree

Showing 36 changed files with 731 additions and 195 deletions.
diff --git a/.ci/scripts/push-pgo-pr.sh b/.ci/scripts/push-pgo-pr.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Opens an auto-merging pull request that replaces
+# x-pack/apm-server/default.pgo with the CPU profile found at PROFILE_PATH.
+#
+# Required environment variables:
+#   WORKSPACE_PATH  Root of the repository checkout.
+#   PROFILE_PATH    CPU profile to commit; a relative path is resolved
+#                   against WORKSPACE_PATH because of the cd below.
+#   WORKFLOW        URL of the benchmarks run, quoted in the commit and PR.
+
+set -euo pipefail
+
+PGO_BRANCH="update-pgo-$(date +%s)"
+cd "$WORKSPACE_PATH"
+git fetch origin main
+git checkout main
+git checkout -b "$PGO_BRANCH"
+mv "$PROFILE_PATH" x-pack/apm-server/default.pgo
+git add x-pack/apm-server/default.pgo
+git commit -m "PGO: Update default.pgo from benchmarks $WORKFLOW."
+git push -u origin "$PGO_BRANCH"
+gh pr create -B main -H "$PGO_BRANCH" -t "PGO: Update default.pgo" -b "Update default.pgo CPU profile from the benchmarks [workflow]($WORKFLOW)." -R elastic/apm-server
+gh pr merge --auto --delete-branch --squash "$PGO_BRANCH"
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -3,6 +3,11 @@ name: benchmarks
 on:
   workflow_dispatch:
     inputs:
+      runStandalone: # Manual opt-in; scheduled PGO runs enable standalone mode through RUN_STANDALONE instead.
+        description: 'Run the benchmarks against standalone APM Server with Moxy'
+        required: false
+        type: boolean
+        default: false
       profile:
         description: 'The system profile used to run the benchmarks'
         required: false
@@ -21,10 +26,12 @@ on:
         required: false
         type: string
   schedule:
-    - cron: '0 17 * * *'
+    - cron: '0 17 * * *' # Scheduled regular benchmarks.
+    - cron: '0 5 */5 * *' # Scheduled PGO benchmarks. RUN_STANDALONE compares against this exact string; keep both in sync.
 
 env:
   PNG_REPORT_FILE: out.png
+  BENCHMARK_CPU_OUT: default.pgo # CPU profile file name, used for the artifact upload and as PROFILE_PATH of the PGO PR step.
   BENCHMARK_RESULT: benchmark-result.txt
   WORKING_DIRECTORY: testing/benchmark
 
@@ -38,12 +45,13 @@ jobs:
       run:
         working-directory: ${{ env.WORKING_DIRECTORY }}
     permissions:
-      contents: read
+      contents: write
       id-token: write
     env:
       SSH_KEY: ./id_rsa_terraform
       TF_VAR_private_key: ./id_rsa_terraform
       TF_VAR_public_key: ./id_rsa_terraform.pub
+      RUN_STANDALONE: ${{ inputs.runStandalone || github.event.schedule=='0 5 */5 * *' }} # Standalone on manual opt-in or when triggered by the PGO cron entry; the string must equal that cron expression.
       TFVARS_SOURCE: ${{ inputs.profile || 'system-profiles/8GBx1zone.tfvars' }} # // Default to use an 8gb profile
       TF_VAR_BUILD_ID: ${{ github.run_id }}
       TF_VAR_ENVIRONMENT: ci
@@ -101,28 +109,48 @@ jobs:
           terraform_version: 1.3.7
           terraform_wrapper: false
 
+      - name: Init terraform module
+        id: init
+        run: make init
+
       - name: Build apmbench
         run: make apmbench $SSH_KEY terraform.tfvars
 
+      - name: Build APM Server and Moxy
+        if: ${{ env.RUN_STANDALONE == 'true' }}
+        run: |
+          make apm-server
+          make moxy
+
       - name: Override docker committed version
-        if: ${{ ! inputs.runOnStable }}
+        if: ${{ ! inputs.runOnStable && env.RUN_STANDALONE == 'false' }}
         run: make docker-override-committed-version
 
       - name: Spin up benchmark environment
         id: deploy
         run: |
-          make init apply
+          make apply
           admin_console_url=$(terraform output -raw admin_console_url)
           echo "admin_console_url=$admin_console_url" >> "$GITHUB_OUTPUT"
           echo "-> infra setup done"
+        env:
+          TF_VAR_worker_region: ${{ env.AWS_REGION }}
+          TF_VAR_run_standalone: ${{ env.RUN_STANDALONE }} # NOTE(review): only set for this step; later make targets read var.run_standalone through terraform console — confirm they resolve the same value.
 
       - name: Run benchmarks autotuned
         if: ${{ inputs.benchmarkAgents == '' }}
-        run: make run-benchmark-autotuned index-benchmark-results
+        run: make run-benchmark-autotuned
 
       - name: Run benchmarks self tuned
         if: ${{ inputs.benchmarkAgents != '' }}
-        run: make run-benchmark index-benchmark-results
+        run: make run-benchmark
+
+      - name: Cat standalone server logs
+        if: ${{ env.RUN_STANDALONE == 'true' && failure() }} # Diagnostics only: runs when an earlier step failed in standalone mode.
+        run: make cat-apm-server-logs
+
+      - name: Index benchmarks result
+        run: make index-benchmark-results
 
       - name: Download PNG
         run: >-
@@ -150,15 +178,70 @@ jobs:
 
       - name: Upload benchmark result
         uses: actions/upload-artifact@v4
-        if: always()
         with:
           name: benchmark-result
           path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_RESULT }}
           if-no-files-found: error
 
+      # The next section injects the CPU profile collected by apmbench into the
+      # build: it copies the profile, uploads it as an artifact and pushes it
+      # via a PR that updates default.pgo.
+
+      - name: Copy CPU profile
+        run: make cp-cpuprof
+
+      - name: Upload CPU profile
+        uses: actions/upload-artifact@v4
+        with:
+          name: cpu-profile
+          path: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }}
+          if-no-files-found: error
+
+      # The app token, git identity and GPG key are only consumed by the
+      # "Open PGO PR" step, so they share its condition and are skipped otherwise.
+      - name: Get token
+        id: get_token
+        if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }}
+        uses: tibdex/github-app-token@3beb63f4bd073e61482598c45c71c1019b59b73a # v2.1.0
+        with:
+          app_id: ${{ secrets.OBS_AUTOMATION_APP_ID }}
+          private_key: ${{ secrets.OBS_AUTOMATION_APP_PEM }}
+          permissions: >-
+            {
+              "contents": "write",
+              "pull_requests": "write"
+            }
+
+      # Required to use a service account, otherwise PRs created by
+      # GitHub bot won't trigger any CI builds.
+      # See https://github.com/peter-evans/create-pull-request/issues/48#issuecomment-537478081
+      - name: Configure git user
+        if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }}
+        uses: elastic/oblt-actions/git/setup@v1
+        with:
+          github-token: ${{ steps.get_token.outputs.token }}
+
+      - name: Import GPG key
+        if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }}
+        uses: crazy-max/ghaction-import-gpg@01dd5d3ca463c7f10f7f4f7b4f177225ac661ee4  # v6.1.0
+        with:
+          gpg_private_key: ${{ secrets.APM_SERVER_RELEASE_GPG_PRIVATE_KEY }}
+          passphrase: ${{ secrets.APM_SERVER_RELEASE_PASSPHRASE }}
+          git_user_signingkey: true
+          git_commit_gpgsign: true
+
+      - name: Open PGO PR
+        if: ${{ env.RUN_STANDALONE == 'true' && github.ref == 'refs/heads/main' }}
+        run: ${{ github.workspace }}/.ci/scripts/push-pgo-pr.sh
+        env:
+          WORKSPACE_PATH: ${{ github.workspace }}
+          PROFILE_PATH: ${{ env.WORKING_DIRECTORY }}/${{ env.BENCHMARK_CPU_OUT }}
+          GITHUB_TOKEN: ${{ steps.get_token.outputs.token }}
+          WORKFLOW: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/attempts/${{ github.run_attempt }}
+
       - name: Tear down benchmark environment
         if: always()
-        run: make destroy
+        run: make init destroy
 
       # Notify failure to Slack only on schedule (nightly run)
       - if: failure() && github.event_name == 'schedule'

diff --git a/systemtest/benchtest/profiles.go b/systemtest/benchtest/profiles.go
@@ -88,7 +88,7 @@ func (p *profiles) recordCPU() error {
 	if benchConfig.CPUProfile == "" {
 		return nil
 	}
-	duration := 2 * benchConfig.Benchtime
+	duration := benchConfig.Benchtime
 	profile, err := fetchProfile("/debug/pprof/profile", duration)
 	if err != nil {
 		return fmt.Errorf("failed to fetch CPU profile: %w", err)

diff --git a/testing/benchmark/Makefile b/testing/benchmark/Makefile
@@ -2,6 +2,12 @@ APMBENCH_PATH ?= ../../systemtest/cmd/apmbench
 APMBENCH_GOOS ?= linux
 APMBENCH_GOARCH ?= amd64
 
+MOXY_GOOS ?= linux
+MOXY_GOARCH ?= amd64
+
+APM_SERVER_GOOS ?= linux
+APM_SERVER_GOARCH ?= amd64
+
 TFVARS_SOURCE ?= terraform.tfvars.example
 
 BENCHMARK_WARMUP_TIME ?= 5m
@@ -23,6 +29,8 @@ SSH_USER ?= ec2-user
 SSH_OPTS ?= -o LogLevel=ERROR -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=10
 SSH_KEY ?= ~/.ssh/id_rsa_terraform
 WORKER_IP = $(shell terraform output -raw public_ip)
+APM_SERVER_IP = $(shell terraform output -raw apm_server_ip)
+RUN_STANDALONE = $(shell echo var.run_standalone | terraform console | tr -d '"')
 
 SHELL = /bin/bash
 .SHELLFLAGS = -o pipefail -c
@@ -67,6 +75,15 @@ apmbench:
 	@echo "-> Building apmbench..."
 	@cd $(APMBENCH_PATH) && CGO_ENABLED=0 GOOS=$(APMBENCH_GOOS) GOARCH=$(APMBENCH_GOARCH) go build .
 
+.PHONY: moxy
+moxy: # Build the moxy binary (github.com/elastic/apm-perf/cmd/moxy) for MOXY_GOOS/MOXY_GOARCH from the ../../tools directory.
+	@echo "-> Building moxy..."
+	@cd ../../tools && CGO_ENABLED=0 GOOS=$(MOXY_GOOS) GOARCH=$(MOXY_GOARCH) go build -o "../build" github.com/elastic/apm-perf/cmd/moxy
+
+.PHONY: apm-server
+apm-server: # Build the GOOS-GOARCH specific server binary via the root Makefile, then rename it to build/apm-server.
+	@cd ../.. && make build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) && mv build/apm-server-$(APM_SERVER_GOOS)-$(APM_SERVER_GOARCH) build/apm-server
+
 .PHONY: init
 init:
 	@terraform init
@@ -110,13 +127,25 @@ index-benchmark-results: _default-gobench-vars
 
 .PHONY: _default-gobench-vars
 _default-gobench-vars:
+ifeq ($(RUN_STANDALONE),true) # Standalone mode - instance sizes from the standalone_* terraform variables, build_sha from the local git HEAD.
+	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.standalone_apm_server_instance_size | terraform console | tr -d '"'))
+	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),moxy_size=$(shell echo var.standalone_moxy_instance_size | terraform console | tr -d '"'))
+	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell git rev-parse HEAD))
+	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=standalone)
+else # Cloud mode - sizes and stack version from terraform variables, build_sha queried from the deployed APM Server.
 # TODO(marclop) Update code below to use a foor loop, rather than copying the lines.
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_size=$(shell echo var.apm_server_size | terraform console | tr -d '"'))
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_size=$(shell echo var.elasticsearch_size | terraform console | tr -d '"'))
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),stack_version=$(shell echo var.stack_version | terraform console | tr -d '"'))
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),apm_server_zone_count=$(shell echo var.apm_server_zone_count | terraform console | tr -d '"'))
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),elasticsearch_zone_count=$(shell echo var.elasticsearch_zone_count | terraform console | tr -d '"'))
 	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),build_sha=$(shell curl -sL -H "Authorization: Bearer $(shell terraform output -raw apm_secret_token )" $(shell terraform output -raw apm_server_url ) | jq -r '.build_sha'))
+	$(eval GOBENCH_DEFAULT_TAGS = $(GOBENCH_DEFAULT_TAGS),bench_mode=cloud)
+endif
+
+.PHONY: cat-apm-server-logs
+cat-apm-server-logs: # Print all files under /var/log/apm-server from the APM_SERVER_IP host over SSH.
+	@ssh $(SSH_OPTS) -i $(SSH_KEY) $(SSH_USER)@$(APM_SERVER_IP) "cat /var/log/apm-server/*"
 
 $(SSH_KEY):
 	@ssh-keygen -t rsa -b 4096 -C "$(USER)@elastic.co" -N "" -f $(SSH_KEY)
@@ -172,4 +201,4 @@ elastic_agent_docker_image: build_elastic_agent_docker_image
 build_elastic_agent_docker_image:
 	@env BASE_IMAGE=${ELASTIC_AGENT_DOCKER_IMAGE}:${ELASTIC_AGENT_IMAGE_TAG} GOARCH=amd64 \
 		bash ${REPO_ROOT}/testing/docker/elastic-agent/build.sh \
-		     -t ${CI_ELASTIC_AGENT_DOCKER_IMAGE}:${CUSTOM_IMAGE_TAG}
+		     -t ${CI_ELASTIC_AGENT_DOCKER_IMAGE}:${CUSTOM_IMAGE_TAG}
diff --git a/testing/benchmark/README.md b/testing/benchmark/README.md
@@ -89,7 +89,7 @@ overridden automatically, you need to remove it manually if present.
 #### Override docker image tag
 
 It is possible to override the tag of the docker image that is run in the remote ESS deployment. You can
-specify any of the avilable tags (such as `8.3.0-SNAPSHOT` or a more specific tag `8.3.0-c655cda8-SNAPSHOT`).
+specify any of the available tags (such as `8.3.0-SNAPSHOT` or a more specific tag `8.3.0-c655cda8-SNAPSHOT`).
 Alternatively, you can run `make docker-override-committed-version` in your shell, to have use the committed
 tags in the `docker-compose.yml` file in the repository root.
 

diff --git a/testing/benchmark/main.tf b/testing/benchmark/main.tf
@@ -45,7 +45,46 @@ locals {
   name_prefix = "${coalesce(var.user_name, "unknown-user")}-bench"
 }
 
+module "vpc" { # Network shared by the worker and the standalone modules, which receive module.vpc.vpc_id.
+  source  = "terraform-aws-modules/vpc/aws"
+  version = "3.14.0"
+
+  name = "${var.user_name}-worker"
+  cidr = var.vpc_cidr
+
+  azs                = ["${var.worker_region}a"] # A single availability zone: the region name with an "a" suffix.
+  public_subnets     = var.public_cidr
+  enable_ipv6        = false
+  enable_nat_gateway = false
+  single_nat_gateway = false
+
+  manage_default_security_group = true
+  default_security_group_ingress = [
+    {
+      "from_port" : 0,
+      "to_port" : 0,
+      "protocol" : -1,
+      "self" : true,
+      "cidr_blocks" : "0.0.0.0/0", # NOTE(review): with protocol -1 this admits all traffic from any address — confirm that is intended.
+    }
+  ]
+  default_security_group_egress = [
+    {
+      "from_port" : 0,
+      "to_port" : 0,
+      "protocol" : -1,
+      "cidr_blocks" : "0.0.0.0/0",
+    }
+  ]
+
+  tags = merge(local.ci_tags, module.tags.tags)
+  vpc_tags = {
+    Name = "vpc-${var.user_name}-worker"
+  }
+}
+
 module "ec_deployment" {
+  count  = var.run_standalone ? 0 : 1
   source = "../infra/terraform/modules/ec_deployment"
 
   region        = var.ess_region
@@ -73,18 +112,55 @@ module "ec_deployment" {
 
 module "benchmark_worker" {
   source = "../infra/terraform/modules/benchmark_executor"
-  region = var.worker_region
 
+  vpc_id    = module.vpc.vpc_id
+  region    = var.worker_region
   user_name = var.user_name
 
-  apm_server_url   = module.ec_deployment.apm_url
-  apm_secret_token = module.ec_deployment.apm_secret_token
+  apm_server_url   = var.run_standalone ? module.standalone_apm_server[0].apm_server_url : module.ec_deployment[0].apm_url # [0]: both modules use count, and only the one matching run_standalone is created.
+  apm_secret_token = var.run_standalone ? module.standalone_apm_server[0].apm_secret_token : module.ec_deployment[0].apm_secret_token
 
   apmbench_bin_path = var.apmbench_bin_path
   instance_type     = var.worker_instance_type
 
   public_key  = var.public_key
   private_key = var.private_key
 
-  tags = merge(local.ci_tags, module.tags.tags)
+  tags       = merge(local.ci_tags, module.tags.tags)
+  depends_on = [module.standalone_apm_server, module.ec_deployment]
+}
+
+module "moxy" {
+  count  = var.run_standalone ? 1 : 0 # Created only when run_standalone is true.
+  source = "../infra/terraform/modules/moxy"
+
+  vpc_id        = module.vpc.vpc_id
+  instance_type = var.standalone_moxy_instance_size
+  moxy_bin_path = var.moxy_bin_path
+
+  aws_provisioner_key_name = var.private_key # NOTE(review): same variable that benchmark_worker receives as private_key — confirm the module expects this value as a key name.
+
+  tags       = merge(local.ci_tags, module.tags.tags)
+  depends_on = [module.vpc]
+}
+
+
+module "standalone_apm_server" {
+  count  = var.run_standalone ? 1 : 0 # Created only when run_standalone is true, together with moxy.
+  source = "../infra/terraform/modules/standalone_apm_server"
+
+  vpc_id              = module.vpc.vpc_id
+  aws_os              = "amzn2-ami-hvm-*-x86_64-ebs"
+  apm_instance_type   = var.standalone_apm_server_instance_size
+  apm_server_bin_path = var.apm_server_bin_path
+  ea_managed          = false
+
+  aws_provisioner_key_name = var.private_key
+
+  elasticsearch_url      = module.moxy[0].moxy_url # The Elasticsearch endpoint handed to the server is the moxy instance.
+  elasticsearch_username = "elastic"
+  elasticsearch_password = module.moxy[0].moxy_password
+
+  tags       = merge(local.ci_tags, module.tags.tags)
+  depends_on = [module.moxy]
 }