Merge pull request #235 from oqc-community/feature/lc/benchmarking_workflow

Performance regression checks: workflow
Showing 6 changed files with 277 additions and 30 deletions.
@@ -0,0 +1,124 @@
```yaml
name: Performance regression tests

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: write
  actions: write
  pull-requests: write

run-name: Performance regression tests from ${{ github.ref }}

jobs:
  benchmarking:
    name: Benchmarking
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ ubuntu-latest]
        python: ["3.10"]
    defaults:
      run:
        shell: bash

    steps:
      - name: Set up Python 3.x
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true

      - name: Load cached venv
        id: cached-pip-wheels
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

      - uses: actions/checkout@v4
        with:
          ref: main

      - name: Install dependencies
        run: poetry install --sync

      - name: Benchmarking on main
        run: poetry run pytest benchmarks/run.py --benchmark-only --benchmark-save="benchmark"

      - uses: actions/checkout@v4
        with:
          clean: false

      - name: Install dependencies
        run:
          poetry install --sync

      - name: Benchmarking on branch
        run: poetry run pytest benchmarks/run.py --benchmark-only --benchmark-save="benchmark" --benchmark-compare --benchmark-compare-fail=min:50%

      - name: Generate report
        id: generate-report
        if: always()
        shell: bash
        run: |
          poetry run python benchmarks/generate_report.py "benchmark" ".benchmarks/summary.md"
          SUMMARY=$(cat .benchmarks/summary.md)
          echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY
          {
            echo 'markdown-summary<<EOF'
            echo "$SUMMARY"
            echo EOF
          } >> "$GITHUB_OUTPUT"
      - name: Update PR
        if: github.event_name == 'pull_request' && always()
        uses: actions/github-script@v7
        env:
          SUMMARY: ${{ steps.generate-report.outputs.markdown-summary }}
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const maxGitHubBodyCharacters = 65536;
            const output = process.env.SUMMARY + `
            *Pusher: @${{ github.actor }}, Action: \`${{ github.event_name }}\`, Workflow: \`${{ github.workflow }}\`*`;
            const {data: comments} = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.payload.number,
            })
            const botComment = comments.find(
              comment => comment.user.id === 41898282 &&
              comment.body.includes("Performance Regression Tests")
            )
            if (context.payload.pull_request.head.repo.full_name !== 'oqc-community/qat') {
              console.log('Not attempting to write comment on fork.');
            } else {
              if (botComment) {
                await github.rest.issues.updateComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  comment_id: botComment.id,
                  body: output
                })
              } else {
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  issue_number: context.payload.number,
                  body: output
                })
              }
            }
```
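The branch run above relies on pytest-benchmark's `--benchmark-compare-fail=min:50%` flag, which (per the plugin's documented behaviour) fails the run when a test's minimum time regresses by more than 50% against the saved run from main. A rough sketch of that criterion with made-up numbers (the timings below are illustrative, not taken from this PR):

```python
# Illustrative only: how a "min:50%" comparison plays out for a single benchmark.
main_min = 0.010    # best (minimum) time on main, in seconds -- hypothetical
branch_min = 0.016  # best (minimum) time on the PR branch -- hypothetical

regression_pct = (branch_min - main_min) / main_min * 100
print(f"min regressed by {regression_pct:.0f}%")  # -> 60%

# 60% > 50%, so a run with --benchmark-compare-fail=min:50% would fail here.
assert regression_pct > 50
```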
benchmarks/generate_report.py
@@ -0,0 +1,142 @@
```python
import argparse
import json
import os
from pathlib import Path

import numpy as np
from jinja2 import Environment, FileSystemLoader, select_autoescape


def get_directory(dir):
    """
    Determine the save directory for the benchmarks, which depends on the environment.
""" | ||
subdir = [name for name in os.listdir(dir) if os.path.isdir(dir + name)] | ||
return dir + subdir[0] | ||
|
||
|
||
def round_sf(x, sf=4): | ||
return np.round(x, -int(np.floor(np.log10(abs(x)))) + sf - 1) | ||
|
||
|
||
def compare_tests( | ||
warn_threshold=1.2, | ||
fail_threshold=1.5, | ||
improve_threshold=0.9, | ||
benchmark_name="benchmark", | ||
return_successes=False, | ||
return_improvements=True, | ||
dir=".benchmarks/", | ||
): | ||
""" | ||
    Generate a dictionary of tests that contains the key information for the report.
""" | ||
# load in the two benchmarks | ||
dir = get_directory(dir) | ||
with open(f"{dir}/0001_{benchmark_name}.json", "r") as f: | ||
benchmark_before = json.load(f)["benchmarks"] | ||
with open(f"{dir}/0002_{benchmark_name}.json", "r") as f: | ||
benchmark_after = json.load(f)["benchmarks"] | ||
|
||
# compare the benchmarks | ||
benchmarks = {} | ||
for bm_after in benchmark_after: | ||
name = bm_after["name"] | ||
data = { | ||
"min_after": round_sf(bm_after["stats"]["min"]), | ||
"min_before": "-", | ||
"rel_diff": "-", | ||
"outcome": "success", | ||
} | ||
for bm_before in benchmark_before: | ||
if bm_before["name"] == name: | ||
data["min_before"] = round_sf(bm_before["stats"]["min"]) | ||
data["rel_diff"] = round_sf( | ||
bm_after["stats"]["min"] / bm_before["stats"]["min"] | ||
) | ||
if data["rel_diff"] > fail_threshold: | ||
data["outcome"] = "fail" | ||
elif data["rel_diff"] > warn_threshold: | ||
data["outcome"] = "warning" | ||
elif data["rel_diff"] < improve_threshold: | ||
data["outcome"] = "improvement" | ||
break | ||
|
||
# if not warn/fail, decide if it should be added | ||
if not (data["outcome"] == "success" and return_successes == False) and not ( | ||
data["outcome"] == "improvement" and return_improvements == False | ||
): | ||
benchmarks[name] = data | ||
return benchmarks | ||
|
||
|
||
def create_report( | ||
benchmarks, | ||
input_path="benchmarks/report_template.md", | ||
output_path=".benchmarks/summary.md", | ||
): | ||
""" | ||
Create the report using the template | ||
""" | ||
env = Environment(loader=FileSystemLoader("."), autoescape=select_autoescape()) | ||
template = env.get_template(input_path) | ||
summary = template.render(tests=benchmarks) | ||
Path(output_path).write_text(summary) | ||
|
||
|
||
def get_args(): | ||
""" | ||
    Allow arguments to be passed to the program.
""" | ||
|
||
parser = argparse.ArgumentParser( | ||
prog="Performance regression report", | ||
description="Generates a report that compares performance between two benchmarks.", | ||
) | ||
parser.add_argument("benchmark_name") | ||
parser.add_argument("output_path") | ||
parser.add_argument( | ||
"--template", | ||
default="benchmarks/report_template.md", | ||
type=str, | ||
help="Path to the template file", | ||
) | ||
    parser.add_argument(
        "--warn_threshold", default=1.2, type=float, help="Slow-down needed to warn"
    )
    parser.add_argument(
        "--fail_threshold", default=1.5, type=float, help="Slow-down needed to fail"
    )
    parser.add_argument(
        "--improvement",
        default=True,
        type=bool,
        help="Should performance improvements be shown",
    )
    parser.add_argument(
        "--improvement_threshold",
        default=0.9,
        type=float,
        help="The speed-up needed to notify the improvement",
    )
    parser.add_argument(
        "--success",
        default=False,
        type=bool,
        help="Should successes be shown (tests that do not give a performance regression)",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    report = compare_tests(
        args.warn_threshold,
        args.fail_threshold,
        args.improvement_threshold,
        args.benchmark_name,
        args.success,
        args.improvement,
    )
    create_report(report, input_path=args.template, output_path=args.output_path)
```
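Outside of CI, the same comparison can be driven directly from these functions once two saved runs exist. A minimal sketch, assuming it is run from the repository root (so `FileSystemLoader(".")` can find the template), that `benchmarks` is importable as a module path, and that `0001_benchmark.json` and `0002_benchmark.json` already exist under `.benchmarks/<environment>/`; the thresholds here are illustrative, not the workflow's:

```python
# Hypothetical local usage of compare_tests/create_report (import path assumed).
from benchmarks.generate_report import compare_tests, create_report

# Only flag regressions worse than 2x as failures, and keep improvements visible.
tests = compare_tests(
    warn_threshold=1.5,
    fail_threshold=2.0,
    improve_threshold=0.9,
    benchmark_name="benchmark",
    return_successes=False,
    return_improvements=True,
)
create_report(
    tests,
    input_path="benchmarks/report_template.md",
    output_path=".benchmarks/summary.md",
)
```

The workflow itself uses the command-line entry point instead: `poetry run python benchmarks/generate_report.py "benchmark" ".benchmarks/summary.md"`.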
benchmarks/report_template.md
@@ -0,0 +1,8 @@
```jinja
#### Performance Regression Tests
Performance changes detected in the following benchmarks: {% if not tests %} none {% else %}
| Test | Main Exec Time (s) | PR Exec Time (s) | Slow-down | Status |
| ------- | ------ | ------ | ------ | ------ |
{% for name, test in tests.items() -%}
| {{name}} | {{ test.min_before }} | {{test.min_after}} | {{test.rel_diff}}x | {% if test.outcome == "success" %} :white_check_mark: {% elif test.outcome == "warning" %} :warning: {% elif test.outcome == "improvement" %} :rocket: {% else %} :x: {% endif %} |
{% endfor %}
{% endif %}
```
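To preview the generated comment locally, the template can be rendered with a hand-built `tests` dictionary shaped like the output of `compare_tests`. A minimal sketch (the benchmark name and timings are invented for illustration; run from the repository root so the template path resolves):

```python
# Render the report template standalone with illustrative data.
from jinja2 import Environment, FileSystemLoader, select_autoescape

env = Environment(loader=FileSystemLoader("."), autoescape=select_autoescape())
template = env.get_template("benchmarks/report_template.md")

tests = {
    "test_execute[2q_circuit]": {  # hypothetical benchmark name
        "min_before": 0.1013,
        "min_after": 0.1621,
        "rel_diff": 1.6,
        "outcome": "fail",
    },
}
print(template.render(tests=tests))
```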
Some generated files are not rendered by default.