From b7e8c8c59214a76f81b3cc69ed32fb104cca830e Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 25 Aug 2025 19:15:27 +0200 Subject: [PATCH 1/6] Simplify run_single_evaluation This de-duplicates some duplicated code paths. This makes it easier to patch profiling calls into the function later on. --- src/libkernelbot/run_eval.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 73e7e374..8f468e39 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -301,26 +301,24 @@ def run_single_evaluation( A single runner run, either in the context of test files, or in the context of benchmark files. """ - if mode == "test": - with tempfile.NamedTemporaryFile("w") as tests_file: - tests_file.write(tests) - tests_file.flush() - return run_program( - call + [mode, tests_file.name], seed=seed, timeout=test_timeout, multi_gpu=multi_gpu - ) - elif mode in ["benchmark", "profile", "leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout - with tempfile.NamedTemporaryFile("w") as bench_file: + with tempfile.NamedTemporaryFile("w") as cases: + if mode == "test": + timeout = test_timeout + cases.write(tests) + elif mode in ["benchmark", "profile", "leaderboard"]: + timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout if ranking_by == "last": - bench_file.write(benchmarks.splitlines(keepends=True)[-1]) + cases.write(benchmarks.splitlines(keepends=True)[-1]) else: - bench_file.write(benchmarks) - bench_file.flush() - return run_program( - call + [mode, bench_file.name], seed=seed, timeout=timeout, multi_gpu=multi_gpu - ) - else: - raise ValueError(f"Invalid mode {mode}") + cases.write(benchmarks) + else: + raise ValueError(f"Invalid mode {mode}") + + cases.flush() + + call += [mode, cases.name] + + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) def 
make_system_info() -> SystemInfo: From f3dd42e8eba528be1d37c0206fb65e9324982670 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Mon, 25 Aug 2025 19:59:43 +0200 Subject: [PATCH 2/6] add gpu runtime info to SystemInfo This way we can tell whether we are using CUDA or ROCm later on. This also fixes the ROCm fallback path. --- scripts/ci_test_cuda.py | 4 +++- scripts/ci_test_python.py | 3 ++- src/libkernelbot/report.py | 1 + src/libkernelbot/run_eval.py | 33 +++++++++++++++++++++++++-------- tests/test_backend.py | 4 ++++ tests/test_report.py | 12 +++++++++++- 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py index ae1a6cd7..c3fa893c 100644 --- a/scripts/ci_test_cuda.py +++ b/scripts/ci_test_cuda.py @@ -4,7 +4,7 @@ import pytest from libkernelbot.consts import ExitCode, SubmissionMode -from libkernelbot.run_eval import compile_cuda_script, run_cuda_script +from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script ref = Path("examples/identity_cuda/reference.cuh").read_text() task_h = Path("examples/identity_cuda/task.h").read_text() @@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs): headers = header_files eval_result = run_cuda_script( + make_system_info(), sources, headers, arch=arch, @@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path): # can also use generic flags argument result = run_cuda_script( + make_system_info(), {"eval.cu": eval_cu, "submission.cu": sub}, header_files, flags=["-I.", f"-I{tmp_path}"], diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 41ac92bd..7cc4fedd 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -1,7 +1,7 @@ from pathlib import Path from libkernelbot.consts import ExitCode, SubmissionMode -from libkernelbot.run_eval import run_pytorch_script +from libkernelbot.run_eval import make_system_info, run_pytorch_script ref = 
Path("examples/identity_py/reference.py").read_text() task = Path("examples/identity_py/task.py").read_text() @@ -12,6 +12,7 @@ def run_pytorch_helper(sources: dict, tests=None, **kwargs): result = run_pytorch_script( + make_system_info(), sources, "eval.py", mode=SubmissionMode.TEST.value, diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index ec52e7bd..805a17df 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -267,6 +267,7 @@ def generate_system_info(system: SystemInfo): Running on: * GPU: `{system.gpu}` * CPU: `{system.cpu}` +* Runtime: `{system.runtime}` * Platform: `{system.platform}` * Torch: `{system.torch}` """ diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 8f468e39..6f39d78f 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,6 +1,7 @@ import dataclasses import datetime import functools +import json import os import shlex import subprocess @@ -46,6 +47,7 @@ class SystemInfo: gpu: str = '' # Model name of the GPU device_count: int = 1 # Number of GPUs cpu: str = '' # Model name of the CPU + runtime: str = '' # Whether CUDA or ROCm platform: str = '' # Platform string of the machine torch: str = '' # Torch version # fmt: on @@ -285,6 +287,7 @@ def run_program( def run_single_evaluation( + system: SystemInfo, call: list[str], mode: str, *, @@ -321,7 +324,7 @@ def run_single_evaluation( return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) -def make_system_info() -> SystemInfo: +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -332,19 +335,29 @@ def make_system_info() -> SystemInfo: if torch.cuda.is_available(): info.gpu = torch.cuda.get_device_name() info.device_count = torch.cuda.device_count() + if torch.version.hip is not None: + info.runtime = "ROCm" + elif torch.version.cuda is not None: + info.runtime = "CUDA" except ImportError: # get GPU info manually try: info.gpu = 
subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) + info.device_count = info.gpu.count('\n') + info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP - # TODO suggested by Claude, untested try: - info.gpu = subprocess.check_output( - ["rocm-smi", "--showproductname"], encoding="utf-8" - ) + rocm_info = json.loads(subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + )) + if len(rocm_info) > 0: + info.gpu = next(rocm_info.__iter__())["Card Series"] + + info.device_count = len(rocm_info) + info.runtime = "ROCm" except subprocess.CalledProcessError: # OK, no GPU info available pass @@ -373,6 +386,7 @@ def make_system_info() -> SystemInfo: def run_cuda_script( # # noqa: C901 + system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -432,7 +446,7 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result = run_single_evaluation(["./eval.out"], **kwargs) + run_result = run_single_evaluation(system, ["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), @@ -442,6 +456,7 @@ def run_cuda_script( # # noqa: C901 def run_pytorch_script( # noqa: C901 + system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -493,7 +508,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run = run_single_evaluation(["python", main], **kwargs) + run = run_single_evaluation(system, ["python", main], **kwargs) return EvalResult( start=start, @@ -556,7 +571,9 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + system = make_system_info() common_args = { + "system": system, "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), "seed": config.get("seed", None), @@ -589,4 +606,4 @@ def run_config(config: dict): raise ValueError(f"Invalid language 
{config['lang']}") results = run_evaluation(runner, config["mode"]) - return FullResult(success=True, error="", runs=results, system=make_system_info()) + return FullResult(success=True, error="", runs=results, system=system) diff --git a/tests/test_backend.py b/tests/test_backend.py index 585674cd..94cc2795 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -77,6 +77,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -210,6 +211,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, } @@ -315,6 +317,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, @@ -357,6 +360,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, diff --git a/tests/test_report.py b/tests/test_report.py index e9efc41c..ab62d8e6 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -22,7 +22,11 @@ # define helpers and fixtures that create mock results def sample_system_info() -> SystemInfo: return SystemInfo( - gpu="NVIDIA RTX 4090", cpu="Intel i9-12900K", platform="Linux-5.15.0", torch="2.0.1+cu118" + gpu="NVIDIA RTX 4090", + cpu="Intel i9-12900K", + runtime="CUDA", + platform="Linux-5.15.0", + torch="2.0.1+cu118", ) @@ -491,6 +495,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* 
Torch: `2.0.1+cu118`\n" ), @@ -522,6 +527,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -556,6 +562,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -591,6 +598,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -616,6 +624,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -653,6 +662,7 @@ def test_generate_report_profile(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), From c4c6b4197b99db2086522fb617ddcbfc114ea3a2 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Thu, 28 Aug 2025 22:45:13 +0200 Subject: [PATCH 3/6] add 'link' report result type This will be used to communicate external download links such as profiling results. 
--- src/kernelbot/discord_reporter.py | 6 ++++++ src/libkernelbot/report.py | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py index f49e5225..3b6fd8c3 100644 --- a/src/kernelbot/discord_reporter.py +++ b/src/kernelbot/discord_reporter.py @@ -2,6 +2,7 @@ from discord_utils import _send_split_log from libkernelbot.report import ( + Link, Log, MultiProgressReporter, RunProgressReporter, @@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport): message += part.text elif isinstance(part, Log): message = await _send_split_log(thread, message, part.header, part.content) + elif isinstance(part, Link): + if len(message) > 0: + await thread.send(message) + message = "" + await thread.send(f"{part.title}: [{part.text}]({part.url})") if len(message) > 0: await thread.send(message) diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 805a17df..1ad52aa0 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -32,9 +32,20 @@ class Log: content: str +@dataclasses.dataclass +class Link: + """ + Link represents a link in the profiling report, to result data + which can be downloaded by clicking it. 
+ """ + title: str + text: str + url: str + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log] = data or [] + self.data: List[Text | Log | Link] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -42,6 +53,9 @@ def add_text(self, section: str): def add_log(self, header: str, log: str): self.data.append(Log(header, log)) + def add_link(self, title: str, text: str, url: str): + self.data.append(Link(title, text, url)) + def __repr__(self): return f"RunResultReport(data={self.data})" From 9474255b5bc9c12ee860ef0d68b93c7b425ca47d Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Thu, 28 Aug 2025 22:50:13 +0200 Subject: [PATCH 4/6] add profiling data infrastructure A new ProfileResult type is added to run_eval, which is is returned in the EvalResult type. Among other fields, this contains the `download_url` field which should be used by the user to download profiling data. Note that the actual public download link may not be known in run_eval.py. In this case, it is the intention that the launcher fixes up the `download_url` before returning the results back to libkernelbot. 
--- src/libkernelbot/launchers/github.py | 24 ++++++++++++++++-------- src/libkernelbot/report.py | 7 +++++++ src/libkernelbot/run_eval.py | 22 ++++++++++++++++++---- tests/test_report.py | 18 ++++++++++++++++-- 4 files changed, 57 insertions(+), 14 deletions(-) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index be968429..1ba1108d 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -23,7 +23,14 @@ SubmissionMode, ) from libkernelbot.report import RunProgressReporter -from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) from libkernelbot.utils import setup_logging from .launcher import Launcher @@ -123,17 +130,18 @@ async def run_submission( runs = {} # convert json back to EvalResult structures, which requires # special handling for datetime and our dataclasses. 
+ for k, v in data["runs"].items(): - if "compilation" in v and v["compilation"] is not None: - comp = CompileResult(**v["compilation"]) - else: - comp = None - run = RunResult(**v["run"]) + comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"]) + run_res = None if v.get("run") is None else RunResult(**v["run"]) + profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + res = EvalResult( start=datetime.datetime.fromisoformat(v["start"]), end=datetime.datetime.fromisoformat(v["end"]), - compilation=comp, - run=run, + compilation=comp_res, + run=run_res, + profile=profile_res, ) runs[k] = res diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 1ad52aa0..25bb27cb 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -337,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_profile_log(prof_run.run), ) + if prof_run.profile is not None and prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + if "leaderboard" in runs: bench_run = runs["leaderboard"] if _handle_crash_report(report, bench_run): diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 6f39d78f..e8722ba7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -14,6 +14,16 @@ from libkernelbot.consts import CUDA_FLAGS, ExitCode, Timeout +@dataclasses.dataclass +class ProfileResult: + # fmt: off + profiler: str # The profiler used to gather this data + # Public download URL of all files created by the profiler + # This may also be configured later + download_url: Optional[str] + #fmt: on + + @dataclasses.dataclass class CompileResult: # fmt: off @@ -60,6 +70,7 @@ class EvalResult: end: datetime.datetime # and when did it finish compilation: CompileResult | None # results of compilation run: RunResult | 
None # result of actually running the executable/script + profile: ProfileResult | None # result of profiling the executable # fmt: on @@ -299,7 +310,7 @@ def run_single_evaluation( ranked_timeout: int = Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, -) -> RunResult: +) -> tuple[RunResult, Optional[ProfileResult]]: """ A single runner run, either in the context of test files, or in the context of benchmark files. @@ -321,7 +332,7 @@ def run_single_evaluation( call += [mode, cases.name] - return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None def make_system_info() -> SystemInfo: # noqa: C901 @@ -436,6 +447,7 @@ def run_cuda_script( # # noqa: C901 end=datetime.datetime.now(), compilation=compile_result, run=None, + profile=None, ) # cleaning up all source files _before_ we let the user code run, just in @@ -446,12 +458,13 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result = run_single_evaluation(system, ["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=compile_result, run=run_result, + profile=profile_result, ) @@ -508,13 +521,14 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run = run_single_evaluation(system, ["python", main], **kwargs) + run, profile = run_single_evaluation(system, ["python", main], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=comp, run=run, + profile=profile, ) finally: for f in sources.keys(): diff --git a/tests/test_report.py b/tests/test_report.py index ab62d8e6..a1964e62 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -16,7 +16,14 @@ make_short_report, make_test_log, ) -from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo +from 
libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
 
 
 # define helpers and fixtures that create mock results
@@ -86,6 +93,7 @@ def create_eval_result(mode="test") -> EvalResult:
         end=datetime.datetime.now(),
         compilation=sample_compile_result(),
         run=sample_run_result(mode),
+        profile=None,
     )
 
 
@@ -298,6 +306,7 @@ def test_make_short_report_full_success():
             stderr="",
             result={},
         ),
+        profile=None,
     )
 
     result = make_short_report(runs, full=True)
@@ -653,8 +662,12 @@ def test_generate_report_profile(sample_full_result: FullResult):
         "benchmark.0.spec": "Benchmark",
         "benchmark.0.report": base64.b64encode(b"Profile report", b"+*").decode("utf-8"),
     }
+    sample_full_result.runs["profile"].profile = ProfileResult(
+        profiler="NSight",
+        download_url="https://example.com",
+    )
     report = generate_report(sample_full_result)
-    from libkernelbot.report import Log, Text
+    from libkernelbot.report import Link, Log, Text
 
     assert report.data == [
         Text(
@@ -675,6 +688,7 @@ def test_generate_report_profile(sample_full_result: FullResult):
             "> Division by zero",
         ),
         Log(header="Profiling", content="Benchmark\n\n Profile report\n"),
+        Link("NSight profiling output", "Download from GitHub", "https://example.com"),
     ]
 

From a6fe7c5ac6b89e1748151d5d362949db3edae67d Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Thu, 28 Aug 2025 23:03:43 +0200
Subject: [PATCH 5/6] github launcher: separate artifact downloading from indexing

The new function `GitHubRun.get_artifact_index` returns a dict of artifacts
available from the run. For each artifact, the GitHub API URL and public
download URL are returned. The latter is not available directly from the
GitHub API, however, it can be easily constructed from the data that is
available in the workflow result.

`download_artifacts` is replaced by a function which downloads a specific
artifact rather than all of them.
Additionally, the function no longer writes to a temp file when downloading the artifact; the results of the download request can be piped directly into zipfile using BytesIO. --- src/libkernelbot/launchers/github.py | 79 +++++++++++++++++----------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 1ba1108d..41c603ab 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -1,10 +1,11 @@ import asyncio import base64 +import dataclasses import datetime +import io import json import math import pprint -import tempfile import uuid import zipfile import zlib @@ -56,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str): self.token = token self.branch = branch - async def run_submission( + async def run_submission( # noqa: C901 self, config: dict, gpu_type: GPU, status: RunProgressReporter ) -> FullResult: gpu_vendor = None @@ -113,15 +114,17 @@ async def run_submission( await status.push("Downloading artifacts...") logger.info("Downloading artifacts...") - artifacts = await run.download_artifacts() - if "run-result" not in artifacts: - logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys()) + index = run.get_artifact_index() + + if "run-result" not in index: + logger.error("Could not find `run-result` among artifacts: %s", index.keys()) await status.push("Downloading artifacts... failed") return FullResult( success=False, error="Could not download artifacts", runs={}, system=SystemInfo() ) - logs = artifacts["run-result"]["result.json"].decode("utf-8") + artifact = await run.download_artifact(index["run-result"]) + logs = artifact["result.json"].decode("utf-8") await status.update("Downloading artifacts... done") logger.info("Downloading artifacts... 
done") @@ -155,6 +158,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter): ) +@dataclasses.dataclass +class GitHubArtifact: + name: str + archive_download_url: str + public_download_url: str + + class GitHubRun: def __init__(self, repo: str, token: str, branch: str, workflow_file: str): gh = Github(token) @@ -331,34 +341,43 @@ async def wait_for_completion( logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e) raise # Re-raise other exceptions - async def download_artifacts(self) -> dict: - logger.info("Attempting to download artifacts for run %s", self.run_id) + + def get_artifact_index(self) -> dict[str, GitHubArtifact]: + logger.info("Creating artifact index for run %s", self.run_id) artifacts = self.run.get_artifacts() extracted = {} for artifact in artifacts: - url = artifact.archive_download_url - headers = {"Authorization": f"token {self.token}"} - response = requests.get(url, headers=headers) - - if response.status_code == 200: - with tempfile.NamedTemporaryFile("w+b") as temp: - temp.write(response.content) - temp.flush() - - with zipfile.ZipFile(temp.name) as z: - artifact_dict = {} - for file in z.namelist(): - with z.open(file) as f: - artifact_dict[file] = f.read() - - extracted[artifact.name] = artifact_dict - else: - raise RuntimeError( - f"Failed to download artifact {artifact.name}. " - f"Status code: {response.status_code}" - ) + extracted[artifact.name] = GitHubArtifact( + name=artifact.name, + archive_download_url=artifact.archive_download_url, + # Non-machine users cannot download from the archive_download_url and + # the GitHub API does not give us access to the public download url. 
+ public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}", + ) - logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys())) return extracted + + + async def download_artifact(self, artifact: GitHubArtifact) -> dict: + logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id) + + url = artifact.archive_download_url + headers = {"Authorization": f"token {self.token}"} + response = requests.get(url, headers=headers) + + if response.status_code == 200: + artifact_dict = {} + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + for file in z.namelist(): + with z.open(file) as f: + artifact_dict[file] = f.read() + + logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id) + return artifact_dict + else: + raise RuntimeError( + f"Failed to download artifact {artifact.name}. " + f"Status code: {response.status_code}" + ) From b44da6434e9da6e6d7d56f3e9ed3fbc1cda8e94e Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Thu, 28 Aug 2025 23:12:11 +0200 Subject: [PATCH 6/6] github runner: yield 'profile_data/*' from job as profile data The idea is that eval_run.py places profiling data in the profile_data/ directory, which is then automatically exported to the user. This is done by uploading that directory as the 'profile-data' artifact, then fetching its public download link and returning that as the ProfileResult.download_url. 
--- .github/workflows/amd_workflow.yml | 17 ++++++++++++----- .github/workflows/nvidia_workflow.yml | 15 ++++++++++----- src/libkernelbot/launchers/github.py | 6 ++++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml index 8a0014ef..39dc66c5 100644 --- a/.github/workflows/amd_workflow.yml +++ b/.github/workflows/amd_workflow.yml @@ -35,13 +35,13 @@ jobs: run: | # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) - + # Apply mask to the extracted content echo "::add-mask::$PAYLOAD" - + # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - + - name: Set venv directory based on runner run: | if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then @@ -77,5 +77,12 @@ jobs: if: always() with: name: run-result - path: | - result.json + path: result.json + + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: profile-data + path: profile_data/* + retention-days: 1 diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index e16cf4d5..b50ec044 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -42,10 +42,10 @@ jobs: # Extract the payload content without printing it apt-get update && apt-get install -y jq PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) - + # Apply mask to the extracted content echo "::add-mask::$PAYLOAD" - + # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json @@ -73,15 +73,20 @@ jobs: shell: bash run: | python src/runners/github-runner.py - cat result.json # Debug: show output - name: Upload training artifacts uses: actions/upload-artifact@v4 if: always() with: name: run-result - path: | - result.json + path: result.json + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + if: 
always() + with: + name: profile-data + path: profile_data/* + retention-days: 1 env: CUDA_VISIBLE_DEVICES: 0 diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 41c603ab..c748e0fe 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -139,6 +139,12 @@ async def run_submission( # noqa: C901 run_res = None if v.get("run") is None else RunResult(**v["run"]) profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"]) + # Update profile artifact to the actual download URL. + # For the GitHub launcher the profile_artifact currently just contains + # the name of the artifact. + if profile_res is not None: + profile_res.download_url = index["profile-data"].public_download_url + res = EvalResult( start=datetime.datetime.fromisoformat(v["start"]), end=datetime.datetime.fromisoformat(v["end"]),