diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml
index 8a0014ef..39dc66c5 100644
--- a/.github/workflows/amd_workflow.yml
+++ b/.github/workflows/amd_workflow.yml
@@ -35,13 +35,13 @@ jobs:
         run: |
           # Extract the payload content without printing it
          PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
          # Apply mask to the extracted content
          echo "::add-mask::$PAYLOAD"
-
+
          # Now write to file (won't be logged since it's masked)
          echo "$PAYLOAD" > payload.json
-
+
      - name: Set venv directory based on runner
        run: |
          if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
@@ -77,5 +77,12 @@ jobs:
        if: always()
        with:
          name: run-result
-          path: |
-            result.json
+          path: result.json
+
+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index e16cf4d5..b50ec044 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -42,10 +42,10 @@ jobs:
          # Extract the payload content without printing it
          apt-get update && apt-get install -y jq
          PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
          # Apply mask to the extracted content
          echo "::add-mask::$PAYLOAD"
-
+
          # Now write to file (won't be logged since it's masked)
          echo "$PAYLOAD" > payload.json

@@ -73,15 +73,20 @@ jobs:
        shell: bash
        run: |
          python src/runners/github-runner.py
-          cat result.json # Debug: show output

      - name: Upload training artifacts
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: run-result
-          path: |
-            result.json
+          path: result.json

+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
    env:
      CUDA_VISIBLE_DEVICES: 0
diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
index ae1a6cd7..c3fa893c 100644
--- a/scripts/ci_test_cuda.py
+++ b/scripts/ci_test_cuda.py
@@ -4,7 +4,7 @@
 import pytest
 
 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import compile_cuda_script, run_cuda_script
+from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script
 
 ref = Path("examples/identity_cuda/reference.cuh").read_text()
 task_h = Path("examples/identity_cuda/task.h").read_text()
@@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files
 
     eval_result = run_cuda_script(
+        make_system_info(),
         sources,
         headers,
         arch=arch,
@@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path):
 
     # can also use generic flags argument
     result = run_cuda_script(
+        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
index 41ac92bd..7cc4fedd 100644
--- a/scripts/ci_test_python.py
+++ b/scripts/ci_test_python.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 
 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import run_pytorch_script
+from libkernelbot.run_eval import make_system_info, run_pytorch_script
 
 ref = Path("examples/identity_py/reference.py").read_text()
 task = Path("examples/identity_py/task.py").read_text()
@@ -12,6 +12,7 @@
 
 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
+        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,
diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py
index f49e5225..3b6fd8c3 100644
--- a/src/kernelbot/discord_reporter.py
+++ b/src/kernelbot/discord_reporter.py
@@ -2,6 +2,7 @@
 from discord_utils import _send_split_log
 
 from libkernelbot.report import (
+    Link,
     Log,
     MultiProgressReporter,
     RunProgressReporter,
@@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, Link):
+                if len(message) > 0:
+                    await thread.send(message)
+                    message = ""
+                await thread.send(f"{part.title}: [{part.text}]({part.url})")
 
         if len(message) > 0:
             await thread.send(message)
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index be968429..c748e0fe 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -1,10 +1,11 @@
 import asyncio
 import base64
+import dataclasses
 import datetime
+import io
 import json
 import math
 import pprint
-import tempfile
 import uuid
 import zipfile
 import zlib
@@ -23,7 +24,14 @@
     SubmissionMode,
 )
 from libkernelbot.report import RunProgressReporter
-from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
+from libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
 from libkernelbot.utils import setup_logging
 
 from .launcher import Launcher
@@ -49,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str):
         self.token = token
         self.branch = branch
 
-    async def run_submission(
+    async def run_submission(  # noqa: C901
         self, config: dict, gpu_type: GPU, status: RunProgressReporter
     ) -> FullResult:
         gpu_vendor = None
@@ -106,15 +114,17 @@ async def run_submission(
 
         await status.push("Downloading artifacts...")
         logger.info("Downloading artifacts...")
-        artifacts = await run.download_artifacts()
-        if "run-result" not in artifacts:
-            logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys())
+        index = run.get_artifact_index()
+
+        if "run-result" not in index:
+            logger.error("Could not find `run-result` among artifacts: %s", index.keys())
             await status.push("Downloading artifacts... failed")
             return FullResult(
                 success=False, error="Could not download artifacts", runs={}, system=SystemInfo()
             )
 
-        logs = artifacts["run-result"]["result.json"].decode("utf-8")
+        artifact = await run.download_artifact(index["run-result"])
+        logs = artifact["result.json"].decode("utf-8")
 
         await status.update("Downloading artifacts... done")
         logger.info("Downloading artifacts... done")
@@ -123,17 +133,24 @@ async def run_submission(
         runs = {}
         # convert json back to EvalResult structures, which requires
         # special handling for datetime and our dataclasses.
+
         for k, v in data["runs"].items():
-            if "compilation" in v and v["compilation"] is not None:
-                comp = CompileResult(**v["compilation"])
-            else:
-                comp = None
-            run = RunResult(**v["run"])
+            comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"])
+            run_res = None if v.get("run") is None else RunResult(**v["run"])
+            profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"])
+
+            # Update profile artifact to the actual download URL.
+            # For the GitHub launcher the profile_artifact currently just contains
+            # the name of the artifact.
+            if profile_res is not None:
+                profile_res.download_url = index["profile-data"].public_download_url
+
             res = EvalResult(
                 start=datetime.datetime.fromisoformat(v["start"]),
                 end=datetime.datetime.fromisoformat(v["end"]),
-                compilation=comp,
-                run=run,
+                compilation=comp_res,
+                run=run_res,
+                profile=profile_res,
             )
             runs[k] = res
@@ -147,6 +164,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter):
     )
 
 
+@dataclasses.dataclass
+class GitHubArtifact:
+    name: str
+    archive_download_url: str
+    public_download_url: str
+
+
 class GitHubRun:
     def __init__(self, repo: str, token: str, branch: str, workflow_file: str):
         gh = Github(token)
@@ -323,34 +347,43 @@ async def wait_for_completion(
                 logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
                 raise  # Re-raise other exceptions
 
-    async def download_artifacts(self) -> dict:
-        logger.info("Attempting to download artifacts for run %s", self.run_id)
+
+    def get_artifact_index(self) -> dict[str, GitHubArtifact]:
+        logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()
 
         extracted = {}
         for artifact in artifacts:
-            url = artifact.archive_download_url
-            headers = {"Authorization": f"token {self.token}"}
-            response = requests.get(url, headers=headers)
-
-            if response.status_code == 200:
-                with tempfile.NamedTemporaryFile("w+b") as temp:
-                    temp.write(response.content)
-                    temp.flush()
-
-                    with zipfile.ZipFile(temp.name) as z:
-                        artifact_dict = {}
-                        for file in z.namelist():
-                            with z.open(file) as f:
-                                artifact_dict[file] = f.read()
-
-                extracted[artifact.name] = artifact_dict
-            else:
-                raise RuntimeError(
-                    f"Failed to download artifact {artifact.name}. "
-                    f"Status code: {response.status_code}"
-                )
+            extracted[artifact.name] = GitHubArtifact(
+                name=artifact.name,
+                archive_download_url=artifact.archive_download_url,
+                # Non-machine users cannot download from the archive_download_url, and
+                # the GitHub API does not give us access to the public download URL.
+                public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}",
+            )
 
-        logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys()))
         return extracted
+
+    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
+        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
+
+        url = artifact.archive_download_url
+        headers = {"Authorization": f"token {self.token}"}
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 200:
+            artifact_dict = {}
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                for file in z.namelist():
+                    with z.open(file) as f:
+                        artifact_dict[file] = f.read()
+
+            logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id)
+            return artifact_dict
+        else:
+            raise RuntimeError(
+                f"Failed to download artifact {artifact.name}. "
+                f"Status code: {response.status_code}"
+            )
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index ec52e7bd..25bb27cb 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -32,9 +32,20 @@ class Log:
     content: str
 
 
+@dataclasses.dataclass
+class Link:
+    """
+    Link represents a hyperlink in a report, pointing to result data
+    that can be downloaded by clicking it.
+ """ + title: str + text: str + url: str + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log] = data or [] + self.data: List[Text | Log | Link] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -42,6 +53,9 @@ def add_text(self, section: str): def add_log(self, header: str, log: str): self.data.append(Log(header, log)) + def add_link(self, title: str, text: str, url: str): + self.data.append(Link(title, text, url)) + def __repr__(self): return f"RunResultReport(data={self.data})" @@ -267,6 +281,7 @@ def generate_system_info(system: SystemInfo): Running on: * GPU: `{system.gpu}` * CPU: `{system.cpu}` +* Runtime: `{system.runtime}` * Platform: `{system.platform}` * Torch: `{system.torch}` """ @@ -322,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_profile_log(prof_run.run), ) + if prof_run.profile is not None and prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + if "leaderboard" in runs: bench_run = runs["leaderboard"] if _handle_crash_report(report, bench_run): diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 73e7e374..e8722ba7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,6 +1,7 @@ import dataclasses import datetime import functools +import json import os import shlex import subprocess @@ -13,6 +14,16 @@ from libkernelbot.consts import CUDA_FLAGS, ExitCode, Timeout +@dataclasses.dataclass +class ProfileResult: + # fmt: off + profiler: str # The profiler used to gather this data + # Public download URL of all files created by the profiler + # This may also be configured later + download_url: Optional[str] + #fmt: on + + @dataclasses.dataclass class CompileResult: # fmt: off @@ -46,6 +57,7 @@ class SystemInfo: gpu: str = '' # Model name of the GPU device_count: int = 1 # Number of GPUs cpu: str = '' # Model name of the CPU + runtime: str = '' # Whether CUDA or ROCm platform: str = '' # Platform string of the machine torch: str = '' # Torch version # fmt: on @@ -58,6 +70,7 @@ class EvalResult: end: datetime.datetime # and when did it finish compilation: CompileResult | None # results of compilation run: RunResult | None # result of actually running the executable/script + profile: ProfileResult | None # result of profiling the executable # fmt: on @@ -285,6 +298,7 @@ def run_program( def run_single_evaluation( + system: SystemInfo, call: list[str], mode: str, *, @@ -296,34 +310,32 @@ def run_single_evaluation( ranked_timeout: int = Timeout.RANKED, ranking_by: str = "last", seed: Optional[int] = None, -) -> RunResult: +) -> tuple[RunResult, Optional[ProfileResult]]: """ A single runner run, either in the context of test files, or in the context of benchmark files. 
""" - if mode == "test": - with tempfile.NamedTemporaryFile("w") as tests_file: - tests_file.write(tests) - tests_file.flush() - return run_program( - call + [mode, tests_file.name], seed=seed, timeout=test_timeout, multi_gpu=multi_gpu - ) - elif mode in ["benchmark", "profile", "leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout - with tempfile.NamedTemporaryFile("w") as bench_file: + with tempfile.NamedTemporaryFile("w") as cases: + if mode == "test": + timeout = test_timeout + cases.write(tests) + elif mode in ["benchmark", "profile", "leaderboard"]: + timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout if ranking_by == "last": - bench_file.write(benchmarks.splitlines(keepends=True)[-1]) + cases.write(benchmarks.splitlines(keepends=True)[-1]) else: - bench_file.write(benchmarks) - bench_file.flush() - return run_program( - call + [mode, bench_file.name], seed=seed, timeout=timeout, multi_gpu=multi_gpu - ) - else: - raise ValueError(f"Invalid mode {mode}") + cases.write(benchmarks) + else: + raise ValueError(f"Invalid mode {mode}") + + cases.flush() + call += [mode, cases.name] -def make_system_info() -> SystemInfo: + return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + + +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -334,19 +346,29 @@ def make_system_info() -> SystemInfo: if torch.cuda.is_available(): info.gpu = torch.cuda.get_device_name() info.device_count = torch.cuda.device_count() + if torch.version.hip is not None: + info.runtime = "ROCm" + elif torch.version.cuda is not None: + info.runtime = "CUDA" except ImportError: # get GPU info manually try: info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) + info.device_count = info.gpu.count('\n') + info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP - # TODO suggested by Claude, untested try: - info.gpu = subprocess.check_output( - ["rocm-smi", "--showproductname"], encoding="utf-8" - ) + rocm_info = json.loads(subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + )) + if len(rocm_info) > 0: + info.gpu = next(rocm_info.__iter__())["Card Series"] + + info.device_count = len(rocm_info) + info.runtime = "ROCm" except subprocess.CalledProcessError: # OK, no GPU info available pass @@ -375,6 +397,7 @@ def make_system_info() -> SystemInfo: def run_cuda_script( # # noqa: C901 + system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -424,6 +447,7 @@ def run_cuda_script( # # noqa: C901 end=datetime.datetime.now(), compilation=compile_result, run=None, + profile=None, ) # cleaning up all source files _before_ we let the user code run, just in @@ -434,16 +458,18 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result = run_single_evaluation(["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=compile_result, run=run_result, + profile=profile_result, ) def run_pytorch_script( # noqa: C901 + system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -495,13 +521,14 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run = run_single_evaluation(["python", main], **kwargs) + run, profile = run_single_evaluation(system, ["python", main], 
**kwargs) return EvalResult( start=start, end=datetime.datetime.now(), compilation=comp, run=run, + profile=profile, ) finally: for f in sources.keys(): @@ -558,7 +585,9 @@ def build_test_string(tests: list[dict]): def run_config(config: dict): + system = make_system_info() common_args = { + "system": system, "tests": build_test_string(config.get("tests", [])), "benchmarks": build_test_string(config.get("benchmarks", [])), "seed": config.get("seed", None), @@ -591,4 +620,4 @@ def run_config(config: dict): raise ValueError(f"Invalid language {config['lang']}") results = run_evaluation(runner, config["mode"]) - return FullResult(success=True, error="", runs=results, system=make_system_info()) + return FullResult(success=True, error="", runs=results, system=system) diff --git a/tests/test_backend.py b/tests/test_backend.py index 585674cd..94cc2795 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -77,6 +77,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -210,6 +211,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, } @@ -315,6 +317,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, @@ -357,6 +360,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "device_count": 1, "gpu": "NVIDIA RTX 4090", "platform": "Linux-5.15.0", + "runtime": "CUDA", "torch": "2.0.1+cu118", }, }, diff --git a/tests/test_report.py b/tests/test_report.py index e9efc41c..a1964e62 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -16,13 +16,24 @@ make_short_report, make_test_log, ) -from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo +from libkernelbot.run_eval import ( + CompileResult, + EvalResult, + FullResult, + ProfileResult, + RunResult, + SystemInfo, +) # define helpers and fixtures that create mock results def sample_system_info() -> SystemInfo: return SystemInfo( - gpu="NVIDIA RTX 4090", cpu="Intel i9-12900K", platform="Linux-5.15.0", torch="2.0.1+cu118" + gpu="NVIDIA RTX 4090", + cpu="Intel i9-12900K", + runtime="CUDA", + platform="Linux-5.15.0", + torch="2.0.1+cu118", ) @@ -82,6 +93,7 @@ def create_eval_result(mode="test") -> EvalResult: end=datetime.datetime.now(), compilation=sample_compile_result(), run=sample_run_result(mode), + profile=None, ) @@ -294,6 +306,7 @@ def test_make_short_report_full_success(): stderr="", result={}, ), + profile=None, ) result = make_short_report(runs, full=True) @@ -491,6 +504,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -522,6 +536,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -556,6 +571,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 
4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -591,6 +607,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -616,6 +633,7 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -644,8 +662,12 @@ def test_generate_report_profile(sample_full_result: FullResult): "benchmark.0.spec": "Benchmark", "benchmark.0.report": base64.b64encode(b"Profile report", b"+*").decode("utf-8"), } + sample_full_result.runs["profile"].profile = ProfileResult( + profiler="NSight", + download_url="https://example.com", + ) report = generate_report(sample_full_result) - from libkernelbot.report import Log, Text + from libkernelbot.report import Link, Log, Text assert report.data == [ Text( @@ -653,6 +675,7 @@ def test_generate_report_profile(sample_full_result: FullResult): "Running on:\n" "* GPU: `NVIDIA RTX 4090`\n" "* CPU: `Intel i9-12900K`\n" + "* Runtime: `CUDA`\n" "* Platform: `Linux-5.15.0`\n" "* Torch: `2.0.1+cu118`\n" ), @@ -665,6 +688,7 @@ def test_generate_report_profile(sample_full_result: FullResult): "> Division by zero", ), Log(header="Profiling", content="Benchmark\n\n Profile report\n"), + Link("NSight profiling output", "Download from GitHub", "https://example.com"), ]