implement ROCm profiling

Snektron · Snektron · commit edfd3f0e38ac · 2025-08-28T23:59:51.000+02:00
This uses rocPROF (rocprofv3) to collect HIP, kernel, RCCL, marker,
memory-copy and scratch-memory traces and put them in the profile_data
directory, the download link of which is then returned to the user.
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
@@ -297,6 +297,62 @@ def run_program(
     )
 
 
+def profile_program(
+    system: SystemInfo,
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    # The runner-specific configuration should implement logic
+    # to fetch the data in this directory and return it as
+    # ProfileResult.download_url.
+    output_dir = Path('profile_data')
+
+    if system.runtime == "ROCm":
+        # Wrap program in rocprof
+        output_dir.mkdir()
+        call = [
+            "rocprofv3",
+            "--log-level",
+            "fatal",
+            "--hip-trace",
+            "--kernel-trace",
+            "--rccl-trace",
+            "--marker-trace",
+            "--hip-trace",
+            "--memory-copy-trace",
+            # TODO(Robin): New? Doesn't work in the runner
+            # "--memory-allocation-trace",
+            "--scratch-memory-trace",
+            # TODO(Robin): The HSA trace is very large. Skip for now, maybe make optional later?
+            # "--hsa-trace",
+            "--output-format",
+            "pftrace",
+            "csv",
+            "-d",
+            str(output_dir),
+            # Just store the files as %pid%_tracename.ext instead of putting them in an
+            # additional directory named after the hostname.
+            "-o",
+            "%pid%",
+            "--",
+        ] + call
+
+        run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
+        profile_result = None
+
+        if run_result.success:
+            profile_result = ProfileResult(
+                profiler='rocPROF',
+                download_url=None,
+            )
+
+        return run_result, profile_result
+    else:
+        # TODO: Implement profiling for other platforms
+        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
+
 def run_single_evaluation(
     system: SystemInfo,
     call: list[str],
@@ -332,6 +388,9 @@ def run_single_evaluation(
 
         call += [mode, cases.name]
 
+        if mode == "profile":
+            return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
+
         return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None