From 3bf6f6450709ec640ee4d3fdff3a6d243bf34937 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 13 Oct 2025 16:32:19 -0700
Subject: [PATCH 01/35] Change runner from gpumode-nvidia-arc to Nvidia-A100

---
 .github/workflows/nvidia-arc-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 13841d27..7e2e4104 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [Nvidia-A100-8-x86-64]
     timeout-minutes: 5
     container:
       image: nvidia/cuda:12.4.0-devel-ubuntu22.04

From 5f40e369284bbc5baf0c3a9cf1dfd048d3104cfe Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 13 Oct 2025 16:35:37 -0700
Subject: [PATCH 02/35] Run nvidia-arc-health on pushes to all branches

---
 .github/workflows/nvidia-arc-health.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 7e2e4104..babdf01c 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -6,7 +6,6 @@ on:
     - cron: '0 2 * * *'
   workflow_dispatch:
   push:
-    branches: [main]
 
 jobs:
   health-check:

From e3ac7307681b9726ef8802af8c079f9b68f25416 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 13 Oct 2025 16:46:24 -0700
Subject: [PATCH 03/35] Remove CUDA container from nvidia-arc-health job

---
 .github/workflows/nvidia-arc-health.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index babdf01c..0c552666 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -11,8 +11,6 @@ jobs:
   health-check:
     runs-on: [Nvidia-A100-8-x86-64]
     timeout-minutes: 5
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     
     steps:
     - name: Setup Python

From c60090bf4b68e966f845b5a10b1d308ef27c850f Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:15:56 -0700
Subject: [PATCH 04/35] Feat: run health on b200

---
 .github/workflows/nvidia-arc-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 0c552666..47d4a54e 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [Nvidia-A100-8-x86-64]
+    runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64]
     timeout-minutes: 5
     
     steps:

From 2a69a10a83480e78e2e9fed2143b77a2f1bb54cd Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:16:37 -0700
Subject: [PATCH 05/35] Run health check on the B200 runner only

---
 .github/workflows/nvidia-arc-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 47d4a54e..67926d82 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 5
     
     steps:

From 9a6c08d82184d3af8143a257856787010b5f614c Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:18:02 -0700
Subject: [PATCH 06/35] Remove PyTorch install step from health check

---
 .github/workflows/nvidia-arc-health.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 67926d82..a07c8ada 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -18,10 +18,6 @@ jobs:
       with:
         python-version: '3.10'
     
-    - name: Install PyTorch
-      run: |
-        pip install torch
-    
     - name: GPU Health Check
       run: python -c "import torch; torch.randn(5, device='cuda')"
     

From aa2f8946a68cc6af81ea807623fb4d2bb6e2094b Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:18:42 -0700
Subject: [PATCH 07/35] Remove Setup Python step and use python3 in health check

---
 .github/workflows/nvidia-arc-health.yml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index a07c8ada..14e19aae 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -12,14 +12,8 @@ jobs:
     runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 5
     
-    steps:
-    - name: Setup Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: '3.10'
-    
     - name: GPU Health Check
-      run: python -c "import torch; torch.randn(5, device='cuda')"
+      run: python3 -c "import torch; torch.randn(5, device='cuda')"
     
     env:
       CUDA_VISIBLE_DEVICES: 0

From fbc28addf29f76b66165218ef063ca3d196b8870 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:20:48 -0700
Subject: [PATCH 08/35] feat


From 6437e1975bf7397da85355de011fa6fb93e3b3c8 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:21:20 -0700
Subject: [PATCH 09/35] Restore missing steps key in health check workflow

---
 .github/workflows/nvidia-arc-health.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 14e19aae..87f58473 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -12,8 +12,9 @@ jobs:
     runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 5
     
+    steps:
     - name: GPU Health Check
-      run: python3 -c "import torch; torch.randn(5, device='cuda')"
+      run: python -c "import torch; torch.randn(5, device='cuda')"
     
     env:
       CUDA_VISIBLE_DEVICES: 0

From 6c4bde053c23a5155fc57190ae5bdc649550d93e Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:21:45 -0700
Subject: [PATCH 10/35] Use python3 in GPU health check

---
 .github/workflows/nvidia-arc-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml
index 87f58473..a619f8e1 100644
--- a/.github/workflows/nvidia-arc-health.yml
+++ b/.github/workflows/nvidia-arc-health.yml
@@ -14,7 +14,7 @@ jobs:
     
     steps:
     - name: GPU Health Check
-      run: python -c "import torch; torch.randn(5, device='cuda')"
+      run: python3 -c "import torch; torch.randn(5, device='cuda')"
     
     env:
       CUDA_VISIBLE_DEVICES: 0

From a3e045c77b9a399b5c15e7abaec7e7361fa83aea Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Sat, 1 Nov 2025 14:26:08 -0400
Subject: [PATCH 11/35] replace nvidia workflow to point to our b200 cluster

---
 .github/workflows/nvidia_workflow.yml | 32 +--------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index b50ec044..59811d30 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -19,23 +19,13 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'
 
 jobs:
   run:
-    runs-on: [gpumode-nvidia-arc]
+    runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 10
     container:
       image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
     - uses: actions/checkout@v3
 
-    - name: Setup Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: '3.10'
-
-    - name: Install uv
-      uses: astral-sh/setup-uv@v3
-      with:
-        version: "latest"
-
     - name: Create input files
       shell: bash
       run: |
@@ -49,26 +39,6 @@ jobs:
         # Now write to file (won't be logged since it's masked)
         echo "$PAYLOAD" > payload.json
 
-    - name: Install uv
-      uses: astral-sh/setup-uv@v3
-      with:
-        version: "latest"
-
-    - name: Setup Python environment
-      shell: bash
-      run: |
-        uv venv .venv
-        echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
-        echo "$PWD/.venv/bin" >> $GITHUB_PATH
-
-        if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
-          cat > "requirements.txt" <<'EOL'
-          ${{ github.event.inputs.requirements }}
-        EOL
-        uv pip install -r "requirements.txt"
-        fi
-        uv pip install -e .
-
     - name: Run script
       shell: bash
       run: |

From 844d3bf85266f4b2a2f12df2f49da9b1c2b1ddbe Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:33:13 -0700
Subject: [PATCH 12/35] Fix: container

---
 .github/workflows/nvidia_workflow.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 59811d30..b5ad437e 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -21,8 +21,6 @@ jobs:
   run:
     runs-on: [nvidia-docker-b200-8-x86-64]
     timeout-minutes: 10
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
     - uses: actions/checkout@v3
 
@@ -58,5 +56,3 @@ jobs:
         name: profile-data
         path: profile_data/*
         retention-days: 1
-    env:
-      CUDA_VISIBLE_DEVICES: 0

From 3275924f6f3a9728d386612dc0ef9844c54f3b5c Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:35:10 -0700
Subject: [PATCH 13/35] Fix: python->python3

---
 .github/workflows/nvidia_workflow.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index b5ad437e..766072cc 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -40,7 +40,7 @@ jobs:
     - name: Run script
       shell: bash
       run: |
-        python src/runners/github-runner.py
+        python3 src/runners/github-runner.py
 
     - name: Upload training artifacts
       uses: actions/upload-artifact@v4

From b19b59bf336c919be5b3c481c7df3b342cb64eac Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:37:38 -0700
Subject: [PATCH 14/35] Fix: add back deps

---
 .github/workflows/nvidia_workflow.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 766072cc..ff156a50 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -37,6 +37,14 @@ jobs:
         # Now write to file (won't be logged since it's masked)
         echo "$PAYLOAD" > payload.json
 
+    - name: Setup Virtual Environment and Install Dependencies
+      shell: bash
+      run: |
+        pip install --upgrade pip
+        pip install -r "requirements.txt"
+        pip install -e .
+
+
     - name: Run script
       shell: bash
       run: |

From 3e8eb6fc9bd88fd590c6feec93d0a5b7a9d28614 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:39:06 -0700
Subject: [PATCH 15/35] Fix: python->python3

---
 src/libkernelbot/run_eval.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index c0897baf..a7eadb8b 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -22,7 +22,7 @@ class ProfileResult:
     # Public download URL of all files created by the profiler
     # This may also be configured later
     download_url: Optional[str]
-    #fmt: on
+    # fmt: on
 
 
 @dataclasses.dataclass
@@ -351,9 +351,15 @@ def profile_program(
             "--",
         ] + call
 
-        run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
-            "GPU_DUMP_CODE_OBJECT": "1",
-        })
+        run_result = run_program(
+            call,
+            seed=seed,
+            timeout=timeout,
+            multi_gpu=multi_gpu,
+            extra_env={
+                "GPU_DUMP_CODE_OBJECT": "1",
+            },
+        )
 
         profile_result = None
 
@@ -377,7 +383,7 @@ def profile_program(
                 code_obj.rename(output_dir / code_obj.name)
 
             profile_result = ProfileResult(
-                profiler='rocPROF',
+                profiler="rocPROF",
                 download_url=None,
             )
 
@@ -386,6 +392,7 @@ def profile_program(
         # TODO: Implement profiling for other platforms
         return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
 
+
 def run_single_evaluation(
     system: SystemInfo,
     call: list[str],
@@ -427,7 +434,7 @@ def run_single_evaluation(
         return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
 
 
-def make_system_info() -> SystemInfo: # noqa: C901
+def make_system_info() -> SystemInfo:  # noqa: C901
     info = SystemInfo()
     try:
         import torch
@@ -448,14 +455,16 @@ def make_system_info() -> SystemInfo: # noqa: C901
             info.gpu = subprocess.check_output(
                 ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8"
             )
-            info.device_count = info.gpu.count('\n')
+            info.device_count = info.gpu.count("\n")
             info.runtime = "CUDA"
         except subprocess.CalledProcessError:
             # try again for HIP
             try:
-                rocm_info = json.loads(subprocess.check_output(
-                    ["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
-                ))
+                rocm_info = json.loads(
+                    subprocess.check_output(
+                        ["rocm-smi", "--showproductname", "--json"], encoding="utf-8"
+                    )
+                )
                 if len(rocm_info) > 0:
                     info.gpu = next(rocm_info.__iter__())["Card Series"]
 
@@ -587,7 +596,7 @@ def run_pytorch_script(  # noqa: C901
         # "compile" step: execute the script once. Will populate
         # `load_inline`'s compile cache, so the actual runs will be faster.
         try:
-            compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE)
+            compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE)
             if "-DTORCH_EXTENSION_NAME" in compile_run.stdout:
                 comp = CompileResult(
                     nvcc_found=True,

From 998cf42c18b4fbe5d34a2eb747458f96f8b8b00e Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:41:25 -0700
Subject: [PATCH 16/35] Fix: python->python3

---
 src/libkernelbot/run_eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index a7eadb8b..de448784 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -622,7 +622,7 @@ def run_pytorch_script(  # noqa: C901
                 exit_code=e.returncode,
             )
 
-        run, profile = run_single_evaluation(system, ["python", main], **kwargs)
+        run, profile = run_single_evaluation(system, ["python3", main], **kwargs)
 
         return EvalResult(
             start=start,

From 1de31fd608cb8fa9c3a0bc4f62b0bbc849c507cb Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 1 Nov 2025 11:49:22 -0700
Subject: [PATCH 17/35] Add nvidia-smi

---
 .github/workflows/nvidia_workflow.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index ff156a50..6f455fbe 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -24,6 +24,11 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
+    - name: nvidia-smi
+      shell: bash
+      run: |
+        nvidia-smi || echo "nvidia-smi failed"
+
     - name: Create input files
       shell: bash
       run: |

From d754094605a43a0ba319874b9001a00163ec04a4 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sat, 1 Nov 2025 19:39:55 +0100
Subject: [PATCH 18/35] split profiling into rocm/ncu; small code improvements

---
 examples/eval.py             |  33 +++++-
 src/libkernelbot/run_eval.py | 192 +++++++++++++++++++++--------------
 2 files changed, 145 insertions(+), 80 deletions(-)

diff --git a/examples/eval.py b/examples/eval.py
index 597b5ff4..187e11cd 100644
--- a/examples/eval.py
+++ b/examples/eval.py
@@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112
 
 
-def _run_single_profile(test: TestCase) -> str:
+def _run_single_profile_torch(test: TestCase) -> str:
     """
-    Runs a single test case. Do not call directly
+    Profiles a single benchmark using the torch profiler.
     """
     from submission import custom_kernel
     from torch.profiler import profile, ProfilerActivity
@@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str:
         data = generate_input(**test.args)
         torch.cuda.synchronize()
 
+    cloned = _clone_data(data, 0)
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         with nvtx_range("custom_kernel"):
-            submission_output = custom_kernel(_clone_data(data, 0))
+            submission_output = custom_kernel(cloned)
             torch.cuda.synchronize()
 
     return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
 
 
+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using ncu. Note: this does not
+    invoke NCU; instead, it is expected that eval is launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
+    """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data, 0)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
+    return ""
+
+
 def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
     """
     Runs a single profiling case. Do not call directly
@@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
     """
     world_size = test.args.get("world_size", None)
     if world_size is None:
-        return pool.apply(_run_single_profile, (test,))
+        if os.getenv("POPCORN_NCU", "0") == "1":  # bool("0") is True; compare explicitly
+            return pool.apply(_run_single_profile_ncu, (test,))
+        else:
+            return pool.apply(_run_single_profile_torch, (test,))
     else:
         return run_multi_gpu_profile(pool, test, world_size)
 
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index de448784..e82f466d 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -305,6 +305,112 @@ def run_program(
     )
 
 
+def profile_program_roc(
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+    output_dir: Path,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    # Wrap program in rocprof
+    call = [
+        "rocprofv3",
+        "--log-level",
+        "fatal",
+        "--hip-trace",
+        "--kernel-trace",
+        "--rccl-trace",
+        "--marker-trace",
+        "--hip-trace",
+        "--memory-copy-trace",
+        # New? Doesn't work in the runner
+        # "--memory-allocation-trace",
+        "--scratch-memory-trace",
+        # The HSA trace output is very large, so skip it for now
+        # "--hsa-trace",
+        "--output-format",
+        "pftrace",
+        "csv",
+        "-d",
+        str(output_dir),
+        # Just store the files as %pid%_tracename.ext instead of putting them in an
+        # additional directory named after the hostname.
+        "-o",
+        # Insert an extra path here so that the resulting zip has all files
+        # in the profile_data/ directory rather than the root.
+        "%pid%",
+        "--",
+    ] + call
+
+    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
+        "GPU_DUMP_CODE_OBJECT": "1",
+    },
+        )
+
+    profile_result = None
+
+    if run_result.success:
+        # Post-process trace data.
+        # rocPROF generates one trace for every process, but its more useful to
+        # have all traces be in the same file. Fortunately we can do that by
+        # concatenating.
+        traces = list(output_dir.glob("*.pftrace"))
+        with (output_dir / "combined.pftrace").open("wb") as combined:
+            for trace_path in traces:
+                with trace_path.open("rb") as trace:
+                    shutil.copyfileobj(trace, combined)
+
+                # After we've created the combined trace, there is no point in
+                # keeping the individual traces around.
+                trace_path.unlink()
+
+        # Also move the code objects to the profiling output directory.
+        for code_obj in list(Path.cwd().glob("_code_object*.o")):
+            code_obj.rename(output_dir / code_obj.name)
+
+        profile_result = ProfileResult(
+            profiler="rocPROF",
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
+def profile_program_ncu(
+    call: list[str],
+    seed: Optional[int],
+    timeout: int,
+    multi_gpu: bool,
+    output_dir: Path,
+) -> tuple[RunResult, Optional[ProfileResult]]:
+    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."
+
+    # Wrap program in ncu
+    call = [
+        "ncu",
+        "--set", "full",
+        "--nvtx",
+        "--nvtx-include", "custom_kernel/",
+        "--import-source", "1",
+        "-o", f"{str(output_dir / 'profile.ncu-rep')}",
+        "--",
+    ] + call
+
+    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
+        "POPCORN_NCU": "1"
+    })
+
+    profile_result = None
+
+    if run_result.success:
+        profile_result = ProfileResult(
+            profiler='ncu',
+            download_url=None,
+        )
+
+    return run_result, profile_result
+
+
 def profile_program(
     system: SystemInfo,
     call: list[str],
@@ -315,89 +421,25 @@ def profile_program(
     # The runner-specific configuration should implement logic
     # to fetch the data in this directory and return it as
     # ProfileResult.download_url.
-    # Insert an extra nested nested path here so that the resulting zip has all files
+    # Insert an extra nested path here so that the resulting zip has all files
     # in the profile_data/ directory rather than directly in the root.
     output_dir = Path(".") / "profile_data" / "profile_data"
     output_dir.mkdir(parents=True, exist_ok=True)
 
     if system.runtime == "ROCm":
-        # Wrap program in rocprof
-        call = [
-            "rocprofv3",
-            "--log-level",
-            "fatal",
-            "--hip-trace",
-            "--kernel-trace",
-            "--rccl-trace",
-            "--marker-trace",
-            "--hip-trace",
-            "--memory-copy-trace",
-            # New? Doesn't work in the runner
-            # "--memory-allocation-trace",
-            "--scratch-memory-trace",
-            # The HSA trace output is very large, so skip it for now
-            # "--hsa-trace",
-            "--output-format",
-            "pftrace",
-            "csv",
-            "-d",
-            str(output_dir),
-            # Just store the files as %pid%_tracename.ext instead of putting them in an
-            # additional directory named after the hostname.
-            "-o",
-            # Insert an extra path here so that the resulting zip has all files
-            # in the profile_data/ directory rather than the root.
-            "%pid%",
-            "--",
-        ] + call
-
-        run_result = run_program(
-            call,
-            seed=seed,
-            timeout=timeout,
-            multi_gpu=multi_gpu,
-            extra_env={
-                "GPU_DUMP_CODE_OBJECT": "1",
-            },
-        )
-
-        profile_result = None
-
-        if run_result.success:
-            # Post-process trace data.
-            # rocPROF generates one trace for every process, but its more useful to
-            # have all traces be in the same file. Fortunately we can do that by
-            # concatenating.
-            traces = list(output_dir.glob("*.pftrace"))
-            with (output_dir / "combined.pftrace").open("wb") as combined:
-                for trace_path in traces:
-                    with trace_path.open("rb") as trace:
-                        shutil.copyfileobj(trace, combined)
-
-                    # After we've created the combined trace, there is no point in
-                    # keeping the individual traces around.
-                    trace_path.unlink()
-
-            # Also move the code objects to the profiling output directory.
-            for code_obj in list(Path.cwd().glob("_code_object*.o")):
-                code_obj.rename(output_dir / code_obj.name)
-
-            profile_result = ProfileResult(
-                profiler="rocPROF",
-                download_url=None,
-            )
-
-        return run_result, profile_result
+        return profile_program_roc(call, seed, timeout, multi_gpu, output_dir)
+    elif system.runtime == "CUDA":
+        return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir)
     else:
-        # TODO: Implement profiling for other platforms
-        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
+        raise ValueError(f"Unknown runtime {system.runtime}")
+
 
 
 def run_single_evaluation(
-    system: SystemInfo,
     call: list[str],
     mode: str,
     *,
+    system: SystemInfo,
     multi_gpu: bool = False,
     tests: Optional[str] = None,
     benchmarks: Optional[str] = None,
@@ -426,7 +468,7 @@ def run_single_evaluation(
 
         cases.flush()
 
-        call += [mode, cases.name]
+        call = call + [mode, cases.name]
 
         if mode == "profile":
             return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
@@ -498,7 +540,6 @@ def make_system_info() -> SystemInfo:  # noqa: C901
 
 
 def run_cuda_script(  # # noqa: C901
-    system: SystemInfo,
     sources: dict[str, str],
     headers: Optional[dict[str, str]] = None,
     arch: Optional[int] = None,
@@ -559,7 +600,7 @@ def run_cuda_script(  # # noqa: C901
             if os.path.exists(f):
                 os.remove(f)
 
-    run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs)
+    run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs)
     return EvalResult(
         start=start,
         end=datetime.datetime.now(),
@@ -570,7 +611,6 @@ def run_cuda_script(  # # noqa: C901
 
 
 def run_pytorch_script(  # noqa: C901
-    system: SystemInfo,
     sources: dict[str, str],
     main: str,
     **kwargs,
@@ -622,7 +662,7 @@ def run_pytorch_script(  # noqa: C901
                 exit_code=e.returncode,
             )
 
-        run, profile = run_single_evaluation(system, ["python3", main], **kwargs)
+        run, profile = run_single_evaluation(["python3", main], **kwargs)
 
         return EvalResult(
             start=start,

From 394e2341e0962c0d0362e991a9d561943805b182 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 13:43:59 +0100
Subject: [PATCH 19/35] profile each benchmark individually for cleaner traces

---
 src/libkernelbot/report.py   | 51 +++++++++++++++++++-----------------
 src/libkernelbot/run_eval.py | 26 +++++++++++-------
 2 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 25bb27cb..3579bf43 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -174,16 +174,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     elif full:
         result.append("❌ Benchmarks missing")
 
-    if "profile" in runs:
-        bench_run = runs["profile"].run
-        if not bench_run.success:
-            result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
-            return result
-        elif not bench_run.passed:
-            result.append("❌ Profiling failed")
-            return result
-        else:
-            result.append("✅ Profiling successful")
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            bench_run = prof_run.run
+            if not bench_run.success:
+                result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
+                return result
+            elif not bench_run.passed:
+                result.append("❌ Profiling failed")
+                return result
+            else:
+                result.append("✅ Profiling successful")
 
     if "leaderboard" in runs:
         lb_run = runs["leaderboard"].run
@@ -327,23 +329,24 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             make_benchmark_log(bench_run.run),
         )
 
-    if "profile" in runs:
-        prof_run = runs["profile"]
-        if _handle_crash_report(report, prof_run):
-            return report
+    profile_runs = [v for k, v in runs.items() if k.startswith("profile")]
+    if len(profile_runs) > 0:
+        for prof_run in profile_runs:
+            if _handle_crash_report(report, prof_run):
+                return report
 
-        report.add_log(
-            "Profiling",
-            make_profile_log(prof_run.run),
-        )
-
-        if prof_run.profile is not None and prof_run.profile.download_url is not None:
-            report.add_link(
-                f"{prof_run.profile.profiler} profiling output",
-                "Download from GitHub",
-                prof_run.profile.download_url,
+            report.add_log(
+                "Profiling",
+                make_profile_log(prof_run.run),
             )
 
+            if prof_run.profile is not None and prof_run.profile.download_url is not None:
+                report.add_link(
+                    f"{prof_run.profile.profiler} profiling output",
+                    "Download from GitHub",
+                    prof_run.profile.download_url,
+                )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index e82f466d..cc8e3489 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -1,3 +1,4 @@
+import copy
 import dataclasses
 import datetime
 import functools
@@ -678,12 +679,13 @@ def run_pytorch_script(  # noqa: C901
 
 
 class _EvalRunner(Protocol):
-    def __call__(self, mode: str) -> EvalResult: ...
+    def __call__(self, mode: str, **kwargs) -> EvalResult: ...
 
 
 def run_evaluation(
     call: _EvalRunner,
     mode: str,
+    common_args: dict,
 ) -> dict[str, EvalResult]:
     """
     Given a "runner" function `call`, interprets the mode
@@ -693,22 +695,28 @@ def run_evaluation(
     require multiple runner calls.
     """
     results: dict[str, EvalResult] = {}
-    if mode in ["test", "benchmark", "profile"]:
-        results[mode] = call(mode=mode)
+    if mode == "profile":
+        benchmarks = copy.deepcopy(common_args["benchmarks"])
+        for i, benchmark in enumerate(benchmarks.splitlines()):
+            common_args["benchmarks"] = benchmark
+            results[f"{mode}.{i}"] = call(mode=mode, **common_args)
+
+    elif mode in ["test", "benchmark"]:
+        results[mode] = call(mode=mode, **common_args)
     elif mode in ["private", "leaderboard"]:
         # first, run the tests
-        results["test"] = call(mode="test")
+        results["test"] = call(mode="test", **common_args)
 
         if not results["test"].run or not results["test"].run.passed:
             return results
 
-        results["benchmark"] = call(mode="benchmark")
+        results["benchmark"] = call(mode="benchmark", **common_args)
 
         if not results["benchmark"].run or not results["benchmark"].run.passed:
             return results
 
         # if they pass, run the leaderboard validation
-        results["leaderboard"] = call(mode="leaderboard")
+        results["leaderboard"] = call(mode="leaderboard", **common_args)
     else:
         raise AssertionError("Invalid mode")
 
@@ -742,8 +750,7 @@ def run_config(config: dict):
         runner = functools.partial(
             run_pytorch_script,
             sources=config["sources"],
-            main=config["main"],
-            **common_args,
+            main=config["main"]
         )
     elif config["lang"] == "cu":
         runner = functools.partial(
@@ -755,10 +762,9 @@ def run_config(config: dict):
             include_dirs=config.get("include_dirs", []),
             libraries=config.get("libraries", []),
             flags=CUDA_FLAGS,
-            **common_args,
         )
     else:
         raise ValueError(f"Invalid language {config['lang']}")
 
-    results = run_evaluation(runner, config["mode"])
+    results = run_evaluation(runner, config["mode"], common_args)
     return FullResult(success=True, error="", runs=results, system=system)

From 0e51cf58819cc33defaad5bbc09bff84dea4714e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:35:24 +0100
Subject: [PATCH 20/35] profile in tempdir

---
 src/libkernelbot/run_eval.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index cc8e3489..6a7ee9d1 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -1,3 +1,4 @@
+import base64
 import copy
 import dataclasses
 import datetime
@@ -20,6 +21,9 @@
 class ProfileResult:
     # fmt: off
     profiler: str      # The profiler used to gather this data
+    # Profiler trace. May be empty, in which case `download_url`
+    # should point to the trace file.
+    trace: str
     # Public download URL of all files created by the profiler
     # This may also be configured later
     download_url: Optional[str]
@@ -123,6 +127,14 @@ def _create_files(files: Optional[dict[str, str]]):
         Path(name).write_text(content)
 
 
+def _directory_to_zip_bytes(directory_path) -> str:
+    """Create a zip archive and return as bas64 encoded bytes."""
+    with tempfile.NamedTemporaryFile() as archive_path:
+        shutil.make_archive(archive_path.name, 'zip', directory_path)
+        data = archive_path.read()
+        return base64.b64encode(data).decode('utf-8')
+
+
 def compile_cuda_script(  # # noqa: C901
     files: list[str],
     arch: Optional[int] = None,
@@ -371,6 +383,7 @@ def profile_program_roc(
 
         profile_result = ProfileResult(
             profiler="rocPROF",
+            trace=_directory_to_zip_bytes(output_dir),
             download_url=None,
         )
 
@@ -405,7 +418,8 @@ def profile_program_ncu(
 
     if run_result.success:
         profile_result = ProfileResult(
-            profiler='ncu',
+            profiler='Nsight-Compute',
+            trace=_directory_to_zip_bytes(output_dir),
             download_url=None,
         )
 
@@ -424,16 +438,16 @@ def profile_program(
     # ProfileResult.download_url.
     # Insert an extra nested path here so that the resulting zip has all files
     # in the profile_data/ directory rather than directly in the root.
-    output_dir = Path(".") / "profile_data" / "profile_data"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    if system.runtime == "ROCm":
-        return profile_program_roc(call, seed, timeout, multi_gpu, output_dir)
-    elif system.runtime == "CUDA":
-        return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir)
-    else:
-        raise ValueError(f"Unknown runtime {system.runtime}")
 
+    with tempfile.TemporaryDirectory(dir=".") as tmpdir:
+        output_dir = Path(tmpdir) / "profile_data"
+        output_dir.mkdir()
+        if system.runtime == "ROCm":
+            return profile_program_roc(call, seed, timeout, multi_gpu, output_dir)
+        elif system.runtime == "CUDA":
+            return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir)
+        else:
+            raise ValueError(f"Unknown runtime {system.runtime}")
 
 
 def run_single_evaluation(

From 3e6a59ce53f0df5b439f08b29a14ed4b4aada521 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 15:12:30 +0100
Subject: [PATCH 21/35] send profile results as attached files

---
 src/kernelbot/discord_reporter.py    |  8 ++++-
 src/kernelbot/discord_utils.py       |  5 ++++
 src/libkernelbot/launchers/github.py |  2 +-
 src/libkernelbot/report.py           | 45 +++++++++++++++++++++-------
 src/libkernelbot/run_eval.py         | 12 +++++---
 5 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py
index 3b6fd8c3..54ba4063 100644
--- a/src/kernelbot/discord_reporter.py
+++ b/src/kernelbot/discord_reporter.py
@@ -1,7 +1,8 @@
 import discord
-from discord_utils import _send_split_log
+from discord_utils import _send_split_log, _send_file
 
 from libkernelbot.report import (
+    File,
     Link,
     Log,
     MultiProgressReporter,
@@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, File):
+                if len(message) > 0:
+                    await thread.send(message)
+                await _send_file(thread, part.message, part.name, part.content)
+                message = ""
             elif isinstance(part, Link):
                 if len(message) > 0:
                     await thread.send(message)
diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py
index d014f3ca..6830db1f 100644
--- a/src/kernelbot/discord_utils.py
+++ b/src/kernelbot/discord_utils.py
@@ -1,5 +1,6 @@
 import functools
 import logging
+from io import BytesIO
 
 import discord
 
@@ -136,3 +137,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
             await thread.send(partial_message)
 
         return ""
+
+
+async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
+    await thread.send(message, file=discord.File(BytesIO(file), filename=name))
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 4c1b1d5f..3f09b94d 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -143,7 +143,7 @@ async def run_submission(  # noqa: C901
             # Update profile artifact to the actual download URL.
             # For the GitHub launcher the profile_artifact currently just contains
             # the name of the artifact.
-            if profile_res is not None:
+            if profile_res is not None and "profile-data" in index:
                 profile_res.download_url = index["profile-data"].public_download_url
 
             res = EvalResult(
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 3579bf43..5764a2bd 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -43,9 +43,19 @@ class Link:
     url: str
 
 
+@dataclasses.dataclass
+class File:
+    """
+    File represents a file that gets attached to the report.
+    """
+    name: str
+    message: str
+    content: bytes
+
+
 class RunResultReport:
     def __init__(self, data=None):
-        self.data: List[Text | Log | Link] = data or []
+        self.data: List[Text | Log | Link | File] = data or []
 
     def add_text(self, section: str):
         self.data.append(Text(section))
@@ -56,6 +66,9 @@ def add_log(self, header: str, log: str):
     def add_link(self, title: str, text: str, url: str):
         self.data.append(Link(title, text, url))
 
+    def add_file(self, name: str, message: str, content: bytes):
+        self.data.append(File(name, message, content))
+
     def __repr__(self):
         return f"RunResultReport(data={self.data})"
 
@@ -335,18 +348,28 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             if _handle_crash_report(report, prof_run):
                 return report
 
-            report.add_log(
-                "Profiling",
-                make_profile_log(prof_run.run),
-            )
-
-            if prof_run.profile is not None and prof_run.profile.download_url is not None:
-                report.add_link(
-                    f"{prof_run.profile.profiler} profiling output",
-                    "Download from GitHub",
-                    prof_run.profile.download_url,
+            if prof_run.profile.trace is not None:
+                report.add_log(
+                    "Profiling",
+                    make_profile_log(prof_run.run),
                 )
 
+                if prof_run.profile.download_url is not None:
+                    report.add_link(
+                        f"{prof_run.profile.profiler} profiling output",
+                        "Download from GitHub",
+                        prof_run.profile.download_url,
+                    )
+
+        for prof_run in profile_runs:
+            if prof_run.profile is not None:
+                if prof_run.profile.trace is not None:
+                    report.add_file(
+                        "profile.zip",
+                        make_profile_log(prof_run.run),
+                        base64.b64decode(prof_run.profile.trace),
+                    )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 6a7ee9d1..0953d8d9 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -128,10 +128,14 @@ def _create_files(files: Optional[dict[str, str]]):
 
 
 def _directory_to_zip_bytes(directory_path) -> str:
-    """Create a zip archive and return as bas64 encoded bytes."""
-    with tempfile.NamedTemporaryFile() as archive_path:
-        shutil.make_archive(archive_path.name, 'zip', directory_path)
-        data = archive_path.read()
+    """Create a zip archive and return as base64 encoded bytes."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        archive_path = os.path.join(temp_dir, 'archive')
+        shutil.make_archive(archive_path, 'zip', directory_path)
+
+        with open(archive_path + '.zip', 'rb') as f:
+            data = f.read()
+
         return base64.b64encode(data).decode('utf-8')
 
 
From f31e4bb2f70ed4fe24235284d532b79e99d78095 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 15:31:46 +0100
Subject: [PATCH 22/35] don't spam alerts

---
 src/kernelbot/discord_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py
index 6830db1f..f6be7350 100644
--- a/src/kernelbot/discord_utils.py
+++ b/src/kernelbot/discord_utils.py
@@ -140,4 +140,4 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
 
 
 async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes):
-    await thread.send(message, file=discord.File(BytesIO(file), filename=name))
+    await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True)

From 00c215aa9844d69c8deeec34b5db3fc19adc6ca5 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 15:38:43 +0100
Subject: [PATCH 23/35] include default ncu report

---
 src/libkernelbot/run_eval.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 0953d8d9..a9d85308 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -417,9 +417,14 @@ def profile_program_ncu(
     run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
         "POPCORN_NCU": "1"
     })
-
     profile_result = None
 
+    try:
+        report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}",], text=True)
+        run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
+    except subprocess.CalledProcessError:
+        pass
+
     if run_result.success:
         profile_result = ProfileResult(
             profiler='Nsight-Compute',

From b014b79bf48a3be436faa40da61cb5d372f2a8c7 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:15:47 +0100
Subject: [PATCH 24/35] attempt at filtered ncu

---
 src/libkernelbot/run_eval.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index a9d85308..156f525e 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -139,6 +139,35 @@ def _directory_to_zip_bytes(directory_path) -> str:
         return base64.b64encode(data).decode('utf-8')
 
 
+def _filter_ncu_report(report: str, tables: list):
+    """
+    Extract the sections named in `tables` from the full ncu terminal report.
+
+    For expert users, we just attach the full ncu profile to the result,
+    and they can view whichever metrics they are interested in. But to
+    encourage novice users to try out profiling, we want to have a
+    *simple* set of things to display automatically, short enough to fit
+    in a *single* discord message.
+    """
+    result = ""
+    collect = False
+    for line in report.splitlines():
+        if "Table Name : " in line:
+            table = line[line.find("Table Name :") + len("Table Name :"):].strip()
+            if table in tables:
+                result += "\n"
+                collect = True
+            else:
+                collect = False
+
+        if len(line.strip()) == 0:
+            collect = False
+
+        if collect:
+            result += line + "\n"
+    return result
+
+
 def compile_cuda_script(  # # noqa: C901
     files: list[str],
     arch: Optional[int] = None,
@@ -420,7 +449,8 @@ def profile_program_ncu(
     profile_result = None
 
     try:
-        report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}",], text=True)
+        report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"], text=True)
+        report = _filter_ncu_report(report, ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"])
         run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
     except subprocess.CalledProcessError:
         pass

From f328ebae33aaabadf579640de0931e63bf9d8b38 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:24:16 +0100
Subject: [PATCH 25/35] formatting fix

---
 src/kernelbot/discord_utils.py |  4 ++--
 src/libkernelbot/report.py     | 13 +++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py
index f6be7350..7924a3d2 100644
--- a/src/kernelbot/discord_utils.py
+++ b/src/kernelbot/discord_utils.py
@@ -125,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
             else:
                 if partial_message != "":
                     chunks.append(partial_message)
-                partial_message = line
+                partial_message = line + "\n"
 
         if partial_message != "":
             chunks.append(partial_message)
@@ -134,7 +134,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
         for i, chunk in enumerate(chunks):
             partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n"
             partial_message += f"```\n{limit_length(chunk, 1900)}```"
-            await thread.send(partial_message)
+            await thread.send(partial_message, silent=True)
 
         return ""
 
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 5764a2bd..71f9a7b0 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -272,12 +272,9 @@ def make_profile_log(run: RunResult) -> str:
     num_bench = int(run.result.get("benchmark-count", 0))
 
     def log_one(base_name):
-        spec = run.result.get(f"{base_name}.spec")
-
         report: str = run.result.get(f"{base_name}.report")
         report = base64.b64decode(report.encode("utf-8"), b"+*").decode("utf-8")
         report = textwrap.indent(report, "  ")
-        bench_log.append(f"{spec}\n")
         bench_log.append(report)
 
     bench_log = []
@@ -314,6 +311,10 @@ def _handle_crash_report(report: RunResultReport, run_result: EvalResult):
     return False
 
 
+def _shortname(spec: str):
+    return spec.replace(": ", "=").replace("; ", "_")
+
+
 def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
     runs = result.runs
     report = RunResultReport()
@@ -350,7 +351,7 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
 
             if prof_run.profile.trace is not None:
                 report.add_log(
-                    "Profiling",
+                    f"Profiling {prof_run.run.result.get(f'benchmark.0.spec')}",
                     make_profile_log(prof_run.run),
                 )
 
@@ -365,8 +366,8 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             if prof_run.profile is not None:
                 if prof_run.profile.trace is not None:
                     report.add_file(
-                        "profile.zip",
-                        make_profile_log(prof_run.run),
+                        f"profile-{_shortname(prof_run.run.result.get(f'benchmark.0.spec'))}.zip",
+                        f"{prof_run.profile.profiler} report - " + prof_run.run.result.get(f"benchmark.0.spec"),
                         base64.b64decode(prof_run.profile.trace),
                     )
 

From eaa54f740b118e9813e28f8296e8e35c6c54c7ee Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sun, 9 Nov 2025 16:56:24 +0100
Subject: [PATCH 26/35] fix tests

---
 scripts/ci_test_cuda.py           | 4 ++--
 scripts/ci_test_python.py         | 4 ++--
 src/kernelbot/discord_reporter.py | 2 +-
 src/libkernelbot/report.py        | 6 +++---
 src/libkernelbot/run_eval.py      | 6 ++++--
 src/runners/modal_runner.py       | 2 +-
 tests/test_report.py              | 8 ++++++--
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
index c3fa893c..de1f5fbe 100644
--- a/scripts/ci_test_cuda.py
+++ b/scripts/ci_test_cuda.py
@@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files
 
     eval_result = run_cuda_script(
-        make_system_info(),
         sources,
         headers,
         arch=arch,
         mode=SubmissionMode.TEST.value,
         tests="size: 256; seed: 42\n",
+        system=make_system_info(),
         **kwargs,
     )
     return eval_result.compilation, eval_result.run
@@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path):
 
     # can also use generic flags argument
     result = run_cuda_script(
-        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],
         mode=SubmissionMode.TEST.value,
         tests="size: 256; seed: 42\n",
+        system=make_system_info(),
     )
 
     assert result.compilation.success is True
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
index 7cc4fedd..1bd8dd9f 100644
--- a/scripts/ci_test_python.py
+++ b/scripts/ci_test_python.py
@@ -12,11 +12,11 @@
 
 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
-        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,
         tests=tests or "size: 256; seed: 42\n",
+        system=make_system_info(),
         **kwargs,
     )
     return result.run
@@ -45,7 +45,7 @@ def custom_kernel(input):
     run = run_pytorch_helper({**files, "submission.py": sub})
     assert run.success is True
     assert run.passed is False
-    assert "python eval.py test" in run.command
+    assert "python3 eval.py test" in run.command
     assert run.stdout == ""
     assert run.stderr == ""
 
diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py
index 54ba4063..d0551b07 100644
--- a/src/kernelbot/discord_reporter.py
+++ b/src/kernelbot/discord_reporter.py
@@ -1,5 +1,5 @@
 import discord
-from discord_utils import _send_split_log, _send_file
+from discord_utils import _send_file, _send_split_log
 
 from libkernelbot.report import (
     File,
diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py
index 71f9a7b0..58beaffe 100644
--- a/src/libkernelbot/report.py
+++ b/src/libkernelbot/report.py
@@ -351,7 +351,7 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
 
             if prof_run.profile.trace is not None:
                 report.add_log(
-                    f"Profiling {prof_run.run.result.get(f'benchmark.0.spec')}",
+                    f"Profiling {prof_run.run.result.get('benchmark.0.spec')}",
                     make_profile_log(prof_run.run),
                 )
 
@@ -366,8 +366,8 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             if prof_run.profile is not None:
                 if prof_run.profile.trace is not None:
                     report.add_file(
-                        f"profile-{_shortname(prof_run.run.result.get(f'benchmark.0.spec'))}.zip",
-                        f"{prof_run.profile.profiler} report - " + prof_run.run.result.get(f"benchmark.0.spec"),
+                        f"profile-{_shortname(prof_run.run.result.get('benchmark.0.spec'))}.zip",
+                        f"{prof_run.profile.profiler} report - " + prof_run.run.result.get("benchmark.0.spec"),
                         base64.b64decode(prof_run.profile.trace),
                     )
 
diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 156f525e..e3879ee7 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -449,8 +449,10 @@ def profile_program_ncu(
     profile_result = None
 
     try:
-        report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"], text=True)
-        report = _filter_ncu_report(report, ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"])
+        get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]
+        ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"]
+        report = subprocess.check_output(ncu_cmd, text=True)
+        report = _filter_ncu_report(report, get_tables)
         run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
     except subprocess.CalledProcessError:
         pass
diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py
index 6a048e62..d2cb0d64 100644
--- a/src/runners/modal_runner.py
+++ b/src/runners/modal_runner.py
@@ -16,7 +16,7 @@
 
 # Move this to another file later:
 cuda_image = (
-    Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
+    Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13")
     .apt_install(
         "git",
         "gcc-13",
diff --git a/tests/test_report.py b/tests/test_report.py
index a1964e62..9006a98e 100644
--- a/tests/test_report.py
+++ b/tests/test_report.py
@@ -6,6 +6,7 @@
 
 from libkernelbot import consts
 from libkernelbot.report import (
+    File,
     RunResultReport,
     _generate_compile_report,
     _short_fail_reason,
@@ -402,7 +403,6 @@ def test_make_profile_log():
 
     log = make_profile_log(run)
 
-    assert "Matrix multiplication profile" in log
     assert "  Profile line 1" in log
     assert "  Profile line 2" in log
 
@@ -664,6 +664,7 @@ def test_generate_report_profile(sample_full_result: FullResult):
     }
     sample_full_result.runs["profile"].profile = ProfileResult(
         profiler="NSight",
+        trace="",
         download_url="https://example.com",
     )
     report = generate_report(sample_full_result)
@@ -687,8 +688,11 @@ def test_generate_report_profile(sample_full_result: FullResult):
             "❌ Test division\n"
             "> Division by zero",
         ),
-        Log(header="Profiling", content="Benchmark\n\n  Profile report\n"),
+        Log(header='Profiling Benchmark', content='  Profile report\n'),
         Link("NSight profiling output", "Download from GitHub", "https://example.com"),
+        File(name='profile-Benchmark.zip',
+             message='NSight report - Benchmark',
+             content=b''),
     ]
 
 
From e83b0f48d54793d0cfb21217c1e69c74123c297d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 19:04:53 +0100
Subject: [PATCH 27/35] Fix: good error for profile via api

---
 src/kernelbot/api/api_utils.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/kernelbot/api/api_utils.py b/src/kernelbot/api/api_utils.py
index 4082108e..0b0e714c 100644
--- a/src/kernelbot/api/api_utils.py
+++ b/src/kernelbot/api/api_utils.py
@@ -189,6 +189,8 @@ async def display_report(self, title: str, report: RunResultReport):
             elif isinstance(part, Log):
                 self.long_report += f"\n\n## {part.header}:\n"
                 self.long_report += f"```\n{part.content}```"
+
+
 # ruff: noqa: C901
 async def to_submit_info(
     user_info: Any,
@@ -197,14 +199,12 @@ async def to_submit_info(
     leaderboard_name: str,
     gpu_type: str,
     db_context: LeaderboardDB,
-) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901
+) -> tuple[SubmissionRequest, SubmissionMode]:  # noqa: C901
     user_name = user_info["user_name"]
     user_id = user_info["user_id"]
 
     try:
-        submission_mode_enum: SubmissionMode = SubmissionMode(
-            submission_mode.lower()
-        )
+        submission_mode_enum: SubmissionMode = SubmissionMode(submission_mode.lower())
     except ValueError:
         raise HTTPException(
             status_code=400,
@@ -222,6 +222,11 @@ async def to_submit_info(
         SubmissionMode.BENCHMARK,
         SubmissionMode.LEADERBOARD,
     ]
+    if submission_mode_enum == SubmissionMode.PROFILE:
+        raise HTTPException(
+            status_code=400,
+            detail="Profile submissions are not currently supported via API, use Discord instead.",
+        )
     if submission_mode_enum not in allowed_modes:
         raise HTTPException(
             status_code=400,
@@ -263,9 +268,7 @@ async def to_submit_info(
     except HTTPException:
         raise
     except Exception as e:
-        raise HTTPException(
-            status_code=400, detail=f"Error reading submission file: {e}"
-        ) from e
+        raise HTTPException(status_code=400, detail=f"Error reading submission file: {e}") from e
 
     try:
         submission_code = submission_content.decode("utf-8")

From 716aca9c6c1dcf9447f5169a05401c2e8f040364 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 19:07:08 +0100
Subject: [PATCH 28/35] Fix: remove nvidia-smi from workflow

---
 .github/workflows/nvidia_workflow.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 6f455fbe..ff156a50 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -24,11 +24,6 @@ jobs:
     steps:
     - uses: actions/checkout@v3
 
-    - name: nvidia-smi
-      shell: bash
-      run: |
-        nvidia-smi || echo "nvidia-smi failed"
-
     - name: Create input files
       shell: bash
       run: |

From cb880a71166d55977e7cb2fae24e9ff9a7191171 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 19:08:33 +0100
Subject: [PATCH 29/35] Fix: polling time to 15s

---
 src/libkernelbot/launchers/github.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 3f09b94d..d457d244 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -344,7 +344,7 @@ async def wait_for_completion(
                     return
 
                 await callback(self)
-                await asyncio.sleep(20)  # Yield control while waiting
+                await asyncio.sleep(15)  # Yield control while waiting
             except TimeoutError:
                 raise  # Re-raise the specific TimeoutError from the timeout block
             except Exception as e:

From 2621ca145931a5e703fb4b108d7c40a99acfa5ea Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:20:17 +0100
Subject: [PATCH 30/35] limit profiling report length

---
 src/libkernelbot/run_eval.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index e3879ee7..9460c650 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -150,8 +150,20 @@ def _filter_ncu_report(report: str, tables: list):
     in a *single* discord message.
     """
     result = ""
+    n_kernels = 0
     collect = False
     for line in report.splitlines():
+        if len(line) >= 3 and line[2] != ' ':
+            if n_kernels != 0:
+                result += "\n"
+            if n_kernels == 2:
+                result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n"
+            n_kernels += 1
+            result += line + "\n"
+
+        if n_kernels > 2:
+            continue
+
         if "Table Name : " in line:
             table = line[line.find("Table Name :") + len("Table Name :"):].strip()
             if table in tables:

From af80b61b80f9ae636c3ef9797ddb8fff797950eb Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:20:58 +0100
Subject: [PATCH 31/35] limit number of kernels to be profiled

---
 src/libkernelbot/run_eval.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 9460c650..62c8880e 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -451,6 +451,7 @@ def profile_program_ncu(
         "--nvtx",
         "--nvtx-include", "custom_kernel/",
         "--import-source", "1",
+        "-c", "10",
         "-o", f"{str(output_dir / 'profile.ncu-rep')}",
         "--",
     ] + call

From 2931fd46e709b2881e9fbfec0ed98209c374cb58 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:30:56 +0100
Subject: [PATCH 32/35] stricter matching for kernel name lines

---
 src/libkernelbot/run_eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 62c8880e..25a9f3ce 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -153,7 +153,7 @@ def _filter_ncu_report(report: str, tables: list):
     n_kernels = 0
     collect = False
     for line in report.splitlines():
-        if len(line) >= 3 and line[2] != ' ':
+        if len(line) >= 5 and line[3] == ' ' and line[4] != ' ':
             if n_kernels != 0:
                 result += "\n"
             if n_kernels == 2:

From 110386e977e40eb1d82d7e47f45d97e9893d6f02 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:33:17 +0100
Subject: [PATCH 33/35] add an additional safety limit to ncu reports

---
 src/libkernelbot/run_eval.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 25a9f3ce..2bc39caa 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -152,6 +152,7 @@ def _filter_ncu_report(report: str, tables: list):
     result = ""
     n_kernels = 0
     collect = False
+    length = 0
     for line in report.splitlines():
         if len(line) >= 5 and line[3] == ' ' and line[4] != ' ':
             if n_kernels != 0:
@@ -177,6 +178,11 @@ def _filter_ncu_report(report: str, tables: list):
 
         if collect:
             result += line + "\n"
+            length += 1
+            # just as a precaution, also limit lines directly
+            if length > 100:
+                result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n"
+                break
     return result
 
 
From 8a4c6b2dc729bd7b044bbc1c563b0f6e1bde5350 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 10 Nov 2025 19:37:53 +0100
Subject: [PATCH 34/35] fix column offsets used to match kernel name lines

---
 src/libkernelbot/run_eval.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 2bc39caa..727dfe51 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -154,12 +154,12 @@ def _filter_ncu_report(report: str, tables: list):
     collect = False
     length = 0
     for line in report.splitlines():
-        if len(line) >= 5 and line[3] == ' ' and line[4] != ' ':
+        if len(line) >= 3 and line[1] == ' ' and line[2] != ' ':
             if n_kernels != 0:
                 result += "\n"
-            if n_kernels == 2:
-                result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n"
             n_kernels += 1
+            if n_kernels == 3:
+                result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n"
             result += line + "\n"
 
         if n_kernels > 2:

From c9786fb44c69ea8902b56c7a6d47bb7ef55d5831 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 20:24:44 +0100
Subject: [PATCH 35/35] Fix: style

---
 src/libkernelbot/run_eval.py | 76 +++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py
index 727dfe51..2cd6b397 100644
--- a/src/libkernelbot/run_eval.py
+++ b/src/libkernelbot/run_eval.py
@@ -130,16 +130,16 @@ def _create_files(files: Optional[dict[str, str]]):
 def _directory_to_zip_bytes(directory_path) -> str:
     """Create a zip archive and return as base64 encoded bytes."""
     with tempfile.TemporaryDirectory() as temp_dir:
-        archive_path = os.path.join(temp_dir, 'archive')
-        shutil.make_archive(archive_path, 'zip', directory_path)
+        archive_path = os.path.join(temp_dir, "archive")
+        shutil.make_archive(archive_path, "zip", directory_path)
 
-        with open(archive_path + '.zip', 'rb') as f:
+        with open(archive_path + ".zip", "rb") as f:
             data = f.read()
 
-        return base64.b64encode(data).decode('utf-8')
+        return base64.b64encode(data).decode("utf-8")
 
 
-def _filter_ncu_report(report: str, tables: list):
+def _filter_ncu_report(report: str, tables: list):  # noqa: C901
     """
     Extract the Speed-of-light section from the full ncu terminal report.
 
@@ -154,19 +154,19 @@ def _filter_ncu_report(report: str, tables: list):
     collect = False
     length = 0
     for line in report.splitlines():
-        if len(line) >= 3 and line[1] == ' ' and line[2] != ' ':
+        if len(line) >= 3 and line[1] == " " and line[2] != " ":
             if n_kernels != 0:
                 result += "\n"
             n_kernels += 1
             if n_kernels == 3:
-                result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n"
+                result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n"  # noqa: E501
             result += line + "\n"
 
         if n_kernels > 2:
             continue
 
         if "Table Name : " in line:
-            table = line[line.find("Table Name :") + len("Table Name :"):].strip()
+            table = line[line.find("Table Name :") + len("Table Name :") :].strip()
             if table in tables:
                 result += "\n"
                 collect = True
@@ -181,7 +181,7 @@ def _filter_ncu_report(report: str, tables: list):
             length += 1
             # just as a precaution, also limit lines directly
             if length > 100:
-                result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n"
+                result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n"  # noqa: E501
                 break
     return result
 
@@ -406,10 +406,15 @@ def profile_program_roc(
         "--",
     ] + call
 
-    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
-        "GPU_DUMP_CODE_OBJECT": "1",
-    },
-        )
+    run_result = run_program(
+        call,
+        seed=seed,
+        timeout=timeout,
+        multi_gpu=multi_gpu,
+        extra_env={
+            "GPU_DUMP_CODE_OBJECT": "1",
+        },
+    )
 
     profile_result = None
 
@@ -453,32 +458,49 @@ def profile_program_ncu(
     # Wrap program in ncu
     call = [
         "ncu",
-        "--set", "full",
+        "--set",
+        "full",
         "--nvtx",
-        "--nvtx-include", "custom_kernel/",
-        "--import-source", "1",
-        "-c", "10",
-        "-o", f"{str(output_dir / 'profile.ncu-rep')}",
+        "--nvtx-include",
+        "custom_kernel/",
+        "--import-source",
+        "1",
+        "-c",
+        "10",
+        "-o",
+        f"{str(output_dir / 'profile.ncu-rep')}",
         "--",
     ] + call
 
-    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={
-        "POPCORN_NCU": "1"
-    })
+    run_result = run_program(
+        call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={"POPCORN_NCU": "1"}
+    )
     profile_result = None
 
     try:
-        get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]
-        ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"]
+        get_tables = [
+            "GPU Throughput",
+            "Pipe Utilization (% of active cycles)",
+            "Warp State (All Cycles)",
+        ]
+        ncu_cmd = [
+            "ncu",
+            "--import",
+            f"{str(output_dir / 'profile.ncu-rep')}",
+            "--print-details",
+            "body",
+        ]
         report = subprocess.check_output(ncu_cmd, text=True)
         report = _filter_ncu_report(report, get_tables)
-        run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8")
+        run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode(
+            "utf-8"
+        )
     except subprocess.CalledProcessError:
         pass
 
     if run_result.success:
         profile_result = ProfileResult(
-            profiler='Nsight-Compute',
+            profiler="Nsight-Compute",
             trace=_directory_to_zip_bytes(output_dir),
             download_url=None,
         )
@@ -822,9 +844,7 @@ def run_config(config: dict):
     }
     if config["lang"] == "py":
         runner = functools.partial(
-            run_pytorch_script,
-            sources=config["sources"],
-            main=config["main"]
+            run_pytorch_script, sources=config["sources"], main=config["main"]
         )
     elif config["lang"] == "cu":
         runner = functools.partial(