From 3bf6f6450709ec640ee4d3fdff3a6d243bf34937 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:32:19 -0700 Subject: [PATCH 01/35] Change runner from gpumode-nvidia-arc to Nvidia-A100 --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 13841d27..7e2e4104 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -10,7 +10,7 @@ on: jobs: health-check: - runs-on: [gpumode-nvidia-arc] + runs-on: [Nvidia-A100-8-x86-64] timeout-minutes: 5 container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 From 5f40e369284bbc5baf0c3a9cf1dfd048d3104cfe Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:35:37 -0700 Subject: [PATCH 02/35] Update nvidia-arc-health.yml --- .github/workflows/nvidia-arc-health.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 7e2e4104..babdf01c 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -6,7 +6,6 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: - branches: [main] jobs: health-check: From e3ac7307681b9726ef8802af8c079f9b68f25416 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:46:24 -0700 Subject: [PATCH 03/35] Update nvidia-arc-health.yml --- .github/workflows/nvidia-arc-health.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index babdf01c..0c552666 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -11,8 +11,6 @@ jobs: health-check: runs-on: [Nvidia-A100-8-x86-64] timeout-minutes: 5 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - name: Setup Python From c60090bf4b68e966f845b5a10b1d308ef27c850f Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:15:56 -0700 Subject: [PATCH 04/35] Feat: run health on b200 --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 0c552666..47d4a54e 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -9,7 +9,7 @@ on: jobs: health-check: - runs-on: [Nvidia-A100-8-x86-64] + runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64] timeout-minutes: 5 steps: From 2a69a10a83480e78e2e9fed2143b77a2f1bb54cd Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:16:37 -0700 Subject: [PATCH 05/35] tmp --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 47d4a54e..67926d82 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -9,7 +9,7 @@ on: jobs: health-check: - runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 steps: From 9a6c08d82184d3af8143a257856787010b5f614c Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:18:02 -0700 Subject: [PATCH 06/35] tmp --- .github/workflows/nvidia-arc-health.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 67926d82..a07c8ada 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -18,10 +18,6 @@ jobs: with: python-version: '3.10' - - name: Install PyTorch - run: | - pip install torch - - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" From aa2f8946a68cc6af81ea807623fb4d2bb6e2094b Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:18:42 -0700 Subject: [PATCH 07/35] tmp --- .github/workflows/nvidia-arc-health.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index a07c8ada..14e19aae 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -12,14 +12,8 @@ jobs: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 - steps: - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From fbc28addf29f76b66165218ef063ca3d196b8870 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:20:48 -0700 Subject: [PATCH 08/35] feat From 6437e1975bf7397da85355de011fa6fb93e3b3c8 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:21:20 -0700 Subject: [PATCH 09/35] feat --- .github/workflows/nvidia-arc-health.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 14e19aae..87f58473 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -12,8 +12,9 @@ jobs: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 + steps: - name: GPU Health Check - run: python3 -c "import torch; torch.randn(5, device='cuda')" + run: python -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From 6c4bde053c23a5155fc57190ae5bdc649550d93e Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:21:45 -0700 Subject: [PATCH 10/35] feat --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 87f58473..a619f8e1 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -14,7 +14,7 @@ jobs: steps: - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From a3e045c77b9a399b5c15e7abaec7e7361fa83aea Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Sat, 1 Nov 2025 14:26:08 -0400 Subject: [PATCH 11/35] replace nvidia workflow to point to our b200 cluster --- .github/workflows/nvidia_workflow.yml | 32 +-------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50ec044..59811d30 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -19,23 +19,13 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}' jobs: run: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - name: Create input files shell: bash run: | @@ -49,26 +39,6 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - - name: Setup Python environment - shell: bash - run: | - uv venv .venv - echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV - echo "$PWD/.venv/bin" >> $GITHUB_PATH - - if [[ -n "${{ github.event.inputs.requirements }}" ]]; then - cat > "requirements.txt" <<'EOL' - ${{ github.event.inputs.requirements }} - EOL - uv pip install -r "requirements.txt" - fi - uv pip install -e . - - name: Run script shell: bash run: | From 844d3bf85266f4b2a2f12df2f49da9b1c2b1ddbe Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:33:13 -0700 Subject: [PATCH 12/35] Fix: container --- .github/workflows/nvidia_workflow.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 59811d30..b5ad437e 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -21,8 +21,6 @@ jobs: run: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 @@ -58,5 +56,3 @@ jobs: name: profile-data path: profile_data/* retention-days: 1 - env: - CUDA_VISIBLE_DEVICES: 0 From 3275924f6f3a9728d386612dc0ef9844c54f3b5c Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:35:10 -0700 Subject: [PATCH 13/35] Fix: python->python3 --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b5ad437e..766072cc 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -40,7 +40,7 @@ jobs: - name: Run script shell: bash run: | - python src/runners/github-runner.py + python3 src/runners/github-runner.py - name: Upload training artifacts uses: actions/upload-artifact@v4 From b19b59bf336c919be5b3c481c7df3b342cb64eac Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:37:38 -0700 Subject: [PATCH 14/35] Fix: add back deps --- .github/workflows/nvidia_workflow.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 766072cc..ff156a50 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -37,6 +37,14 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json + - name: Setup Virtual Environment and Install Dependencies + shell: bash + run: | + pip install --upgrade pip + pip install -r "requirements.txt" + pip install -e . + + - name: Run script shell: bash run: | From 3e8eb6fc9bd88fd590c6feec93d0a5b7a9d28614 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:39:06 -0700 Subject: [PATCH 15/35] Fix: python->python3 --- src/libkernelbot/run_eval.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index c0897baf..a7eadb8b 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -22,7 +22,7 @@ class ProfileResult: # Public download URL of all files created by the profiler # This may also be configured later download_url: Optional[str] - #fmt: on + # fmt: on @dataclasses.dataclass @@ -351,9 +351,15 @@ def profile_program( "--", ] + call - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }) + run_result = run_program( + call, + seed=seed, + timeout=timeout, + multi_gpu=multi_gpu, + extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) profile_result = None @@ -377,7 +383,7 @@ def profile_program( code_obj.rename(output_dir / code_obj.name) profile_result = ProfileResult( - profiler='rocPROF', + profiler="rocPROF", download_url=None, ) @@ -386,6 +392,7 @@ def profile_program( # TODO: Implement profiling for other platforms return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + def run_single_evaluation( system: SystemInfo, call: list[str], @@ -427,7 +434,7 @@ def run_single_evaluation( return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None -def make_system_info() -> SystemInfo: # noqa: C901 +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -448,14 +455,16 @@ def make_system_info() -> SystemInfo: # noqa: C901 info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) - info.device_count = info.gpu.count('\n') + info.device_count = info.gpu.count("\n") info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP try: - rocm_info = json.loads(subprocess.check_output( - ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" - )) + rocm_info = json.loads( + subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + ) + ) if len(rocm_info) > 0: info.gpu = next(rocm_info.__iter__())["Card Series"] @@ -587,7 +596,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 998cf42c18b4fbe5d34a2eb747458f96f8b8b00e Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:41:25 -0700 Subject: [PATCH 16/35] Fix: python->python3 --- src/libkernelbot/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index a7eadb8b..de448784 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -622,7 +622,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run, profile = run_single_evaluation(system, ["python", main], **kwargs) + run, profile = run_single_evaluation(system, ["python3", main], **kwargs) return EvalResult( start=start, From 1de31fd608cb8fa9c3a0bc4f62b0bbc849c507cb Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:49:22 -0700 Subject: [PATCH 17/35] Add nvidia-smi --- .github/workflows/nvidia_workflow.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index ff156a50..6f455fbe 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -24,6 +24,11 @@ jobs: steps: - uses: actions/checkout@v3 + - name: nvidia-smi + shell: bash + run: | + nvidia-smi || echo "nvidia-smi failed" + - name: Create input files shell: bash run: | From d754094605a43a0ba319874b9001a00163ec04a4 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sat, 1 Nov 2025 19:39:55 +0100 Subject: [PATCH 18/35] split profiling into rocm/ncu; small code improvements --- examples/eval.py | 33 +++++- src/libkernelbot/run_eval.py | 192 +++++++++++++++++++++-------------- 2 files changed, 145 insertions(+), 80 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index 597b5ff4..187e11cd 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -500,9 +500,9 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l return 112 -def _run_single_profile(test: TestCase) -> str: +def _run_single_profile_torch(test: TestCase) -> str: """ - Runs a single test case. Do not call directly + Profiles a single benchmark using the torch profiler. """ from submission import custom_kernel from torch.profiler import profile, ProfilerActivity @@ -511,14 +511,36 @@ def _run_single_profile(test: TestCase) -> str: data = generate_input(**test.args) torch.cuda.synchronize() + cloned = _clone_data(data, 0) with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: with nvtx_range("custom_kernel"): - submission_output = custom_kernel(_clone_data(data, 0)) + submission_output = custom_kernel(cloned) torch.cuda.synchronize() return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) +def _run_single_profile_ncu(test: TestCase) -> str: + """ + Profiles a single benchmark using ncu. Note: this does not + invoke NCU; instead, it is expected that eval is launched + under NCU, and this function will rurnthe kernel excactly + once in the 'custom_kernel' nvtx range. + """ + from submission import custom_kernel + + with nvtx_range("generate input"): + data = generate_input(**test.args) + torch.cuda.synchronize() + + cloned = _clone_data(data, 0) + with nvtx_range("custom_kernel"): + submission_output = custom_kernel(cloned) + torch.cuda.synchronize() + + return "" + + def _run_distributed_profile(test: TestCase, rank: int) -> "EventList": """ Runs a single profiling case. Do not call directly @@ -610,7 +632,10 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str: """ world_size = test.args.get("world_size", None) if world_size is None: - return pool.apply(_run_single_profile, (test,)) + if bool(os.getenv("POPCORN_NCU", "0")): + return pool.apply(_run_single_profile_ncu, (test,)) + else: + return pool.apply(_run_single_profile_torch, (test,)) else: return run_multi_gpu_profile(pool, test, world_size) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index de448784..e82f466d 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -305,6 +305,112 @@ def run_program( ) +def profile_program_roc( + call: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool, + output_dir: Path, +) -> tuple[RunResult, Optional[ProfileResult]]: + # Wrap program in rocprof + call = [ + "rocprofv3", + "--log-level", + "fatal", + "--hip-trace", + "--kernel-trace", + "--rccl-trace", + "--marker-trace", + "--hip-trace", + "--memory-copy-trace", + # New? Doesn't work in the runner + # "--memory-allocation-trace", + "--scratch-memory-trace", + # The HSA trace output is very large, so skip it for now + # "--hsa-trace", + "--output-format", + "pftrace", + "csv", + "-d", + str(output_dir), + # Just store the files as %pid%_tracename.ext instead of putting them in an + # additional directory named after the hostname. + "-o", + # Insert an extra path here so that the resulting zip has all files + # in the profile_data/ directory rather than the root. + "%pid%", + "--", + ] + call + + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) + + profile_result = None + + if run_result.success: + # Post-process trace data. + # rocPROF generates one trace for every process, but its more useful to + # have all traces be in the same file. Fortunately we can do that by + # concatenating. + traces = list(output_dir.glob("*.pftrace")) + with (output_dir / "combined.pftrace").open("wb") as combined: + for trace_path in traces: + with trace_path.open("rb") as trace: + shutil.copyfileobj(trace, combined) + + # After we've created the combined trace, there is no point in + # keeping the individual traces around. + trace_path.unlink() + + # Also move the code objects to the profiling output directory. + for code_obj in list(Path.cwd().glob("_code_object*.o")): + code_obj.rename(output_dir / code_obj.name) + + profile_result = ProfileResult( + profiler="rocPROF", + download_url=None, + ) + + return run_result, profile_result + + +def profile_program_ncu( + call: list[str], + seed: Optional[int], + timeout: int, + multi_gpu: bool, + output_dir: Path, +) -> tuple[RunResult, Optional[ProfileResult]]: + assert not multi_gpu, "Multi-GPU profiling not supported for ncu." + + # Wrap program in ncu + call = [ + "ncu", + "--set", "full", + "--nvtx", + "--nvtx-include", "custom_kernel/", + "--import-source", "1", + "-o", f"{str(output_dir / 'profile.ncu-rep')}", + "--", + ] + call + + run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ + "POPCORN_NCU": "1" + }) + + profile_result = None + + if run_result.success: + profile_result = ProfileResult( + profiler='ncu', + download_url=None, + ) + + return run_result, profile_result + + def profile_program( system: SystemInfo, call: list[str], @@ -315,89 +421,25 @@ def profile_program( # The runner-specific configuration should implement logic # to fetch the data in this directory and return it as # ProfileResult.download_url. - # Insert an extra nested nested path here so that the resulting zip has all files + # Insert an extra nested path here so that the resulting zip has all files # in the profile_data/ directory rather than directly in the root. output_dir = Path(".") / "profile_data" / "profile_data" output_dir.mkdir(parents=True, exist_ok=True) if system.runtime == "ROCm": - # Wrap program in rocprof - call = [ - "rocprofv3", - "--log-level", - "fatal", - "--hip-trace", - "--kernel-trace", - "--rccl-trace", - "--marker-trace", - "--hip-trace", - "--memory-copy-trace", - # New? Doesn't work in the runner - # "--memory-allocation-trace", - "--scratch-memory-trace", - # The HSA trace output is very large, so skip it for now - # "--hsa-trace", - "--output-format", - "pftrace", - "csv", - "-d", - str(output_dir), - # Just store the files as %pid%_tracename.ext instead of putting them in an - # additional directory named after the hostname. - "-o", - # Insert an extra path here so that the resulting zip has all files - # in the profile_data/ directory rather than the root. - "%pid%", - "--", - ] + call - - run_result = run_program( - call, - seed=seed, - timeout=timeout, - multi_gpu=multi_gpu, - extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }, - ) - - profile_result = None - - if run_result.success: - # Post-process trace data. - # rocPROF generates one trace for every process, but its more useful to - # have all traces be in the same file. Fortunately we can do that by - # concatenating. - traces = list(output_dir.glob("*.pftrace")) - with (output_dir / "combined.pftrace").open("wb") as combined: - for trace_path in traces: - with trace_path.open("rb") as trace: - shutil.copyfileobj(trace, combined) - - # After we've created the combined trace, there is no point in - # keeping the individual traces around. - trace_path.unlink() - - # Also move the code objects to the profiling output directory. - for code_obj in list(Path.cwd().glob("_code_object*.o")): - code_obj.rename(output_dir / code_obj.name) - - profile_result = ProfileResult( - profiler="rocPROF", - download_url=None, - ) - - return run_result, profile_result + return profile_program_roc(call, seed, timeout, multi_gpu, output_dir) + elif system.runtime == "CUDA": + return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir) else: - # TODO: Implement profiling for other platforms - return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + raise ValueError(f"Unknown runtime {system.runtime}") + def run_single_evaluation( - system: SystemInfo, call: list[str], mode: str, *, + system: SystemInfo, multi_gpu: bool = False, tests: Optional[str] = None, benchmarks: Optional[str] = None, @@ -426,7 +468,7 @@ def run_single_evaluation( cases.flush() - call += [mode, cases.name] + call = call + [mode, cases.name] if mode == "profile": return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) @@ -498,7 +540,6 @@ def make_system_info() -> SystemInfo: # noqa: C901 def run_cuda_script( # # noqa: C901 - system: SystemInfo, sources: dict[str, str], headers: Optional[dict[str, str]] = None, arch: Optional[int] = None, @@ -559,7 +600,7 @@ def run_cuda_script( # # noqa: C901 if os.path.exists(f): os.remove(f) - run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs) + run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs) return EvalResult( start=start, end=datetime.datetime.now(), @@ -570,7 +611,6 @@ def run_cuda_script( # # noqa: C901 def run_pytorch_script( # noqa: C901 - system: SystemInfo, sources: dict[str, str], main: str, **kwargs, @@ -622,7 +662,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run, profile = run_single_evaluation(system, ["python3", main], **kwargs) + run, profile = run_single_evaluation(["python3", main], **kwargs) return EvalResult( start=start, From 394e2341e0962c0d0362e991a9d561943805b182 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 13:43:59 +0100 Subject: [PATCH 19/35] profile each benchmark individually for cleaner traces --- src/libkernelbot/report.py | 51 +++++++++++++++++++----------------- src/libkernelbot/run_eval.py | 26 +++++++++++------- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 25bb27cb..3579bf43 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -174,16 +174,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Benchmarks missing") - if "profile" in runs: - bench_run = runs["profile"].run - if not bench_run.success: - result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) - return result - elif not bench_run.passed: - result.append("❌ Profiling failed") - return result - else: - result.append("✅ Profiling successful") + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + bench_run = prof_run.run + if not bench_run.success: + result.append("❌ Running profile failed" + _short_fail_reason(bench_run)) + return result + elif not bench_run.passed: + result.append("❌ Profiling failed") + return result + else: + result.append("✅ Profiling successful") if "leaderboard" in runs: lb_run = runs["leaderboard"].run @@ -327,23 +329,24 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 make_benchmark_log(bench_run.run), ) - if "profile" in runs: - prof_run = runs["profile"] - if _handle_crash_report(report, prof_run): - return report + profile_runs = [v for k, v in runs.items() if k.startswith("profile")] + if len(profile_runs) > 0: + for prof_run in profile_runs: + if _handle_crash_report(report, prof_run): + return report - report.add_log( - "Profiling", - make_profile_log(prof_run.run), - ) - - if prof_run.profile is not None and prof_run.profile.download_url is not None: - report.add_link( - f"{prof_run.profile.profiler} profiling output", - "Download from GitHub", - prof_run.profile.download_url, + report.add_log( + "Profiling", + make_profile_log(prof_run.run), ) + if prof_run.profile is not None and prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + if "leaderboard" in runs: bench_run = runs["leaderboard"] if _handle_crash_report(report, bench_run): diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index e82f466d..cc8e3489 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,3 +1,4 @@ +import copy import dataclasses import datetime import functools @@ -678,12 +679,13 @@ def run_pytorch_script( # noqa: C901 class _EvalRunner(Protocol): - def __call__(self, mode: str) -> EvalResult: ... + def __call__(self, mode: str, **kwargs) -> EvalResult: ... def run_evaluation( call: _EvalRunner, mode: str, + common_args: dict, ) -> dict[str, EvalResult]: """ Given a "runner" function `call`, interprets the mode @@ -693,22 +695,28 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile"]: - results[mode] = call(mode=mode) + if mode == "profile": + benchmarks = copy.deepcopy(common_args["benchmarks"]) + for i, benchmark in enumerate(benchmarks.splitlines()): + common_args["benchmarks"] = benchmark + results[f"{mode}.{i}"] = call(mode=mode, **common_args) + + elif mode in ["test", "benchmark"]: + results[mode] = call(mode=mode, **common_args) elif mode in ["private", "leaderboard"]: # first, run the tests - results["test"] = call(mode="test") + results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark") + results["benchmark"] = call(mode="benchmark", **common_args) if not results["benchmark"].run or not results["benchmark"].run.passed: return results # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard") + results["leaderboard"] = call(mode="leaderboard", **common_args) else: raise AssertionError("Invalid mode") @@ -742,8 +750,7 @@ def run_config(config: dict): runner = functools.partial( run_pytorch_script, sources=config["sources"], - main=config["main"], - **common_args, + main=config["main"] ) elif config["lang"] == "cu": runner = functools.partial( @@ -755,10 +762,9 @@ def run_config(config: dict): include_dirs=config.get("include_dirs", []), libraries=config.get("libraries", []), flags=CUDA_FLAGS, - **common_args, ) else: raise ValueError(f"Invalid language {config['lang']}") - results = run_evaluation(runner, config["mode"]) + results = run_evaluation(runner, config["mode"], common_args) return FullResult(success=True, error="", runs=results, system=system) From 0e51cf58819cc33defaad5bbc09bff84dea4714e Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 14:35:24 +0100 Subject: [PATCH 20/35] profile in tempdir --- src/libkernelbot/run_eval.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index cc8e3489..6a7ee9d1 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -1,3 +1,4 @@ +import base64 import copy import dataclasses import datetime @@ -20,6 +21,9 @@ class ProfileResult: # fmt: off profiler: str # The profiler used to gather this data + # Profiler trace. May be empty, in which case `download_url` + # should point to the trace file. + trace: str # Public download URL of all files created by the profiler # This may also be configured later download_url: Optional[str] @@ -123,6 +127,14 @@ def _create_files(files: Optional[dict[str, str]]): Path(name).write_text(content) +def _directory_to_zip_bytes(directory_path) -> str: + """Create a zip archive and return as bas64 encoded bytes.""" + with tempfile.NamedTemporaryFile() as archive_path: + shutil.make_archive(archive_path.name, 'zip', directory_path) + data = archive_path.read() + return base64.b64encode(data).decode('utf-8') + + def compile_cuda_script( # # noqa: C901 files: list[str], arch: Optional[int] = None, @@ -371,6 +383,7 @@ def profile_program_roc( profile_result = ProfileResult( profiler="rocPROF", + trace=_directory_to_zip_bytes(output_dir), download_url=None, ) @@ -405,7 +418,8 @@ def profile_program_ncu( if run_result.success: profile_result = ProfileResult( - profiler='ncu', + profiler='Nsight-Compute', + trace=_directory_to_zip_bytes(output_dir), download_url=None, ) @@ -424,16 +438,16 @@ def profile_program( # ProfileResult.download_url. # Insert an extra nested path here so that the resulting zip has all files # in the profile_data/ directory rather than directly in the root. - output_dir = Path(".") / "profile_data" / "profile_data" - output_dir.mkdir(parents=True, exist_ok=True) - - if system.runtime == "ROCm": - return profile_program_roc(call, seed, timeout, multi_gpu, output_dir) - elif system.runtime == "CUDA": - return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir) - else: - raise ValueError(f"Unknown runtime {system.runtime}") + with tempfile.TemporaryDirectory(dir=".") as tmpdir: + output_dir = Path(tmpdir) / "profile_data" + output_dir.mkdir() + if system.runtime == "ROCm": + return profile_program_roc(call, seed, timeout, multi_gpu, output_dir) + elif system.runtime == "CUDA": + return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir) + else: + raise ValueError(f"Unknown runtime {system.runtime}") def run_single_evaluation( From 3e6a59ce53f0df5b439f08b29a14ed4b4aada521 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 15:12:30 +0100 Subject: [PATCH 21/35] send profile results as attached files --- src/kernelbot/discord_reporter.py | 8 ++++- src/kernelbot/discord_utils.py | 5 ++++ src/libkernelbot/launchers/github.py | 2 +- src/libkernelbot/report.py | 45 +++++++++++++++++++++------- src/libkernelbot/run_eval.py | 12 +++++--- 5 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py index 3b6fd8c3..54ba4063 100644 --- a/src/kernelbot/discord_reporter.py +++ b/src/kernelbot/discord_reporter.py @@ -1,7 +1,8 @@ import discord -from discord_utils import _send_split_log +from discord_utils import _send_split_log, _send_file from libkernelbot.report import ( + File, Link, Log, MultiProgressReporter, @@ -70,6 +71,11 @@ async def display_report(self, title: str, report: RunResultReport): message += part.text elif isinstance(part, Log): message = await _send_split_log(thread, message, part.header, part.content) + elif isinstance(part, File): + if len(message) > 0: + await thread.send(message) + await _send_file(thread, part.message, part.name, part.content) + message = "" elif isinstance(part, Link): if len(message) > 0: await thread.send(message) diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py index d014f3ca..6830db1f 100644 --- a/src/kernelbot/discord_utils.py +++ b/src/kernelbot/discord_utils.py @@ -1,5 +1,6 @@ import functools import logging +from io import BytesIO import discord @@ -136,3 +137,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header: await thread.send(partial_message) return "" + + +async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes): + await thread.send(message, file=discord.File(BytesIO(file), filename=name)) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 4c1b1d5f..3f09b94d 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -143,7 +143,7 @@ async def run_submission( # noqa: C901 # Update profile artifact to the actual download URL. # For the GitHub launcher the profile_artifact currently just contains # the name of the artifact. - if profile_res is not None: + if profile_res is not None and "profile-data" in index: profile_res.download_url = index["profile-data"].public_download_url res = EvalResult( diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 3579bf43..5764a2bd 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -43,9 +43,19 @@ class Link: url: str +@dataclasses.dataclass +class File: + """ + Link represents a file that gets attached to the report. + """ + name: str + message: str + content: bytes + + class RunResultReport: def __init__(self, data=None): - self.data: List[Text | Log | Link] = data or [] + self.data: List[Text | Log | Link | File] = data or [] def add_text(self, section: str): self.data.append(Text(section)) @@ -56,6 +66,9 @@ def add_log(self, header: str, log: str): def add_link(self, title: str, text: str, url: str): self.data.append(Link(title, text, url)) + def add_file(self, name: str, message: str, content: bytes): + self.data.append(File(name, message, content)) + def __repr__(self): return f"RunResultReport(data={self.data})" @@ -335,18 +348,28 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 if _handle_crash_report(report, prof_run): return report - report.add_log( - "Profiling", - make_profile_log(prof_run.run), - ) - - if prof_run.profile is not None and prof_run.profile.download_url is not None: - report.add_link( - f"{prof_run.profile.profiler} profiling output", - "Download from GitHub", - prof_run.profile.download_url, + if prof_run.profile.trace is not None: + report.add_log( + "Profiling", + make_profile_log(prof_run.run), ) + if prof_run.profile.download_url is not None: + report.add_link( + f"{prof_run.profile.profiler} profiling output", + "Download from GitHub", + prof_run.profile.download_url, + ) + + for prof_run in profile_runs: + if prof_run.profile is not None: + if prof_run.profile.trace is not None: + report.add_file( + "profile.zip", + make_profile_log(prof_run.run), + base64.b64decode(prof_run.profile.trace), + ) + if "leaderboard" in runs: bench_run = runs["leaderboard"] if _handle_crash_report(report, bench_run): diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 6a7ee9d1..0953d8d9 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -128,10 +128,14 @@ def _create_files(files: Optional[dict[str, str]]): def _directory_to_zip_bytes(directory_path) -> str: - """Create a zip archive and return as bas64 encoded bytes.""" - with tempfile.NamedTemporaryFile() as archive_path: - shutil.make_archive(archive_path.name, 'zip', directory_path) - data = archive_path.read() + """Create a zip archive and return as base64 encoded bytes.""" + with tempfile.TemporaryDirectory() as temp_dir: + archive_path = os.path.join(temp_dir, 'archive') + shutil.make_archive(archive_path, 'zip', directory_path) + + with open(archive_path + '.zip', 'rb') as f: + data = f.read() + return base64.b64encode(data).decode('utf-8') From f31e4bb2f70ed4fe24235284d532b79e99d78095 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 15:31:46 +0100 Subject: [PATCH 22/35] don't spam alerts --- src/kernelbot/discord_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py index 6830db1f..f6be7350 100644 --- a/src/kernelbot/discord_utils.py +++ b/src/kernelbot/discord_utils.py @@ -140,4 +140,4 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header: async def _send_file(thread: discord.Thread, message: str, name: str, file: bytes): - await thread.send(message, file=discord.File(BytesIO(file), filename=name)) + await thread.send(message, file=discord.File(BytesIO(file), filename=name), silent=True) From 00c215aa9844d69c8deeec34b5db3fc19adc6ca5 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 15:38:43 +0100 Subject: [PATCH 23/35] include default ncu report --- src/libkernelbot/run_eval.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 0953d8d9..a9d85308 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -417,9 +417,14 @@ def profile_program_ncu( run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ "POPCORN_NCU": "1" }) - profile_result = None + try: + report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}",], text=True) + run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8") + except subprocess.CalledProcessError: + pass + if run_result.success: profile_result = ProfileResult( profiler='Nsight-Compute', From b014b79bf48a3be436faa40da61cb5d372f2a8c7 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 16:15:47 +0100 Subject: [PATCH 24/35] attempt at filtered ncu --- src/libkernelbot/run_eval.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index a9d85308..156f525e 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -139,6 +139,35 @@ def _directory_to_zip_bytes(directory_path) -> str: return base64.b64encode(data).decode('utf-8') +def _filter_ncu_report(report: str, tables: list): + """ + Extract the Speed-of-light section from the full ncu terminal report. + + For expert users, we just attach the full ncu profile to the result, + and they can view whichever metrics they are interested in. But to + encourage novice users to try out profiling, we want to have a + *simple* set of things to display automatically, short enough to fit + in a *single* discord message. + """ + result = "" + collect = False + for line in report.splitlines(): + if "Table Name : " in line: + table = line[line.find("Table Name :") + len("Table Name :"):].strip() + if table in tables: + result += "\n" + collect = True + else: + collect = False + + if len(line.strip()) == 0: + collect = False + + if collect: + result += line + "\n" + return result + + def compile_cuda_script( # # noqa: C901 files: list[str], arch: Optional[int] = None, @@ -420,7 +449,8 @@ def profile_program_ncu( profile_result = None try: - report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}",], text=True) + report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"], text=True) + report = _filter_ncu_report(report, ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]) run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8") except subprocess.CalledProcessError: pass From f328ebae33aaabadf579640de0931e63bf9d8b38 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 16:24:16 +0100 Subject: [PATCH 25/35] formatting fix --- src/kernelbot/discord_utils.py | 4 ++-- src/libkernelbot/report.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/kernelbot/discord_utils.py b/src/kernelbot/discord_utils.py index f6be7350..7924a3d2 100644 --- a/src/kernelbot/discord_utils.py +++ b/src/kernelbot/discord_utils.py @@ -125,7 +125,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header: else: if partial_message != "": chunks.append(partial_message) - partial_message = line + partial_message = line + "\n" if partial_message != "": chunks.append(partial_message) @@ -134,7 +134,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header: for i, chunk in enumerate(chunks): partial_message = f"\n\n## {header} ({i+1}/{len(chunks)}):\n" partial_message += f"```\n{limit_length(chunk, 1900)}```" - await thread.send(partial_message) + await thread.send(partial_message, silent=True) return "" diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 5764a2bd..71f9a7b0 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -272,12 +272,9 @@ def make_profile_log(run: RunResult) -> str: num_bench = int(run.result.get("benchmark-count", 0)) def log_one(base_name): - spec = run.result.get(f"{base_name}.spec") - report: str = run.result.get(f"{base_name}.report") report = base64.b64decode(report.encode("utf-8"), b"+*").decode("utf-8") report = textwrap.indent(report, " ") - bench_log.append(f"{spec}\n") bench_log.append(report) bench_log = [] @@ -314,6 +311,10 @@ def _handle_crash_report(report: RunResultReport, run_result: EvalResult): return False +def _shortname(spec: str): + return spec.replace(": ", "=").replace("; ", "_") + + def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 runs = result.runs report = RunResultReport() @@ -350,7 +351,7 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 if prof_run.profile.trace is not None: report.add_log( - "Profiling", + f"Profiling {prof_run.run.result.get(f'benchmark.0.spec')}", make_profile_log(prof_run.run), ) @@ -365,8 +366,8 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 if prof_run.profile is not None: if prof_run.profile.trace is not None: report.add_file( - "profile.zip", - make_profile_log(prof_run.run), + f"profile-{_shortname(prof_run.run.result.get(f'benchmark.0.spec'))}.zip", + f"{prof_run.profile.profiler} report - " + prof_run.run.result.get(f"benchmark.0.spec"), base64.b64decode(prof_run.profile.trace), ) From eaa54f740b118e9813e28f8296e8e35c6c54c7ee Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Sun, 9 Nov 2025 16:56:24 +0100 Subject: [PATCH 26/35] fix tests --- scripts/ci_test_cuda.py | 4 ++-- scripts/ci_test_python.py | 4 ++-- src/kernelbot/discord_reporter.py | 2 +- src/libkernelbot/report.py | 6 +++--- src/libkernelbot/run_eval.py | 6 ++++-- src/runners/modal_runner.py | 2 +- tests/test_report.py | 8 ++++++-- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py index c3fa893c..de1f5fbe 100644 --- a/scripts/ci_test_cuda.py +++ b/scripts/ci_test_cuda.py @@ -19,12 +19,12 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs): headers = header_files eval_result = run_cuda_script( - make_system_info(), sources, headers, arch=arch, mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return eval_result.compilation, eval_result.run @@ -195,12 +195,12 @@ def test_include_dirs(tmp_path: Path): # can also use generic flags argument result = run_cuda_script( - make_system_info(), {"eval.cu": eval_cu, "submission.cu": sub}, header_files, flags=["-I.", f"-I{tmp_path}"], mode=SubmissionMode.TEST.value, tests="size: 256; seed: 42\n", + system=make_system_info(), ) assert result.compilation.success is True diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py index 7cc4fedd..1bd8dd9f 100644 --- a/scripts/ci_test_python.py +++ b/scripts/ci_test_python.py @@ -12,11 +12,11 @@ def run_pytorch_helper(sources: dict, tests=None, **kwargs): result = run_pytorch_script( - make_system_info(), sources, "eval.py", mode=SubmissionMode.TEST.value, tests=tests or "size: 256; seed: 42\n", + system=make_system_info(), **kwargs, ) return result.run @@ -45,7 +45,7 @@ def custom_kernel(input): run = run_pytorch_helper({**files, "submission.py": sub}) assert run.success is True assert run.passed is False - assert "python eval.py test" in run.command + assert "python3 eval.py test" in run.command assert run.stdout == "" assert run.stderr == "" diff --git a/src/kernelbot/discord_reporter.py b/src/kernelbot/discord_reporter.py index 54ba4063..d0551b07 100644 --- a/src/kernelbot/discord_reporter.py +++ b/src/kernelbot/discord_reporter.py @@ -1,5 +1,5 @@ import discord -from discord_utils import _send_split_log, _send_file +from discord_utils import _send_file, _send_split_log from libkernelbot.report import ( File, diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 71f9a7b0..58beaffe 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -351,7 +351,7 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 if prof_run.profile.trace is not None: report.add_log( - f"Profiling {prof_run.run.result.get(f'benchmark.0.spec')}", + f"Profiling {prof_run.run.result.get('benchmark.0.spec')}", make_profile_log(prof_run.run), ) @@ -366,8 +366,8 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901 if prof_run.profile is not None: if prof_run.profile.trace is not None: report.add_file( - f"profile-{_shortname(prof_run.run.result.get(f'benchmark.0.spec'))}.zip", - f"{prof_run.profile.profiler} report - " + prof_run.run.result.get(f"benchmark.0.spec"), + f"profile-{_shortname(prof_run.run.result.get('benchmark.0.spec'))}.zip", + f"{prof_run.profile.profiler} report - " + prof_run.run.result.get("benchmark.0.spec"), base64.b64decode(prof_run.profile.trace), ) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 156f525e..e3879ee7 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -449,8 +449,10 @@ def profile_program_ncu( profile_result = None try: - report = subprocess.check_output(["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"], text=True) - report = _filter_ncu_report(report, ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"]) + get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"] + ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"] + report = subprocess.check_output(ncu_cmd, text=True) + report = _filter_ncu_report(report, get_tables) run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8") except subprocess.CalledProcessError: pass diff --git a/src/runners/modal_runner.py b/src/runners/modal_runner.py index 6a048e62..d2cb0d64 100644 --- a/src/runners/modal_runner.py +++ b/src/runners/modal_runner.py @@ -16,7 +16,7 @@ # Move this to another file later: cuda_image = ( - Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12") + Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.13") .apt_install( "git", "gcc-13", diff --git a/tests/test_report.py b/tests/test_report.py index a1964e62..9006a98e 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -6,6 +6,7 @@ from libkernelbot import consts from libkernelbot.report import ( + File, RunResultReport, _generate_compile_report, _short_fail_reason, @@ -402,7 +403,6 @@ def test_make_profile_log(): log = make_profile_log(run) - assert "Matrix multiplication profile" in log assert " Profile line 1" in log assert " Profile line 2" in log @@ -664,6 +664,7 @@ def test_generate_report_profile(sample_full_result: FullResult): } sample_full_result.runs["profile"].profile = ProfileResult( profiler="NSight", + trace="", download_url="https://example.com", ) report = generate_report(sample_full_result) @@ -687,8 +688,11 @@ def test_generate_report_profile(sample_full_result: FullResult): "❌ Test division\n" "> Division by zero", ), - Log(header="Profiling", content="Benchmark\n\n Profile report\n"), + Log(header='Profiling Benchmark', content=' Profile report\n'), Link("NSight profiling output", "Download from GitHub", "https://example.com"), + File(name='profile-Benchmark.zip', + message='NSight report - Benchmark', + content=b''), ] From e83b0f48d54793d0cfb21217c1e69c74123c297d Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Mon, 10 Nov 2025 19:04:53 +0100 Subject: [PATCH 27/35] Fix: good error for profile via api --- src/kernelbot/api/api_utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/kernelbot/api/api_utils.py b/src/kernelbot/api/api_utils.py index 4082108e..0b0e714c 100644 --- a/src/kernelbot/api/api_utils.py +++ b/src/kernelbot/api/api_utils.py @@ -189,6 +189,8 @@ async def display_report(self, title: str, report: RunResultReport): elif isinstance(part, Log): self.long_report += f"\n\n## {part.header}:\n" self.long_report += f"```\n{part.content}```" + + # ruff: noqa: C901 async def to_submit_info( user_info: Any, @@ -197,14 +199,12 @@ async def to_submit_info( leaderboard_name: str, gpu_type: str, db_context: LeaderboardDB, -) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901 +) -> tuple[SubmissionRequest, SubmissionMode]: # noqa: C901 user_name = user_info["user_name"] user_id = user_info["user_id"] try: - submission_mode_enum: SubmissionMode = SubmissionMode( - submission_mode.lower() - ) + submission_mode_enum: SubmissionMode = SubmissionMode(submission_mode.lower()) except ValueError: raise HTTPException( status_code=400, @@ -222,6 +222,11 @@ async def to_submit_info( SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD, ] + if submission_mode_enum == SubmissionMode.PROFILE: + raise HTTPException( + status_code=400, + detail="Profile submissions are not currently supported via API, use Discord instead.", + ) if submission_mode_enum not in allowed_modes: raise HTTPException( status_code=400, @@ -263,9 +268,7 @@ async def to_submit_info( except HTTPException: raise except Exception as e: - raise HTTPException( - status_code=400, detail=f"Error reading submission file: {e}" - ) from e + raise HTTPException(status_code=400, detail=f"Error reading submission file: {e}") from e try: submission_code = submission_content.decode("utf-8") From 716aca9c6c1dcf9447f5169a05401c2e8f040364 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Mon, 10 Nov 2025 19:07:08 +0100 Subject: [PATCH 28/35] Fix: remove nvidia-smi from workflow --- .github/workflows/nvidia_workflow.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 6f455fbe..ff156a50 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -24,11 +24,6 @@ jobs: steps: - uses: actions/checkout@v3 - - name: nvidia-smi - shell: bash - run: | - nvidia-smi || echo "nvidia-smi failed" - - name: Create input files shell: bash run: | From cb880a71166d55977e7cb2fae24e9ff9a7191171 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Mon, 10 Nov 2025 19:08:33 +0100 Subject: [PATCH 29/35] Fix: polling time to 15s --- src/libkernelbot/launchers/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 3f09b94d..d457d244 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -344,7 +344,7 @@ async def wait_for_completion( return await callback(self) - await asyncio.sleep(20) # Yield control while waiting + await asyncio.sleep(15) # Yield control while waiting except TimeoutError: raise # Re-raise the specific TimeoutError from the timeout block except Exception as e: From 2621ca145931a5e703fb4b108d7c40a99acfa5ea Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:20:17 +0100 Subject: [PATCH 30/35] limit profiling report length --- src/libkernelbot/run_eval.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index e3879ee7..9460c650 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -150,8 +150,20 @@ def _filter_ncu_report(report: str, tables: list): in a *single* discord message. """ result = "" + n_kernels = 0 collect = False for line in report.splitlines(): + if len(line) >= 3 and line[2] != ' ': + if n_kernels != 0: + result += "\n" + if n_kernels == 2: + result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n" + n_kernels += 1 + result += line + "\n" + + if n_kernels > 2: + continue + if "Table Name : " in line: table = line[line.find("Table Name :") + len("Table Name :"):].strip() if table in tables: From af80b61b80f9ae636c3ef9797ddb8fff797950eb Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:20:58 +0100 Subject: [PATCH 31/35] limit number of kernels to be profiled --- src/libkernelbot/run_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 9460c650..62c8880e 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -451,6 +451,7 @@ def profile_program_ncu( "--nvtx", "--nvtx-include", "custom_kernel/", "--import-source", "1", + "-c", "10", "-o", f"{str(output_dir / 'profile.ncu-rep')}", "--", ] + call From 2931fd46e709b2881e9fbfec0ed98209c374cb58 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:30:56 +0100 Subject: [PATCH 32/35] stricter matching for kernel name lines --- src/libkernelbot/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 62c8880e..25a9f3ce 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -153,7 +153,7 @@ def _filter_ncu_report(report: str, tables: list): n_kernels = 0 collect = False for line in report.splitlines(): - if len(line) >= 3 and line[2] != ' ': + if len(line) >= 5 and line[3] == ' ' and line[4] != ' ': if n_kernels != 0: result += "\n" if n_kernels == 2: From 110386e977e40eb1d82d7e47f45d97e9893d6f02 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:33:17 +0100 Subject: [PATCH 33/35] add an additional safety limit to ncu reports --- src/libkernelbot/run_eval.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 25a9f3ce..2bc39caa 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -152,6 +152,7 @@ def _filter_ncu_report(report: str, tables: list): result = "" n_kernels = 0 collect = False + length = 0 for line in report.splitlines(): if len(line) >= 5 and line[3] == ' ' and line[4] != ' ': if n_kernels != 0: @@ -177,6 +178,11 @@ def _filter_ncu_report(report: str, tables: list): if collect: result += line + "\n" + length += 1 + # just as a precaution, also limit lines directly + if length > 100: + result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n" + break return result From 8a4c6b2dc729bd7b044bbc1c563b0f6e1bde5350 Mon Sep 17 00:00:00 2001 From: ngc92 <7938269+ngc92@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:37:53 +0100 Subject: [PATCH 34/35] fix --- src/libkernelbot/run_eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 2bc39caa..727dfe51 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -154,12 +154,12 @@ def _filter_ncu_report(report: str, tables: list): collect = False length = 0 for line in report.splitlines(): - if len(line) >= 5 and line[3] == ' ' and line[4] != ' ': + if len(line) >= 3 and line[1] == ' ' and line[2] != ' ': if n_kernels != 0: result += "\n" - if n_kernels == 2: - result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n" n_kernels += 1 + if n_kernels == 3: + result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n" result += line + "\n" if n_kernels > 2: From c9786fb44c69ea8902b56c7a6d47bb7ef55d5831 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Mon, 10 Nov 2025 20:24:44 +0100 Subject: [PATCH 35/35] Fix: style --- src/libkernelbot/run_eval.py | 76 +++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index 727dfe51..2cd6b397 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -130,16 +130,16 @@ def _create_files(files: Optional[dict[str, str]]): def _directory_to_zip_bytes(directory_path) -> str: """Create a zip archive and return as base64 encoded bytes.""" with tempfile.TemporaryDirectory() as temp_dir: - archive_path = os.path.join(temp_dir, 'archive') - shutil.make_archive(archive_path, 'zip', directory_path) + archive_path = os.path.join(temp_dir, "archive") + shutil.make_archive(archive_path, "zip", directory_path) - with open(archive_path + '.zip', 'rb') as f: + with open(archive_path + ".zip", "rb") as f: data = f.read() - return base64.b64encode(data).decode('utf-8') + return base64.b64encode(data).decode("utf-8") -def _filter_ncu_report(report: str, tables: list): +def _filter_ncu_report(report: str, tables: list): # noqa: C901 """ Extract the Speed-of-light section from the full ncu terminal report. @@ -154,19 +154,19 @@ def _filter_ncu_report(report: str, tables: list): collect = False length = 0 for line in report.splitlines(): - if len(line) >= 3 and line[1] == ' ' and line[2] != ' ': + if len(line) >= 3 and line[1] == " " and line[2] != " ": if n_kernels != 0: result += "\n" n_kernels += 1 if n_kernels == 3: - result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n" + result += "\nAdditional kernel launches follow. Please check the .ncu-rep file for more details.\n" # noqa: E501 result += line + "\n" if n_kernels > 2: continue if "Table Name : " in line: - table = line[line.find("Table Name :") + len("Table Name :"):].strip() + table = line[line.find("Table Name :") + len("Table Name :") :].strip() if table in tables: result += "\n" collect = True @@ -181,7 +181,7 @@ def _filter_ncu_report(report: str, tables: list): length += 1 # just as a precaution, also limit lines directly if length > 100: - result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n" + result += "\n[...]\nReport has been truncated. Please check the .ncu-rep file for more details.\n" # noqa: E501 break return result @@ -406,10 +406,15 @@ def profile_program_roc( "--", ] + call - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }, - ) + run_result = run_program( + call, + seed=seed, + timeout=timeout, + multi_gpu=multi_gpu, + extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) profile_result = None @@ -453,32 +458,49 @@ def profile_program_ncu( # Wrap program in ncu call = [ "ncu", - "--set", "full", + "--set", + "full", "--nvtx", - "--nvtx-include", "custom_kernel/", - "--import-source", "1", - "-c", "10", - "-o", f"{str(output_dir / 'profile.ncu-rep')}", + "--nvtx-include", + "custom_kernel/", + "--import-source", + "1", + "-c", + "10", + "-o", + f"{str(output_dir / 'profile.ncu-rep')}", "--", ] + call - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "POPCORN_NCU": "1" - }) + run_result = run_program( + call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={"POPCORN_NCU": "1"} + ) profile_result = None try: - get_tables = ["GPU Throughput", "Pipe Utilization (% of active cycles)", "Warp State (All Cycles)"] - ncu_cmd = ["ncu", "--import", f"{str(output_dir / 'profile.ncu-rep')}", "--print-details", "body"] + get_tables = [ + "GPU Throughput", + "Pipe Utilization (% of active cycles)", + "Warp State (All Cycles)", + ] + ncu_cmd = [ + "ncu", + "--import", + f"{str(output_dir / 'profile.ncu-rep')}", + "--print-details", + "body", + ] report = subprocess.check_output(ncu_cmd, text=True) report = _filter_ncu_report(report, get_tables) - run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode("utf-8") + run_result.result["benchmark.0.report"] = base64.b64encode(report.encode("utf-8")).decode( + "utf-8" + ) except subprocess.CalledProcessError: pass if run_result.success: profile_result = ProfileResult( - profiler='Nsight-Compute', + profiler="Nsight-Compute", trace=_directory_to_zip_bytes(output_dir), download_url=None, ) @@ -822,9 +844,7 @@ def run_config(config: dict): } if config["lang"] == "py": runner = functools.partial( - run_pytorch_script, - sources=config["sources"], - main=config["main"] + run_pytorch_script, sources=config["sources"], main=config["main"] ) elif config["lang"] == "cu": runner = functools.partial(