From 3bf6f6450709ec640ee4d3fdff3a6d243bf34937 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:32:19 -0700 Subject: [PATCH 01/18] Change runner from gpumode-nvidia-arc to Nvidia-A100 --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 13841d27..7e2e4104 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -10,7 +10,7 @@ on: jobs: health-check: - runs-on: [gpumode-nvidia-arc] + runs-on: [Nvidia-A100-8-x86-64] timeout-minutes: 5 container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 From 5f40e369284bbc5baf0c3a9cf1dfd048d3104cfe Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:35:37 -0700 Subject: [PATCH 02/18] Update nvidia-arc-health.yml --- .github/workflows/nvidia-arc-health.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 7e2e4104..babdf01c 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -6,7 +6,6 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: - branches: [main] jobs: health-check: From e3ac7307681b9726ef8802af8c079f9b68f25416 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 13 Oct 2025 16:46:24 -0700 Subject: [PATCH 03/18] Update nvidia-arc-health.yml --- .github/workflows/nvidia-arc-health.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index babdf01c..0c552666 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -11,8 +11,6 @@ jobs: health-check: runs-on: [Nvidia-A100-8-x86-64] timeout-minutes: 5 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - name: Setup Python From c60090bf4b68e966f845b5a10b1d308ef27c850f Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:15:56 -0700 Subject: [PATCH 04/18] Feat: run health on b200 --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 0c552666..47d4a54e 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -9,7 +9,7 @@ on: jobs: health-check: - runs-on: [Nvidia-A100-8-x86-64] + runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64] timeout-minutes: 5 steps: From 2a69a10a83480e78e2e9fed2143b77a2f1bb54cd Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:16:37 -0700 Subject: [PATCH 05/18] tmp --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 47d4a54e..67926d82 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -9,7 +9,7 @@ on: jobs: health-check: - runs-on: [nvidia-docker-b200-8-x86-64, Nvidia-A100-8-x86-64] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 steps: From 9a6c08d82184d3af8143a257856787010b5f614c Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:18:02 -0700 Subject: [PATCH 06/18] tmp --- .github/workflows/nvidia-arc-health.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 67926d82..a07c8ada 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -18,10 +18,6 @@ jobs: with: python-version: '3.10' - - name: Install PyTorch - run: | - pip install torch - - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" From aa2f8946a68cc6af81ea807623fb4d2bb6e2094b Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:18:42 -0700 Subject: [PATCH 07/18] tmp --- .github/workflows/nvidia-arc-health.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index a07c8ada..14e19aae 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -12,14 +12,8 @@ jobs: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 - steps: - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From fbc28addf29f76b66165218ef063ca3d196b8870 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:20:48 -0700 Subject: [PATCH 08/18] feat From 6437e1975bf7397da85355de011fa6fb93e3b3c8 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:21:20 -0700 Subject: [PATCH 09/18] feat --- .github/workflows/nvidia-arc-health.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 14e19aae..87f58473 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -12,8 +12,9 @@ jobs: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 5 + steps: - name: GPU Health Check - run: python3 -c "import torch; torch.randn(5, device='cuda')" + run: python -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From 6c4bde053c23a5155fc57190ae5bdc649550d93e Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:21:45 -0700 Subject: [PATCH 10/18] feat --- .github/workflows/nvidia-arc-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-arc-health.yml b/.github/workflows/nvidia-arc-health.yml index 87f58473..a619f8e1 100644 --- a/.github/workflows/nvidia-arc-health.yml +++ b/.github/workflows/nvidia-arc-health.yml @@ -14,7 +14,7 @@ jobs: steps: - name: GPU Health Check - run: python -c "import torch; torch.randn(5, device='cuda')" + run: python3 -c "import torch; torch.randn(5, device='cuda')" env: CUDA_VISIBLE_DEVICES: 0 From a3e045c77b9a399b5c15e7abaec7e7361fa83aea Mon Sep 17 00:00:00 2001 From: Alex Zhang Date: Sat, 1 Nov 2025 14:26:08 -0400 Subject: [PATCH 11/18] replace nvidia workflow to point to our b200 cluster --- .github/workflows/nvidia_workflow.yml | 32 +-------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b50ec044..59811d30 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -19,23 +19,13 @@ run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}' jobs: run: - runs-on: [gpumode-nvidia-arc] + runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 container: image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - name: Create input files shell: bash run: | @@ -49,26 +39,6 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - - - name: Setup Python environment - shell: bash - run: | - uv venv .venv - echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV - echo "$PWD/.venv/bin" >> $GITHUB_PATH - - if [[ -n "${{ github.event.inputs.requirements }}" ]]; then - cat > "requirements.txt" <<'EOL' - ${{ github.event.inputs.requirements }} - EOL - uv pip install -r "requirements.txt" - fi - uv pip install -e . - - name: Run script shell: bash run: | From 844d3bf85266f4b2a2f12df2f49da9b1c2b1ddbe Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:33:13 -0700 Subject: [PATCH 12/18] Fix: container --- .github/workflows/nvidia_workflow.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 59811d30..b5ad437e 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -21,8 +21,6 @@ jobs: run: runs-on: [nvidia-docker-b200-8-x86-64] timeout-minutes: 10 - container: - image: nvidia/cuda:12.4.0-devel-ubuntu22.04 steps: - uses: actions/checkout@v3 @@ -58,5 +56,3 @@ jobs: name: profile-data path: profile_data/* retention-days: 1 - env: - CUDA_VISIBLE_DEVICES: 0 From 3275924f6f3a9728d386612dc0ef9844c54f3b5c Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:35:10 -0700 Subject: [PATCH 13/18] Fix: python->python3 --- .github/workflows/nvidia_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index b5ad437e..766072cc 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -40,7 +40,7 @@ jobs: - name: Run script shell: bash run: | - python src/runners/github-runner.py + python3 src/runners/github-runner.py - name: Upload training artifacts uses: actions/upload-artifact@v4 From b19b59bf336c919be5b3c481c7df3b342cb64eac Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:37:38 -0700 Subject: [PATCH 14/18] Fix: add back deps --- .github/workflows/nvidia_workflow.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 766072cc..ff156a50 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -37,6 +37,14 @@ jobs: # Now write to file (won't be logged since it's masked) echo "$PAYLOAD" > payload.json + - name: Setup Virtual Environment and Install Dependencies + shell: bash + run: | + pip install --upgrade pip + pip install -r "requirements.txt" + pip install -e . + + - name: Run script shell: bash run: | From 3e8eb6fc9bd88fd590c6feec93d0a5b7a9d28614 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:39:06 -0700 Subject: [PATCH 15/18] Fix: python->python3 --- src/libkernelbot/run_eval.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index c0897baf..a7eadb8b 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -22,7 +22,7 @@ class ProfileResult: # Public download URL of all files created by the profiler # This may also be configured later download_url: Optional[str] - #fmt: on + # fmt: on @dataclasses.dataclass @@ -351,9 +351,15 @@ def profile_program( "--", ] + call - run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu, extra_env={ - "GPU_DUMP_CODE_OBJECT": "1", - }) + run_result = run_program( + call, + seed=seed, + timeout=timeout, + multi_gpu=multi_gpu, + extra_env={ + "GPU_DUMP_CODE_OBJECT": "1", + }, + ) profile_result = None @@ -377,7 +383,7 @@ def profile_program( code_obj.rename(output_dir / code_obj.name) profile_result = ProfileResult( - profiler='rocPROF', + profiler="rocPROF", download_url=None, ) @@ -386,6 +392,7 @@ def profile_program( # TODO: Implement profiling for other platforms return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None + def run_single_evaluation( system: SystemInfo, call: list[str], @@ -427,7 +434,7 @@ def run_single_evaluation( return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None -def make_system_info() -> SystemInfo: # noqa: C901 +def make_system_info() -> SystemInfo: # noqa: C901 info = SystemInfo() try: import torch @@ -448,14 +455,16 @@ def make_system_info() -> SystemInfo: # noqa: C901 info.gpu = subprocess.check_output( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], encoding="utf-8" ) - info.device_count = info.gpu.count('\n') + info.device_count = info.gpu.count("\n") info.runtime = "CUDA" except subprocess.CalledProcessError: # try again for HIP try: - rocm_info = json.loads(subprocess.check_output( - ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" - )) + rocm_info = json.loads( + subprocess.check_output( + ["rocm-smi", "--showproductname", "--json"], encoding="utf-8" + ) + ) if len(rocm_info) > 0: info.gpu = next(rocm_info.__iter__())["Card Series"] @@ -587,7 +596,7 @@ def run_pytorch_script( # noqa: C901 # "compile" step: execute the script once. Will populate # `load_inline`'s compile cache, so the actual runs will be faster. try: - compile_run = run_program(["python", "submission.py"], seed=1, timeout=Timeout.COMPILE) + compile_run = run_program(["python3", "submission.py"], seed=1, timeout=Timeout.COMPILE) if "-DTORCH_EXTENSION_NAME" in compile_run.stdout: comp = CompileResult( nvcc_found=True, From 998cf42c18b4fbe5d34a2eb747458f96f8b8b00e Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:41:25 -0700 Subject: [PATCH 16/18] Fix: python->python3 --- src/libkernelbot/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index a7eadb8b..de448784 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -622,7 +622,7 @@ def run_pytorch_script( # noqa: C901 exit_code=e.returncode, ) - run, profile = run_single_evaluation(system, ["python", main], **kwargs) + run, profile = run_single_evaluation(system, ["python3", main], **kwargs) return EvalResult( start=start, From 1de31fd608cb8fa9c3a0bc4f62b0bbc849c507cb Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:49:22 -0700 Subject: [PATCH 17/18] Add nvidia-smi --- .github/workflows/nvidia_workflow.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index ff156a50..6f455fbe 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -24,6 +24,11 @@ jobs: steps: - uses: actions/checkout@v3 + - name: nvidia-smi + shell: bash + run: | + nvidia-smi || echo "nvidia-smi failed" + - name: Create input files shell: bash run: | From 5c8af957bdd30b08e5bba23877d19f07c6656090 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 1 Nov 2025 11:53:49 -0700 Subject: [PATCH 18/18] Add ncu --- .github/workflows/nvidia_workflow.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 6f455fbe..dee90785 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -28,6 +28,11 @@ jobs: shell: bash run: | nvidia-smi || echo "nvidia-smi failed" + + - name: ncu + shell: bash + run: | + ncu --version || echo "ncu failed" - name: Create input files shell: bash