27 changes: 26 additions & 1 deletion .github/workflows/amd-health.yml
@@ -10,18 +10,43 @@ on:

jobs:
  health-check:
    runs-on: [amdgpu-mi300-x86-64]
    runs-on: [amdgpu-mi300-8-x86-64]
    timeout-minutes: 5

    steps:
      - uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install PyTorch
        run: |
          pip install numpy
          pip install torch --index-url https://download.pytorch.org/whl/rocm6.3

      - name: System Information
        run: |
          echo "=== ROCm Version ==="
          rocm-smi --version || rocminfo --version || echo "ROCm version check failed"
          echo ""
          echo "=== GPU Driver Info ==="
          rocm-smi -a || rocminfo || echo "ROCm SMI failed"
          echo ""
          echo "=== PyTorch Version ==="
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')"
          python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')"
          echo ""
          echo "=== OS Info ==="
          uname -a
          cat /etc/os-release | head -5

      - name: GPU Health Check
        run: python -c "import torch; torch.randn(5, device='cuda')"

      - name: Distributed Health Check
        run: |
          python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
          python scripts/test_distributed.py
93 changes: 93 additions & 0 deletions scripts/test_distributed.py
@@ -0,0 +1,93 @@
import os
import signal
import sys
from multiprocessing import Pool

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def timeout_handler(signum, frame):
    print("✗ TIMEOUT: Process hung")
    sys.exit(1)


def test_worker(args):
    rank, world_size, master_port = args
    try:
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(30)

        print(f"Rank {rank}: Init NCCL...")
        dist.init_process_group(
            "nccl",
            init_method="env://",
            rank=rank,
            world_size=world_size,
            device_id=torch.device(f"cuda:{rank}"),
        )
        signal.alarm(0)

        device = torch.device(f"cuda:{rank}")
        tensor = torch.ones(100, device=device) * rank

        signal.alarm(15)
        dist.all_reduce(tensor)
        signal.alarm(0)

        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
        dist.destroy_process_group()
        return True

    except Exception as e:
        signal.alarm(0)
        print(f"✗ Rank {rank}: {e}")
        return False


def main():
    num_gpus = torch.cuda.device_count()
    print(f"Testing {num_gpus} GPUs - 4 rounds")

    for round_num in range(4):
        print(f"=== ROUND {round_num + 1} ===")
        master_port = 29500 + round_num

        mp.set_start_method("spawn", force=True)

        # Prepare worker arguments
        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]

        with Pool(processes=num_gpus) as pool:
            try:
                # Use map_async with timeout
                result = pool.map_async(test_worker, worker_args)
                results = result.get(timeout=60)

                # Check if all workers succeeded
                if not all(results):
                    print(f"✗ ROUND {round_num + 1} FAILED")
                    sys.exit(1)

            except mp.TimeoutError:
                print(f"✗ ROUND {round_num + 1} HUNG")
                pool.terminate()
                pool.join()
                sys.exit(1)
            except Exception as e:
                print(f"✗ ROUND {round_num + 1} ERROR: {e}")
                sys.exit(1)

        print(f"✓ ROUND {round_num + 1} PASSED")

    print("✓ ALL ROUNDS PASSED")


if __name__ == "__main__":
    main()
6 changes: 2 additions & 4 deletions src/libkernelbot/launchers/github.py
@@ -105,6 +105,7 @@ async def run_submission( # noqa: C901
logger.info("Waiting for workflow to start...")

timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES

logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
await run.wait_for_completion(
lambda x: self.wait_callback(x, status), timeout_minutes=timeout
@@ -350,7 +351,6 @@ async def wait_for_completion(
logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
raise # Re-raise other exceptions


def get_artifact_index(self) -> dict[str, GitHubArtifact]:
logger.info("Creating artifact index for run %s", self.run_id)
artifacts = self.run.get_artifacts()
@@ -368,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]:

        return extracted


    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)

@@ -387,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict:
            return artifact_dict
        else:
            raise RuntimeError(
                f"Failed to download artifact {artifact.name}. "
                f"Status code: {response.status_code}"
                f"Failed to download artifact {artifact.name}. Status code: {response.status_code}"
            )
2 changes: 1 addition & 1 deletion tests/test_github.py
@@ -83,7 +83,7 @@ def github_config():

@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300])
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8])
async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU):
"""
Test GitHubLauncher with a real Python script using real GitHub Actions.