diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 0e80f0e2..d469e9d3 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -10,10 +10,12 @@ on:
 jobs:
   health-check:
-    runs-on: [amdgpu-mi300-x86-64]
+    runs-on: [amdgpu-mi300-8-x86-64]
     timeout-minutes: 5
 
     steps:
+      - uses: actions/checkout@v3
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -21,7 +23,30 @@ jobs:
 
       - name: Install PyTorch
         run: |
+          pip install numpy
           pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
 
+      - name: System Information
+        run: |
+          echo "=== ROCm Version ==="
+          rocm-smi --version || rocminfo --version || echo "ROCm version check failed"
+          echo ""
+          echo "=== GPU Driver Info ==="
+          rocm-smi -a || rocminfo || echo "ROCm SMI failed"
+          echo ""
+          echo "=== PyTorch Version ==="
+          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+          python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')"
+          python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')"
+          echo ""
+          echo "=== OS Info ==="
+          uname -a
+          cat /etc/os-release | head -5
+
       - name: GPU Health Check
         run: python -c "import torch; torch.randn(5, device='cuda')"
+
+      - name: Distributed Health Check
+        run: |
+          python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
+          python scripts/test_distributed.py
diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
new file mode 100644
index 00000000..ce521b42
--- /dev/null
+++ b/scripts/test_distributed.py
@@ -0,0 +1,93 @@
+import os
+import signal
+import sys
+from multiprocessing import Pool
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+
+def timeout_handler(signum, frame):
+    print("✗ TIMEOUT: Process hung")
+    sys.exit(1)
+
+
+def test_worker(args):
+    rank, world_size, master_port = args
+    try:
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = str(master_port)
+        os.environ["RANK"] = str(rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(30)
+
+        print(f"Rank {rank}: Init NCCL...")
+        dist.init_process_group(
+            "nccl",
+            init_method="env://",
+            rank=rank,
+            world_size=world_size,
+            device_id=torch.device(f"cuda:{rank}"),
+        )
+        signal.alarm(0)
+
+        device = torch.device(f"cuda:{rank}")
+        tensor = torch.ones(100, device=device) * rank
+
+        signal.alarm(15)
+        dist.all_reduce(tensor)
+        signal.alarm(0)
+
+        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
+        dist.destroy_process_group()
+        return True
+
+    except Exception as e:
+        signal.alarm(0)
+        print(f"✗ Rank {rank}: {e}")
+        return False
+
+
+def main():
+    num_gpus = torch.cuda.device_count()
+    print(f"Testing {num_gpus} GPUs - 4 rounds")
+
+    for round_num in range(4):
+        print(f"=== ROUND {round_num + 1} ===")
+        master_port = 29500 + round_num
+
+        mp.set_start_method("spawn", force=True)
+
+        # Prepare worker arguments
+        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]
+
+        with Pool(processes=num_gpus) as pool:
+            try:
+                # Use map_async with timeout
+                result = pool.map_async(test_worker, worker_args)
+                results = result.get(timeout=60)
+
+                # Check if all workers succeeded
+                if not all(results):
+                    print(f"✗ ROUND {round_num + 1} FAILED")
+                    sys.exit(1)
+
+            except mp.TimeoutError:
+                print(f"✗ ROUND {round_num + 1} HUNG")
+                pool.terminate()
+                pool.join()
+                sys.exit(1)
+            except Exception as e:
+                print(f"✗ ROUND {round_num + 1} ERROR: {e}")
+                sys.exit(1)
+
+        print(f"✓ ROUND {round_num + 1} PASSED")
+
+    print("✓ ALL ROUNDS PASSED")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 1c58f0d0..4c1b1d5f 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -105,6 +105,7 @@ async def run_submission(  # noqa: C901
 
         logger.info("Waiting for workflow to start...")
         timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES
+        logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
         await run.wait_for_completion(
             lambda x: self.wait_callback(x, status), timeout_minutes=timeout
         )
@@ -350,7 +351,6 @@ async def wait_for_completion(
                 logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
                 raise  # Re-raise other exceptions
 
-
     def get_artifact_index(self) -> dict[str, GitHubArtifact]:
         logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()
@@ -368,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]:
 
         return extracted
 
-
     async def download_artifact(self, artifact: GitHubArtifact) -> dict:
         logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
@@ -387,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict:
             return artifact_dict
         else:
             raise RuntimeError(
-                f"Failed to download artifact {artifact.name}. "
-                f"Status code: {response.status_code}"
+                f"Failed to download artifact {artifact.name}. Status code: {response.status_code}"
             )
diff --git a/tests/test_github.py b/tests/test_github.py
index 327f75a3..d1eedfd7 100644
--- a/tests/test_github.py
+++ b/tests/test_github.py
@@ -83,7 +83,7 @@ def github_config():
 
 @pytest.mark.integration
 @pytest.mark.asyncio
-@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300])
+@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8])
 async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU):
     """
     Test GitHubLauncher with a real Python script using real GitHub Actions.
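Note on the new distributed health check above: scripts/test_distributed.py launches one worker per GPU through a multiprocessing.Pool, wraps init_process_group and all_reduce in SIGALRM alarms, and adds a pool-level map_async(...).get(timeout=60) so a hung NCCL/RCCL rank fails the job instead of stalling it until the workflow timeout. For comparison, here is a minimal sketch of the same all-reduce smoke test driven by torchrun instead of a Pool, which also asserts the reduced value rather than only printing it; the file name and launch command are hypothetical and not part of this PR:

    # scripts/test_allreduce_torchrun.py (hypothetical, not part of this PR)
    # Launch: torchrun --standalone --nproc_per_node=8 scripts/test_allreduce_torchrun.py
    import os

    import torch
    import torch.distributed as dist


    def main():
        rank = int(os.environ["RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        torch.cuda.set_device(rank)  # single node: global rank == local rank
        dist.init_process_group("nccl")  # torchrun supplies the env:// rendezvous variables

        tensor = torch.ones(100, device=f"cuda:{rank}") * rank
        dist.all_reduce(tensor)  # default reduce op is SUM

        expected = world_size * (world_size - 1) / 2  # sum of ranks 0..world_size-1
        assert torch.allclose(tensor, torch.full_like(tensor, expected)), (
            f"Rank {rank}: expected {expected}, got {tensor[0].item()}"
        )
        print(f"Rank {rank}: all_reduce OK (sum = {tensor[0].item()})")
        dist.destroy_process_group()


    if __name__ == "__main__":
        main()

With --standalone, torchrun chooses a free rendezvous port itself, which sidesteps the fixed 29500 + round_num port scheme the Pool-based script uses across its four rounds.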