27 changes: 26 additions & 1 deletion .github/workflows/amd-health.yml
@@ -10,18 +10,43 @@ on:

jobs:
  health-check:
    runs-on: [amdgpu-mi300-x86-64]
    runs-on: [amdgpu-mi300-8-x86-64]
    timeout-minutes: 5

    steps:
      - uses: actions/checkout@v3

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install PyTorch
        run: |
          pip install numpy
          pip install torch --index-url https://download.pytorch.org/whl/rocm6.3

      - name: System Information
        run: |
          echo "=== ROCm Version ==="
          rocm-smi --version || rocminfo --version || echo "ROCm version check failed"
          echo ""
          echo "=== GPU Driver Info ==="
          rocm-smi -a || rocminfo || echo "ROCm SMI failed"
          echo ""
          echo "=== PyTorch Version ==="
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')"
          python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')"
          echo ""
          echo "=== OS Info ==="
          uname -a
          cat /etc/os-release | head -5

      - name: GPU Health Check
        run: python -c "import torch; torch.randn(5, device='cuda')"

      - name: Distributed Health Check
        run: |
          python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
          python scripts/test_distributed.py
93 changes: 93 additions & 0 deletions scripts/test_distributed.py
@@ -0,0 +1,93 @@
import os
import signal
import sys
from multiprocessing import Pool

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def timeout_handler(signum, frame):
    print("✗ TIMEOUT: Process hung")
    sys.exit(1)


def test_worker(args):
    rank, world_size, master_port = args
    try:
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["RANK"] = str(rank)
        os.environ["WORLD_SIZE"] = str(world_size)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(30)

        print(f"Rank {rank}: Init NCCL...")
        dist.init_process_group(
            "nccl",
            init_method="env://",
            rank=rank,
            world_size=world_size,
            device_id=torch.device(f"cuda:{rank}"),
        )
        signal.alarm(0)

        device = torch.device(f"cuda:{rank}")
        tensor = torch.ones(100, device=device) * rank

        signal.alarm(15)
        dist.all_reduce(tensor)
        signal.alarm(0)

        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
        dist.destroy_process_group()
        return True

    except Exception as e:
        signal.alarm(0)
        print(f"✗ Rank {rank}: {e}")
        return False


def main():
    num_gpus = torch.cuda.device_count()
    print(f"Testing {num_gpus} GPUs - 4 rounds")

    for round_num in range(4):
        print(f"=== ROUND {round_num + 1} ===")
        master_port = 29500 + round_num

        mp.set_start_method("spawn", force=True)

        # Prepare worker arguments
        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]

        with Pool(processes=num_gpus) as pool:
            try:
                # Use map_async with timeout
                result = pool.map_async(test_worker, worker_args)
                results = result.get(timeout=60)

                # Check if all workers succeeded
                if not all(results):
                    print(f"✗ ROUND {round_num + 1} FAILED")
                    sys.exit(1)

            except mp.TimeoutError:
                print(f"✗ ROUND {round_num + 1} HUNG")
                pool.terminate()
                pool.join()
                sys.exit(1)
            except Exception as e:
                print(f"✗ ROUND {round_num + 1} ERROR: {e}")
                sys.exit(1)

        print(f"✓ ROUND {round_num + 1} PASSED")

    print("✓ ALL ROUNDS PASSED")


if __name__ == "__main__":
    main()
6 changes: 2 additions & 4 deletions src/libkernelbot/launchers/github.py
@@ -105,6 +105,7 @@ async def run_submission( # noqa: C901
logger.info("Waiting for workflow to start...")

timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES

logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
await run.wait_for_completion(
lambda x: self.wait_callback(x, status), timeout_minutes=timeout
@@ -350,7 +351,6 @@ async def wait_for_completion(
logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
raise # Re-raise other exceptions


def get_artifact_index(self) -> dict[str, GitHubArtifact]:
logger.info("Creating artifact index for run %s", self.run_id)
artifacts = self.run.get_artifacts()
@@ -368,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]:

        return extracted


    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)

@@ -387,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict:
            return artifact_dict
        else:
            raise RuntimeError(
                f"Failed to download artifact {artifact.name}. "
                f"Status code: {response.status_code}"
                f"Failed to download artifact {artifact.name}. Status code: {response.status_code}"
            )
2 changes: 1 addition & 1 deletion tests/test_github.py
@@ -83,7 +83,7 @@ def github_config():

@pytest.mark.integration
@pytest.mark.asyncio
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300])
@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8])
async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU):
"""
Test GitHubLauncher with a real Python script using real GitHub Actions.