From 9c02d48911487320fce7dd2e7558f846dbfd82ff Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 13:33:16 -0700 Subject: [PATCH 01/17] Enhance GPU health check with distributed testing Increased timeout for health check job and added distributed health check step to verify GPU availability and initialization. --- .github/workflows/amd-health.yml | 55 +++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 0e80f0e2..ebf4fcbb 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -11,7 +11,7 @@ on: jobs: health-check: runs-on: [amdgpu-mi300-x86-64] - timeout-minutes: 5 + timeout-minutes: 10 steps: - name: Setup Python @@ -25,3 +25,56 @@ jobs: - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" + + - name: Distributed Health Check + run: | + # Check how many GPUs are available + python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')" + + # Test distributed initialization with 2 GPUs (minimal distributed test) + python -c " + import torch + import torch.distributed as dist + import torch.multiprocessing as mp + import os + import time + + def test_distributed(rank, world_size, master_port): + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = str(master_port) + + try: + dist.init_process_group('nccl', rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}')) + print(f'✓ Rank {rank} initialized successfully') + + # Simple distributed operation test + tensor = torch.ones(2, device=f'cuda:{rank}') * rank + dist.all_reduce(tensor) + print(f'✓ Rank {rank} all_reduce result: {tensor}') + + dist.destroy_process_group() + return True + except Exception as e: + print(f'✗ Rank {rank} failed: {e}') + return False + + num_gpus = torch.cuda.device_count() + world_size = min(num_gpus, 8) # Test with available GPUs, up to 8 + master_port = 12345 + int(time.time()) % 1000 # One port for all ranks + + print(f'Testing distributed initialization with {world_size} GPUs on port {master_port}') + + mp.set_start_method('spawn', force=True) + processes = [] + for rank in range(world_size): + p = mp.Process(target=test_distributed, args=(rank, world_size, master_port)) + p.start() + processes.append(p) + + for p in processes: + p.join() + if p.exitcode != 0: + print('✗ Distributed test failed') + exit(1) + print('✓ Distributed health check passed') + " From 01d4aab46add5188a5c8d073ae8194617da0b241 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 13:33:35 -0700 Subject: [PATCH 02/17] Reduce timeout for health check job from 10 to 5 minutes --- .github/workflows/amd-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index ebf4fcbb..7f8b6740 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -11,7 +11,7 @@ on: jobs: health-check: runs-on: [amdgpu-mi300-x86-64] - timeout-minutes: 10 + timeout-minutes: 5 steps: - name: Setup Python From 3de2541aef17998e2403047d8dfbe2744cf5df22 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 14:39:22 -0700 Subject: [PATCH 03/17] update --- .github/workflows/amd-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 7f8b6740..080aaef3 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -10,7 +10,7 @@ on: jobs: health-check: - runs-on: [amdgpu-mi300-x86-64] + runs-on: [amdgpu-mi250-x86-64] timeout-minutes: 5 steps: From 4d66efb49239a31e7498ac50eaa408b61bda5b36 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 14:43:54 -0700 Subject: [PATCH 04/17] Update runner for health check workflow --- .github/workflows/amd-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 080aaef3..1173e3f4 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -10,7 +10,7 @@ on: jobs: health-check: - runs-on: [amdgpu-mi250-x86-64] + runs-on: [amdgpu-mi300-8-x86-64-gmj4g-runner-fmp9g] timeout-minutes: 5 steps: From 2f44dbed5b7f124311ffbdc34e02b2048bbb1011 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 15:47:35 -0700 Subject: [PATCH 05/17] Update runner for health check workflow --- .github/workflows/amd-health.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 1173e3f4..3adae8c7 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -10,7 +10,7 @@ on: jobs: health-check: - runs-on: [amdgpu-mi300-8-x86-64-gmj4g-runner-fmp9g] + runs-on: [amdgpu-mi300-8-x86-64] timeout-minutes: 5 steps: From 43b360d47d2f5138f1103d1b107fd21fb2850aaa Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 15:54:55 -0700 Subject: [PATCH 06/17] update --- .github/workflows/amd-health.yml | 83 ++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 3adae8c7..429a15a8 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -19,9 +19,9 @@ jobs: with: python-version: '3.10' - - name: Install PyTorch - run: | - pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 + # - name: Install PyTorch + # run: | + # pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" @@ -31,50 +31,61 @@ jobs: # Check how many GPUs are available python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')" - # Test distributed initialization with 2 GPUs (minimal distributed test) + # Test process group initialization in a loop to debug hanging issues python -c " import torch import torch.distributed as dist - import torch.multiprocessing as mp import os import time + import signal + + def timeout_handler(signum, frame): + print('✗ Process group initialization timed out after 30 seconds') + exit(1) + + # Set timeout for process group initialization + signal.signal(signal.SIGALRM, timeout_handler) + + num_gpus = torch.cuda.device_count() + print(f'Testing process group initialization on {num_gpus} GPUs') - def test_distributed(rank, world_size, master_port): - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = str(master_port) - + for attempt in range(3): # Try 3 times try: - dist.init_process_group('nccl', rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}')) - print(f'✓ Rank {rank} initialized successfully') + print(f'Attempt {attempt + 1}: Initializing process group...') + + # Set environment variables + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = str(12345 + attempt) + os.environ['WORLD_SIZE'] = '1' + os.environ['RANK'] = '0' + + # Set 30 second timeout + signal.alarm(30) + + # Test single-process initialization first + dist.init_process_group('nccl', rank=0, world_size=1) - # Simple distributed operation test - tensor = torch.ones(2, device=f'cuda:{rank}') * rank - dist.all_reduce(tensor) - print(f'✓ Rank {rank} all_reduce result: {tensor}') + # Cancel timeout + signal.alarm(0) + + print(f'✓ Attempt {attempt + 1}: Process group initialized successfully') + + # Test basic tensor operations + device = torch.device('cuda:0') + tensor = torch.ones(10, device=device) + print(f'✓ Tensor operations work: {tensor.sum().item()}') dist.destroy_process_group() - return True + print(f'✓ Attempt {attempt + 1}: Process group destroyed successfully') + break + except Exception as e: - print(f'✗ Rank {rank} failed: {e}') - return False - - num_gpus = torch.cuda.device_count() - world_size = min(num_gpus, 8) # Test with available GPUs, up to 8 - master_port = 12345 + int(time.time()) % 1000 # One port for all ranks - - print(f'Testing distributed initialization with {world_size} GPUs on port {master_port}') - - mp.set_start_method('spawn', force=True) - processes = [] - for rank in range(world_size): - p = mp.Process(target=test_distributed, args=(rank, world_size, master_port)) - p.start() - processes.append(p) + signal.alarm(0) # Cancel timeout + print(f'✗ Attempt {attempt + 1} failed: {type(e).__name__}: {e}') + if attempt == 2: # Last attempt + print('✗ All initialization attempts failed') + exit(1) + time.sleep(2) # Wait before retry - for p in processes: - p.join() - if p.exitcode != 0: - print('✗ Distributed test failed') - exit(1) print('✓ Distributed health check passed') " From 1649475a0c3967f46e959ecb8a6cb212af51390e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 15:57:45 -0700 Subject: [PATCH 07/17] update --- .github/workflows/amd-health.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 429a15a8..07290cda 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -19,9 +19,9 @@ jobs: with: python-version: '3.10' - # - name: Install PyTorch - # run: | - # pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 + - name: Install PyTorch + run: | + pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" From 60ccce5402acf2f011dfd3e51623df0118bc24c7 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 16:35:54 -0700 Subject: [PATCH 08/17] update --- .github/workflows/amd-health.yml | 62 +------------------------- scripts/test_distributed.py | 75 ++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 61 deletions(-) create mode 100644 scripts/test_distributed.py diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 07290cda..f48ceaf0 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -6,7 +6,6 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: - branches: [main] jobs: health-check: @@ -28,64 +27,5 @@ jobs: - name: Distributed Health Check run: | - # Check how many GPUs are available python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')" - - # Test process group initialization in a loop to debug hanging issues - python -c " - import torch - import torch.distributed as dist - import os - import time - import signal - - def timeout_handler(signum, frame): - print('✗ Process group initialization timed out after 30 seconds') - exit(1) - - # Set timeout for process group initialization - signal.signal(signal.SIGALRM, timeout_handler) - - num_gpus = torch.cuda.device_count() - print(f'Testing process group initialization on {num_gpus} GPUs') - - for attempt in range(3): # Try 3 times - try: - print(f'Attempt {attempt + 1}: Initializing process group...') - - # Set environment variables - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = str(12345 + attempt) - os.environ['WORLD_SIZE'] = '1' - os.environ['RANK'] = '0' - - # Set 30 second timeout - signal.alarm(30) - - # Test single-process initialization first - dist.init_process_group('nccl', rank=0, world_size=1) - - # Cancel timeout - signal.alarm(0) - - print(f'✓ Attempt {attempt + 1}: Process group initialized successfully') - - # Test basic tensor operations - device = torch.device('cuda:0') - tensor = torch.ones(10, device=device) - print(f'✓ Tensor operations work: {tensor.sum().item()}') - - dist.destroy_process_group() - print(f'✓ Attempt {attempt + 1}: Process group destroyed successfully') - break - - except Exception as e: - signal.alarm(0) # Cancel timeout - print(f'✗ Attempt {attempt + 1} failed: {type(e).__name__}: {e}') - if attempt == 2: # Last attempt - print('✗ All initialization attempts failed') - exit(1) - time.sleep(2) # Wait before retry - - print('✓ Distributed health check passed') - " + python scripts/test_distributed.py diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py new file mode 100644 index 00000000..f4022a8f --- /dev/null +++ b/scripts/test_distributed.py @@ -0,0 +1,75 @@ +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import os +import signal +import sys + +def timeout_handler(signum, frame): + print('✗ TIMEOUT: Process hung') + sys.exit(1) + +def test_worker(rank, world_size, master_port): + try: + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = str(master_port) + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(30) + + print(f'Rank {rank}: Init NCCL...') + dist.init_process_group('nccl', rank=rank, world_size=world_size) + signal.alarm(0) + + device = torch.device(f'cuda:{rank}') + tensor = torch.ones(100, device=device) * rank + + signal.alarm(15) + dist.all_reduce(tensor) + signal.alarm(0) + + print(f'✓ Rank {rank}: sum = {tensor[0].item()}') + dist.destroy_process_group() + + except Exception as e: + signal.alarm(0) + print(f'✗ Rank {rank}: {e}') + sys.exit(1) + +def main(): + num_gpus = torch.cuda.device_count() + print(f'Testing {num_gpus} GPUs - 4 rounds') + + for round_num in range(4): + print(f'=== ROUND {round_num + 1} ===') + master_port = 29500 + round_num + + mp.set_start_method('spawn', force=True) + processes = [] + + for rank in range(num_gpus): + p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port)) + p.start() + processes.append(p) + + for i, p in enumerate(processes): + p.join(timeout=60) + if p.exitcode != 0: + print(f'✗ ROUND {round_num + 1} FAILED') + for rp in processes: + if rp.is_alive(): + rp.terminate() + sys.exit(1) + elif p.is_alive(): + print(f'✗ ROUND {round_num + 1} HUNG') + p.terminate() + sys.exit(1) + + print(f'✓ ROUND {round_num + 1} PASSED') + + print('✓ ALL ROUNDS PASSED') + +if __name__ == '__main__': + main() \ No newline at end of file From 4b65304a6a4d5e8e3cfd9328d75c134638077dde Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 16:39:21 -0700 Subject: [PATCH 09/17] update --- .github/workflows/amd-health.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index f48ceaf0..e715e2d4 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -13,6 +13,8 @@ jobs: timeout-minutes: 5 steps: + - uses: actions/checkout@v3 + - name: Setup Python uses: actions/setup-python@v5 with: From 825decb789a37c324d1a9e30ce3b8c03df03c93f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 16:49:24 -0700 Subject: [PATCH 10/17] update ruff --- .github/workflows/amd-health.yml | 1 + scripts/test_distributed.py | 75 +++++++++++++++++--------------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index e715e2d4..37c1679d 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -22,6 +22,7 @@ jobs: - name: Install PyTorch run: | + pip install numpy pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 - name: GPU Health Check diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py index f4022a8f..44647481 100644 --- a/scripts/test_distributed.py +++ b/scripts/test_distributed.py @@ -1,75 +1,80 @@ -import torch -import torch.distributed as dist -import torch.multiprocessing as mp import os import signal import sys +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + + def timeout_handler(signum, frame): - print('✗ TIMEOUT: Process hung') + print("✗ TIMEOUT: Process hung") sys.exit(1) + def test_worker(rank, world_size, master_port): try: - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = str(master_port) - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(30) - - print(f'Rank {rank}: Init NCCL...') - dist.init_process_group('nccl', rank=rank, world_size=world_size) + + print(f"Rank {rank}: Init NCCL...") + dist.init_process_group("nccl", rank=rank, world_size=world_size) signal.alarm(0) - - device = torch.device(f'cuda:{rank}') + + device = torch.device(f"cuda:{rank}") tensor = torch.ones(100, device=device) * rank - + signal.alarm(15) dist.all_reduce(tensor) signal.alarm(0) - - print(f'✓ Rank {rank}: sum = {tensor[0].item()}') + + print(f"✓ Rank {rank}: sum = {tensor[0].item()}") dist.destroy_process_group() - + except Exception as e: signal.alarm(0) - print(f'✗ Rank {rank}: {e}') + print(f"✗ Rank {rank}: {e}") sys.exit(1) + def main(): num_gpus = torch.cuda.device_count() - print(f'Testing {num_gpus} GPUs - 4 rounds') - + print(f"Testing {num_gpus} GPUs - 4 rounds") + for round_num in range(4): - print(f'=== ROUND {round_num + 1} ===') + print(f"=== ROUND {round_num + 1} ===") master_port = 29500 + round_num - - mp.set_start_method('spawn', force=True) + + mp.set_start_method("spawn", force=True) processes = [] - + for rank in range(num_gpus): p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port)) p.start() processes.append(p) - - for i, p in enumerate(processes): + + for _, p in enumerate(processes): p.join(timeout=60) if p.exitcode != 0: - print(f'✗ ROUND {round_num + 1} FAILED') + print(f"✗ ROUND {round_num + 1} FAILED") for rp in processes: if rp.is_alive(): rp.terminate() sys.exit(1) elif p.is_alive(): - print(f'✗ ROUND {round_num + 1} HUNG') + print(f"✗ ROUND {round_num + 1} HUNG") p.terminate() sys.exit(1) - - print(f'✓ ROUND {round_num + 1} PASSED') - - print('✓ ALL ROUNDS PASSED') -if __name__ == '__main__': - main() \ No newline at end of file + print(f"✓ ROUND {round_num + 1} PASSED") + + print("✓ ALL ROUNDS PASSED") + + +if __name__ == "__main__": + main() From 9629cfe3ab2d8ff094471ce20022b8b707dcc2fe Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 16:55:14 -0700 Subject: [PATCH 11/17] update --- scripts/test_distributed.py | 110 ++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py index 44647481..37fe9557 100644 --- a/scripts/test_distributed.py +++ b/scripts/test_distributed.py @@ -1,80 +1,82 @@ -import os -import signal -import sys - import torch import torch.distributed as dist import torch.multiprocessing as mp - +from multiprocessing import Pool +import os +import signal +import sys def timeout_handler(signum, frame): - print("✗ TIMEOUT: Process hung") + print('✗ TIMEOUT: Process hung') sys.exit(1) - -def test_worker(rank, world_size, master_port): +def test_worker(args): + rank, world_size, master_port = args try: - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(master_port) - os.environ["RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(world_size) - + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = str(master_port) + os.environ['RANK'] = str(rank) + os.environ['WORLD_SIZE'] = str(world_size) + signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(30) - - print(f"Rank {rank}: Init NCCL...") - dist.init_process_group("nccl", rank=rank, world_size=world_size) + + print(f'Rank {rank}: Init NCCL...') + dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}')) signal.alarm(0) - - device = torch.device(f"cuda:{rank}") + + device = torch.device(f'cuda:{rank}') tensor = torch.ones(100, device=device) * rank - + signal.alarm(15) dist.all_reduce(tensor) signal.alarm(0) - - print(f"✓ Rank {rank}: sum = {tensor[0].item()}") + + print(f'✓ Rank {rank}: sum = {tensor[0].item()}') dist.destroy_process_group() - + return True + except Exception as e: signal.alarm(0) - print(f"✗ Rank {rank}: {e}") - sys.exit(1) - + print(f'✗ Rank {rank}: {e}') + return False def main(): num_gpus = torch.cuda.device_count() - print(f"Testing {num_gpus} GPUs - 4 rounds") - + print(f'Testing {num_gpus} GPUs - 4 rounds') + for round_num in range(4): - print(f"=== ROUND {round_num + 1} ===") + print(f'=== ROUND {round_num + 1} ===') master_port = 29500 + round_num - - mp.set_start_method("spawn", force=True) - processes = [] - - for rank in range(num_gpus): - p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port)) - p.start() - processes.append(p) - - for _, p in enumerate(processes): - p.join(timeout=60) - if p.exitcode != 0: - print(f"✗ ROUND {round_num + 1} FAILED") - for rp in processes: - if rp.is_alive(): - rp.terminate() + + mp.set_start_method('spawn', force=True) + + # Prepare worker arguments + worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)] + + with Pool(processes=num_gpus) as pool: + try: + # Use map_async with timeout + result = pool.map_async(test_worker, worker_args) + results = result.get(timeout=60) + + # Check if all workers succeeded + if not all(results): + print(f'✗ ROUND {round_num + 1} FAILED') + sys.exit(1) + + except mp.TimeoutError: + print(f'✗ ROUND {round_num + 1} HUNG') + pool.terminate() + pool.join() sys.exit(1) - elif p.is_alive(): - print(f"✗ ROUND {round_num + 1} HUNG") - p.terminate() + except Exception as e: + print(f'✗ ROUND {round_num + 1} ERROR: {e}') sys.exit(1) + + print(f'✓ ROUND {round_num + 1} PASSED') + + print('✓ ALL ROUNDS PASSED') - print(f"✓ ROUND {round_num + 1} PASSED") - - print("✓ ALL ROUNDS PASSED") - - -if __name__ == "__main__": - main() +if __name__ == '__main__': + main() \ No newline at end of file From 55a291d9088f3bd8d4d2659314d570bc0e608c48 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 23 Sep 2025 17:12:00 -0700 Subject: [PATCH 12/17] update --- .github/workflows/amd-health.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 37c1679d..16d50ea1 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -25,6 +25,23 @@ jobs: pip install numpy pip install torch --index-url https://download.pytorch.org/whl/rocm6.3 + - name: System Information + run: | + echo "=== ROCm Version ===" + rocm-smi --version || rocminfo --version || echo "ROCm version check failed" + echo "" + echo "=== GPU Driver Info ===" + rocm-smi -a || rocminfo || echo "ROCm SMI failed" + echo "" + echo "=== PyTorch Version ===" + python -c "import torch; print(f'PyTorch: {torch.__version__}')" + python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')" + python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')" + echo "" + echo "=== OS Info ===" + uname -a + cat /etc/os-release | head -5 + - name: GPU Health Check run: python -c "import torch; torch.randn(5, device='cuda')" From e005de4292f4756c6d761fd6d97ea6295919c918 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Sep 2025 08:33:59 -0700 Subject: [PATCH 13/17] update --- scripts/test_distributed.py | 87 +++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py index 37fe9557..ce521b42 100644 --- a/scripts/test_distributed.py +++ b/scripts/test_distributed.py @@ -1,82 +1,93 @@ -import torch -import torch.distributed as dist -import torch.multiprocessing as mp -from multiprocessing import Pool import os import signal import sys +from multiprocessing import Pool + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + def timeout_handler(signum, frame): - print('✗ TIMEOUT: Process hung') + print("✗ TIMEOUT: Process hung") sys.exit(1) + def test_worker(args): rank, world_size, master_port = args try: - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = str(master_port) - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(30) - - print(f'Rank {rank}: Init NCCL...') - dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}')) + + print(f"Rank {rank}: Init NCCL...") + dist.init_process_group( + "nccl", + init_method="env://", + rank=rank, + world_size=world_size, + device_id=torch.device(f"cuda:{rank}"), + ) signal.alarm(0) - - device = torch.device(f'cuda:{rank}') + + device = torch.device(f"cuda:{rank}") tensor = torch.ones(100, device=device) * rank - + signal.alarm(15) dist.all_reduce(tensor) signal.alarm(0) - - print(f'✓ Rank {rank}: sum = {tensor[0].item()}') + + print(f"✓ Rank {rank}: sum = {tensor[0].item()}") dist.destroy_process_group() return True - + except Exception as e: signal.alarm(0) - print(f'✗ Rank {rank}: {e}') + print(f"✗ Rank {rank}: {e}") return False + def main(): num_gpus = torch.cuda.device_count() - print(f'Testing {num_gpus} GPUs - 4 rounds') - + print(f"Testing {num_gpus} GPUs - 4 rounds") + for round_num in range(4): - print(f'=== ROUND {round_num + 1} ===') + print(f"=== ROUND {round_num + 1} ===") master_port = 29500 + round_num - - mp.set_start_method('spawn', force=True) - + + mp.set_start_method("spawn", force=True) + # Prepare worker arguments worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)] - + with Pool(processes=num_gpus) as pool: try: # Use map_async with timeout result = pool.map_async(test_worker, worker_args) results = result.get(timeout=60) - + # Check if all workers succeeded if not all(results): - print(f'✗ ROUND {round_num + 1} FAILED') + print(f"✗ ROUND {round_num + 1} FAILED") sys.exit(1) - + except mp.TimeoutError: - print(f'✗ ROUND {round_num + 1} HUNG') + print(f"✗ ROUND {round_num + 1} HUNG") pool.terminate() pool.join() sys.exit(1) except Exception as e: - print(f'✗ ROUND {round_num + 1} ERROR: {e}') + print(f"✗ ROUND {round_num + 1} ERROR: {e}") sys.exit(1) - - print(f'✓ ROUND {round_num + 1} PASSED') - - print('✓ ALL ROUNDS PASSED') -if __name__ == '__main__': - main() \ No newline at end of file + print(f"✓ ROUND {round_num + 1} PASSED") + + print("✓ ALL ROUNDS PASSED") + + +if __name__ == "__main__": + main() From 3317036d7e40ef5c1b6ccbbeeade5818d025bbb0 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Sep 2025 08:58:50 -0700 Subject: [PATCH 14/17] update timeout --- src/libkernelbot/launchers/github.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 1c58f0d0..2c65c7e2 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -105,6 +105,10 @@ async def run_submission( # noqa: C901 logger.info("Waiting for workflow to start...") timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES + # AMD workflows need extra time for PyTorch ROCm installation + # Add 10 more minutes + if gpu_vendor == "AMD": + timeout += 10 logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)") await run.wait_for_completion( lambda x: self.wait_callback(x, status), timeout_minutes=timeout From 8910a4bbbd01cc9992f594e0b4f24ff1190ae8db Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Sep 2025 08:59:26 -0700 Subject: [PATCH 15/17] Update amd-health.yml --- .github/workflows/amd-health.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml index 16d50ea1..d469e9d3 100644 --- a/.github/workflows/amd-health.yml +++ b/.github/workflows/amd-health.yml @@ -6,6 +6,7 @@ on: - cron: '0 2 * * *' workflow_dispatch: push: + branches: [main] jobs: health-check: From 441b832a0c33c0175e9986b94f0d37363b64f5b6 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Sep 2025 10:37:31 -0700 Subject: [PATCH 16/17] fix regression test --- src/libkernelbot/launchers/github.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index 2c65c7e2..4c1b1d5f 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -105,10 +105,7 @@ async def run_submission( # noqa: C901 logger.info("Waiting for workflow to start...") timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES - # AMD workflows need extra time for PyTorch ROCm installation - # Add 10 more minutes - if gpu_vendor == "AMD": - timeout += 10 + logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)") await run.wait_for_completion( lambda x: self.wait_callback(x, status), timeout_minutes=timeout @@ -354,7 +351,6 @@ async def wait_for_completion( logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e) raise # Re-raise other exceptions - def get_artifact_index(self) -> dict[str, GitHubArtifact]: logger.info("Creating artifact index for run %s", self.run_id) artifacts = self.run.get_artifacts() @@ -372,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]: return extracted - async def download_artifact(self, artifact: GitHubArtifact) -> dict: logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id) @@ -391,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict: return artifact_dict else: raise RuntimeError( - f"Failed to download artifact {artifact.name}. " - f"Status code: {response.status_code}" + f"Failed to download artifact {artifact.name}. Status code: {response.status_code}" ) From d7f934a307bb57937a3d2b6af424abd83b20af2e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 24 Sep 2025 10:40:57 -0700 Subject: [PATCH 17/17] update --- tests/test_github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_github.py b/tests/test_github.py index 327f75a3..d1eedfd7 100644 --- a/tests/test_github.py +++ b/tests/test_github.py @@ -83,7 +83,7 @@ def github_config(): @pytest.mark.integration @pytest.mark.asyncio -@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300]) +@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8]) async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU): """ Test GitHubLauncher with a real Python script using real GitHub Actions.