From 9c02d48911487320fce7dd2e7558f846dbfd82ff Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 13:33:16 -0700
Subject: [PATCH 01/17] Enhance GPU health check with distributed testing

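Increase the health check job timeout from 5 to 10 minutes and add a
distributed health check step. The new step prints the number of
available GPUs, spawns one process per GPU (up to 8), initializes an NCCL
process group, and runs an all_reduce to verify that distributed
initialization works.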
---
 .github/workflows/amd-health.yml | 55 +++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 0e80f0e2..ebf4fcbb 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   health-check:
     runs-on: [amdgpu-mi300-x86-64]
-    timeout-minutes: 5
+    timeout-minutes: 10
     
     steps:
     - name: Setup Python
@@ -25,3 +25,56 @@ jobs:
     
     - name: GPU Health Check
       run: python -c "import torch; torch.randn(5, device='cuda')"
+    
+    - name: Distributed Health Check
+      run: |
+        # Check how many GPUs are available
+        python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
+        
+        # Test distributed initialization with 2 GPUs (minimal distributed test)
+        python -c "
+        import torch
+        import torch.distributed as dist
+        import torch.multiprocessing as mp
+        import os
+        import time
+        
+        def test_distributed(rank, world_size, master_port):
+            os.environ['MASTER_ADDR'] = '127.0.0.1'
+            os.environ['MASTER_PORT'] = str(master_port)
+            
+            try:
+                dist.init_process_group('nccl', rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+                print(f'✓ Rank {rank} initialized successfully')
+                
+                # Simple distributed operation test
+                tensor = torch.ones(2, device=f'cuda:{rank}') * rank
+                dist.all_reduce(tensor)
+                print(f'✓ Rank {rank} all_reduce result: {tensor}')
+                
+                dist.destroy_process_group()
+                return True
+            except Exception as e:
+                print(f'✗ Rank {rank} failed: {e}')
+                return False
+        
+        num_gpus = torch.cuda.device_count()
+        world_size = min(num_gpus, 8)  # Test with available GPUs, up to 8
+        master_port = 12345 + int(time.time()) % 1000  # One port for all ranks
+        
+        print(f'Testing distributed initialization with {world_size} GPUs on port {master_port}')
+        
+        mp.set_start_method('spawn', force=True)
+        processes = []
+        for rank in range(world_size):
+            p = mp.Process(target=test_distributed, args=(rank, world_size, master_port))
+            p.start()
+            processes.append(p)
+        
+        for p in processes:
+            p.join()
+            if p.exitcode != 0:
+                print('✗ Distributed test failed')
+                exit(1)
+        print('✓ Distributed health check passed')
+        "

From 01d4aab46add5188a5c8d073ae8194617da0b241 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 13:33:35 -0700
Subject: [PATCH 02/17] Reduce timeout for health check job from 10 to 5
 minutes

---
 .github/workflows/amd-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index ebf4fcbb..7f8b6740 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -11,7 +11,7 @@ on:
 jobs:
   health-check:
     runs-on: [amdgpu-mi300-x86-64]
-    timeout-minutes: 10
+    timeout-minutes: 5
     
     steps:
     - name: Setup Python

From 3de2541aef17998e2403047d8dfbe2744cf5df22 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 14:39:22 -0700
Subject: [PATCH 03/17] Switch health check runner to amdgpu-mi250-x86-64

---
 .github/workflows/amd-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 7f8b6740..080aaef3 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [amdgpu-mi300-x86-64]
+    runs-on: [amdgpu-mi250-x86-64]
     timeout-minutes: 5
     
     steps:

From 4d66efb49239a31e7498ac50eaa408b61bda5b36 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 14:43:54 -0700
Subject: [PATCH 04/17] Update runner for health check workflow

---
 .github/workflows/amd-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 080aaef3..1173e3f4 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [amdgpu-mi250-x86-64]
+    runs-on: [amdgpu-mi300-8-x86-64-gmj4g-runner-fmp9g]
     timeout-minutes: 5
     
     steps:

From 2f44dbed5b7f124311ffbdc34e02b2048bbb1011 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 15:47:35 -0700
Subject: [PATCH 05/17] Update runner for health check workflow

---
 .github/workflows/amd-health.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 1173e3f4..3adae8c7 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   health-check:
-    runs-on: [amdgpu-mi300-8-x86-64-gmj4g-runner-fmp9g]
+    runs-on: [amdgpu-mi300-8-x86-64]
     timeout-minutes: 5
     
     steps:

From 43b360d47d2f5138f1103d1b107fd21fb2850aaa Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 15:54:55 -0700
Subject: [PATCH 06/17] Debug NCCL init hang with single-process retry loop

---
 .github/workflows/amd-health.yml | 83 ++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 3adae8c7..429a15a8 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -19,9 +19,9 @@ jobs:
       with:
         python-version: '3.10'
     
-    - name: Install PyTorch
-      run: |
-        pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
+    # - name: Install PyTorch
+    #   run: |
+    #     pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
     
     - name: GPU Health Check
       run: python -c "import torch; torch.randn(5, device='cuda')"
@@ -31,50 +31,61 @@ jobs:
         # Check how many GPUs are available
         python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
         
-        # Test distributed initialization with 2 GPUs (minimal distributed test)
+        # Test process group initialization in a loop to debug hanging issues
         python -c "
         import torch
         import torch.distributed as dist
-        import torch.multiprocessing as mp
         import os
         import time
+        import signal
+        
+        def timeout_handler(signum, frame):
+            print('✗ Process group initialization timed out after 30 seconds')
+            exit(1)
+        
+        # Set timeout for process group initialization
+        signal.signal(signal.SIGALRM, timeout_handler)
+        
+        num_gpus = torch.cuda.device_count()
+        print(f'Testing process group initialization on {num_gpus} GPUs')
         
-        def test_distributed(rank, world_size, master_port):
-            os.environ['MASTER_ADDR'] = '127.0.0.1'
-            os.environ['MASTER_PORT'] = str(master_port)
-            
+        for attempt in range(3):  # Try 3 times
             try:
-                dist.init_process_group('nccl', rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
-                print(f'✓ Rank {rank} initialized successfully')
+                print(f'Attempt {attempt + 1}: Initializing process group...')
+                
+                # Set environment variables
+                os.environ['MASTER_ADDR'] = '127.0.0.1'
+                os.environ['MASTER_PORT'] = str(12345 + attempt)
+                os.environ['WORLD_SIZE'] = '1'
+                os.environ['RANK'] = '0'
+                
+                # Set 30 second timeout
+                signal.alarm(30)
+                
+                # Test single-process initialization first
+                dist.init_process_group('nccl', rank=0, world_size=1)
                 
-                # Simple distributed operation test
-                tensor = torch.ones(2, device=f'cuda:{rank}') * rank
-                dist.all_reduce(tensor)
-                print(f'✓ Rank {rank} all_reduce result: {tensor}')
+                # Cancel timeout
+                signal.alarm(0)
+                
+                print(f'✓ Attempt {attempt + 1}: Process group initialized successfully')
+                
+                # Test basic tensor operations
+                device = torch.device('cuda:0')
+                tensor = torch.ones(10, device=device)
+                print(f'✓ Tensor operations work: {tensor.sum().item()}')
                 
                 dist.destroy_process_group()
-                return True
+                print(f'✓ Attempt {attempt + 1}: Process group destroyed successfully')
+                break
+                
             except Exception as e:
-                print(f'✗ Rank {rank} failed: {e}')
-                return False
-        
-        num_gpus = torch.cuda.device_count()
-        world_size = min(num_gpus, 8)  # Test with available GPUs, up to 8
-        master_port = 12345 + int(time.time()) % 1000  # One port for all ranks
-        
-        print(f'Testing distributed initialization with {world_size} GPUs on port {master_port}')
-        
-        mp.set_start_method('spawn', force=True)
-        processes = []
-        for rank in range(world_size):
-            p = mp.Process(target=test_distributed, args=(rank, world_size, master_port))
-            p.start()
-            processes.append(p)
+                signal.alarm(0)  # Cancel timeout
+                print(f'✗ Attempt {attempt + 1} failed: {type(e).__name__}: {e}')
+                if attempt == 2:  # Last attempt
+                    print('✗ All initialization attempts failed')
+                    exit(1)
+                time.sleep(2)  # Wait before retry
         
-        for p in processes:
-            p.join()
-            if p.exitcode != 0:
-                print('✗ Distributed test failed')
-                exit(1)
         print('✓ Distributed health check passed')
         "

From 1649475a0c3967f46e959ecb8a6cb212af51390e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 15:57:45 -0700
Subject: [PATCH 07/17] Re-enable PyTorch install step in health check

---
 .github/workflows/amd-health.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 429a15a8..07290cda 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -19,9 +19,9 @@ jobs:
       with:
         python-version: '3.10'
     
-    # - name: Install PyTorch
-    #   run: |
-    #     pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
+    - name: Install PyTorch
+      run: |
+        pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
     
     - name: GPU Health Check
       run: python -c "import torch; torch.randn(5, device='cuda')"

From 60ccce5402acf2f011dfd3e51623df0118bc24c7 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 16:35:54 -0700
Subject: [PATCH 08/17] Move distributed check to scripts/test_distributed.py

---
 .github/workflows/amd-health.yml | 62 +-------------------------
 scripts/test_distributed.py      | 75 ++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 61 deletions(-)
 create mode 100644 scripts/test_distributed.py

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 07290cda..f48ceaf0 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -6,7 +6,6 @@ on:
     - cron: '0 2 * * *'
   workflow_dispatch:
   push:
-    branches: [main]
 
 jobs:
   health-check:
@@ -28,64 +27,5 @@ jobs:
     
     - name: Distributed Health Check
       run: |
-        # Check how many GPUs are available
         python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')"
-        
-        # Test process group initialization in a loop to debug hanging issues
-        python -c "
-        import torch
-        import torch.distributed as dist
-        import os
-        import time
-        import signal
-        
-        def timeout_handler(signum, frame):
-            print('✗ Process group initialization timed out after 30 seconds')
-            exit(1)
-        
-        # Set timeout for process group initialization
-        signal.signal(signal.SIGALRM, timeout_handler)
-        
-        num_gpus = torch.cuda.device_count()
-        print(f'Testing process group initialization on {num_gpus} GPUs')
-        
-        for attempt in range(3):  # Try 3 times
-            try:
-                print(f'Attempt {attempt + 1}: Initializing process group...')
-                
-                # Set environment variables
-                os.environ['MASTER_ADDR'] = '127.0.0.1'
-                os.environ['MASTER_PORT'] = str(12345 + attempt)
-                os.environ['WORLD_SIZE'] = '1'
-                os.environ['RANK'] = '0'
-                
-                # Set 30 second timeout
-                signal.alarm(30)
-                
-                # Test single-process initialization first
-                dist.init_process_group('nccl', rank=0, world_size=1)
-                
-                # Cancel timeout
-                signal.alarm(0)
-                
-                print(f'✓ Attempt {attempt + 1}: Process group initialized successfully')
-                
-                # Test basic tensor operations
-                device = torch.device('cuda:0')
-                tensor = torch.ones(10, device=device)
-                print(f'✓ Tensor operations work: {tensor.sum().item()}')
-                
-                dist.destroy_process_group()
-                print(f'✓ Attempt {attempt + 1}: Process group destroyed successfully')
-                break
-                
-            except Exception as e:
-                signal.alarm(0)  # Cancel timeout
-                print(f'✗ Attempt {attempt + 1} failed: {type(e).__name__}: {e}')
-                if attempt == 2:  # Last attempt
-                    print('✗ All initialization attempts failed')
-                    exit(1)
-                time.sleep(2)  # Wait before retry
-        
-        print('✓ Distributed health check passed')
-        "
+        python scripts/test_distributed.py
diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
new file mode 100644
index 00000000..f4022a8f
--- /dev/null
+++ b/scripts/test_distributed.py
@@ -0,0 +1,75 @@
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import os
+import signal
+import sys
+
+def timeout_handler(signum, frame):
+    print('✗ TIMEOUT: Process hung')
+    sys.exit(1)
+
+def test_worker(rank, world_size, master_port):
+    try:
+        os.environ['MASTER_ADDR'] = '127.0.0.1'
+        os.environ['MASTER_PORT'] = str(master_port)
+        os.environ['RANK'] = str(rank)
+        os.environ['WORLD_SIZE'] = str(world_size)
+        
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(30)
+        
+        print(f'Rank {rank}: Init NCCL...')
+        dist.init_process_group('nccl', rank=rank, world_size=world_size)
+        signal.alarm(0)
+        
+        device = torch.device(f'cuda:{rank}')
+        tensor = torch.ones(100, device=device) * rank
+        
+        signal.alarm(15)
+        dist.all_reduce(tensor)
+        signal.alarm(0)
+        
+        print(f'✓ Rank {rank}: sum = {tensor[0].item()}')
+        dist.destroy_process_group()
+        
+    except Exception as e:
+        signal.alarm(0)
+        print(f'✗ Rank {rank}: {e}')
+        sys.exit(1)
+
+def main():
+    num_gpus = torch.cuda.device_count()
+    print(f'Testing {num_gpus} GPUs - 4 rounds')
+    
+    for round_num in range(4):
+        print(f'=== ROUND {round_num + 1} ===')
+        master_port = 29500 + round_num
+        
+        mp.set_start_method('spawn', force=True)
+        processes = []
+        
+        for rank in range(num_gpus):
+            p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port))
+            p.start()
+            processes.append(p)
+        
+        for i, p in enumerate(processes):
+            p.join(timeout=60)
+            if p.exitcode != 0:
+                print(f'✗ ROUND {round_num + 1} FAILED')
+                for rp in processes:
+                    if rp.is_alive():
+                        rp.terminate()
+                sys.exit(1)
+            elif p.is_alive():
+                print(f'✗ ROUND {round_num + 1} HUNG')
+                p.terminate()
+                sys.exit(1)
+        
+        print(f'✓ ROUND {round_num + 1} PASSED')
+    
+    print('✓ ALL ROUNDS PASSED')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 4b65304a6a4d5e8e3cfd9328d75c134638077dde Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 16:39:21 -0700
Subject: [PATCH 09/17] Add checkout step to health check workflow

---
 .github/workflows/amd-health.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index f48ceaf0..e715e2d4 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -13,6 +13,8 @@ jobs:
     timeout-minutes: 5
     
     steps:
+    - uses: actions/checkout@v3
+    
     - name: Setup Python
       uses: actions/setup-python@v5
       with:

From 825decb789a37c324d1a9e30ce3b8c03df03c93f Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 16:49:24 -0700
Subject: [PATCH 10/17] Apply ruff formatting and install numpy

---
 .github/workflows/amd-health.yml |  1 +
 scripts/test_distributed.py      | 75 +++++++++++++++++---------------
 2 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index e715e2d4..37c1679d 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -22,6 +22,7 @@ jobs:
     
     - name: Install PyTorch
       run: |
+        pip install numpy
         pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
     
     - name: GPU Health Check
diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
index f4022a8f..44647481 100644
--- a/scripts/test_distributed.py
+++ b/scripts/test_distributed.py
@@ -1,75 +1,80 @@
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
 import os
 import signal
 import sys
 
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+
 def timeout_handler(signum, frame):
-    print('✗ TIMEOUT: Process hung')
+    print("✗ TIMEOUT: Process hung")
     sys.exit(1)
 
+
 def test_worker(rank, world_size, master_port):
     try:
-        os.environ['MASTER_ADDR'] = '127.0.0.1'
-        os.environ['MASTER_PORT'] = str(master_port)
-        os.environ['RANK'] = str(rank)
-        os.environ['WORLD_SIZE'] = str(world_size)
-        
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = str(master_port)
+        os.environ["RANK"] = str(rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+
         signal.signal(signal.SIGALRM, timeout_handler)
         signal.alarm(30)
-        
-        print(f'Rank {rank}: Init NCCL...')
-        dist.init_process_group('nccl', rank=rank, world_size=world_size)
+
+        print(f"Rank {rank}: Init NCCL...")
+        dist.init_process_group("nccl", rank=rank, world_size=world_size)
         signal.alarm(0)
-        
-        device = torch.device(f'cuda:{rank}')
+
+        device = torch.device(f"cuda:{rank}")
         tensor = torch.ones(100, device=device) * rank
-        
+
         signal.alarm(15)
         dist.all_reduce(tensor)
         signal.alarm(0)
-        
-        print(f'✓ Rank {rank}: sum = {tensor[0].item()}')
+
+        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
         dist.destroy_process_group()
-        
+
     except Exception as e:
         signal.alarm(0)
-        print(f'✗ Rank {rank}: {e}')
+        print(f"✗ Rank {rank}: {e}")
         sys.exit(1)
 
+
 def main():
     num_gpus = torch.cuda.device_count()
-    print(f'Testing {num_gpus} GPUs - 4 rounds')
-    
+    print(f"Testing {num_gpus} GPUs - 4 rounds")
+
     for round_num in range(4):
-        print(f'=== ROUND {round_num + 1} ===')
+        print(f"=== ROUND {round_num + 1} ===")
         master_port = 29500 + round_num
-        
-        mp.set_start_method('spawn', force=True)
+
+        mp.set_start_method("spawn", force=True)
         processes = []
-        
+
         for rank in range(num_gpus):
             p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port))
             p.start()
             processes.append(p)
-        
-        for i, p in enumerate(processes):
+
+        for _, p in enumerate(processes):
             p.join(timeout=60)
             if p.exitcode != 0:
-                print(f'✗ ROUND {round_num + 1} FAILED')
+                print(f"✗ ROUND {round_num + 1} FAILED")
                 for rp in processes:
                     if rp.is_alive():
                         rp.terminate()
                 sys.exit(1)
             elif p.is_alive():
-                print(f'✗ ROUND {round_num + 1} HUNG')
+                print(f"✗ ROUND {round_num + 1} HUNG")
                 p.terminate()
                 sys.exit(1)
-        
-        print(f'✓ ROUND {round_num + 1} PASSED')
-    
-    print('✓ ALL ROUNDS PASSED')
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+        print(f"✓ ROUND {round_num + 1} PASSED")
+
+    print("✓ ALL ROUNDS PASSED")
+
+
+if __name__ == "__main__":
+    main()

From 9629cfe3ab2d8ff094471ce20022b8b707dcc2fe Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 16:55:14 -0700
Subject: [PATCH 11/17] Run distributed test workers via multiprocessing Pool

---
 scripts/test_distributed.py | 110 ++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 54 deletions(-)

diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
index 44647481..37fe9557 100644
--- a/scripts/test_distributed.py
+++ b/scripts/test_distributed.py
@@ -1,80 +1,82 @@
-import os
-import signal
-import sys
-
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
-
+from multiprocessing import Pool
+import os
+import signal
+import sys
 
 def timeout_handler(signum, frame):
-    print("✗ TIMEOUT: Process hung")
+    print('✗ TIMEOUT: Process hung')
     sys.exit(1)
 
-
-def test_worker(rank, world_size, master_port):
+def test_worker(args):
+    rank, world_size, master_port = args
     try:
-        os.environ["MASTER_ADDR"] = "127.0.0.1"
-        os.environ["MASTER_PORT"] = str(master_port)
-        os.environ["RANK"] = str(rank)
-        os.environ["WORLD_SIZE"] = str(world_size)
-
+        os.environ['MASTER_ADDR'] = '127.0.0.1'
+        os.environ['MASTER_PORT'] = str(master_port)
+        os.environ['RANK'] = str(rank)
+        os.environ['WORLD_SIZE'] = str(world_size)
+        
         signal.signal(signal.SIGALRM, timeout_handler)
         signal.alarm(30)
-
-        print(f"Rank {rank}: Init NCCL...")
-        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+        
+        print(f'Rank {rank}: Init NCCL...')
+        dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
         signal.alarm(0)
-
-        device = torch.device(f"cuda:{rank}")
+        
+        device = torch.device(f'cuda:{rank}')
         tensor = torch.ones(100, device=device) * rank
-
+        
         signal.alarm(15)
         dist.all_reduce(tensor)
         signal.alarm(0)
-
-        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
+        
+        print(f'✓ Rank {rank}: sum = {tensor[0].item()}')
         dist.destroy_process_group()
-
+        return True
+        
     except Exception as e:
         signal.alarm(0)
-        print(f"✗ Rank {rank}: {e}")
-        sys.exit(1)
-
+        print(f'✗ Rank {rank}: {e}')
+        return False
 
 def main():
     num_gpus = torch.cuda.device_count()
-    print(f"Testing {num_gpus} GPUs - 4 rounds")
-
+    print(f'Testing {num_gpus} GPUs - 4 rounds')
+    
     for round_num in range(4):
-        print(f"=== ROUND {round_num + 1} ===")
+        print(f'=== ROUND {round_num + 1} ===')
         master_port = 29500 + round_num
-
-        mp.set_start_method("spawn", force=True)
-        processes = []
-
-        for rank in range(num_gpus):
-            p = mp.Process(target=test_worker, args=(rank, num_gpus, master_port))
-            p.start()
-            processes.append(p)
-
-        for _, p in enumerate(processes):
-            p.join(timeout=60)
-            if p.exitcode != 0:
-                print(f"✗ ROUND {round_num + 1} FAILED")
-                for rp in processes:
-                    if rp.is_alive():
-                        rp.terminate()
+        
+        mp.set_start_method('spawn', force=True)
+        
+        # Prepare worker arguments
+        worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]
+        
+        with Pool(processes=num_gpus) as pool:
+            try:
+                # Use map_async with timeout
+                result = pool.map_async(test_worker, worker_args)
+                results = result.get(timeout=60)
+                
+                # Check if all workers succeeded
+                if not all(results):
+                    print(f'✗ ROUND {round_num + 1} FAILED')
+                    sys.exit(1)
+                    
+            except mp.TimeoutError:
+                print(f'✗ ROUND {round_num + 1} HUNG')
+                pool.terminate()
+                pool.join()
                 sys.exit(1)
-            elif p.is_alive():
-                print(f"✗ ROUND {round_num + 1} HUNG")
-                p.terminate()
+            except Exception as e:
+                print(f'✗ ROUND {round_num + 1} ERROR: {e}')
                 sys.exit(1)
+        
+        print(f'✓ ROUND {round_num + 1} PASSED')
+    
+    print('✓ ALL ROUNDS PASSED')
 
-        print(f"✓ ROUND {round_num + 1} PASSED")
-
-    print("✓ ALL ROUNDS PASSED")
-
-
-if __name__ == "__main__":
-    main()
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 55a291d9088f3bd8d4d2659314d570bc0e608c48 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 23 Sep 2025 17:12:00 -0700
Subject: [PATCH 12/17] Add system information step to health check workflow

---
 .github/workflows/amd-health.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 37c1679d..16d50ea1 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -25,6 +25,23 @@ jobs:
         pip install numpy
         pip install torch --index-url https://download.pytorch.org/whl/rocm6.3
     
+    - name: System Information
+      run: |
+        echo "=== ROCm Version ==="
+        rocm-smi --version || rocminfo --version || echo "ROCm version check failed"
+        echo ""
+        echo "=== GPU Driver Info ==="
+        rocm-smi -a || rocminfo || echo "ROCm SMI failed"
+        echo ""
+        echo "=== PyTorch Version ==="
+        python -c "import torch; print(f'PyTorch: {torch.__version__}')"
+        python -c "import torch; print(f'CUDA/ROCm: {torch.version.cuda}')"
+        python -c "import torch; print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')"
+        echo ""
+        echo "=== OS Info ==="
+        uname -a
+        cat /etc/os-release | head -5
+    
     - name: GPU Health Check
       run: python -c "import torch; torch.randn(5, device='cuda')"
     

From e005de4292f4756c6d761fd6d97ea6295919c918 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 24 Sep 2025 08:33:59 -0700
Subject: [PATCH 13/17] Reformat scripts/test_distributed.py

---
 scripts/test_distributed.py | 87 +++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/scripts/test_distributed.py b/scripts/test_distributed.py
index 37fe9557..ce521b42 100644
--- a/scripts/test_distributed.py
+++ b/scripts/test_distributed.py
@@ -1,82 +1,93 @@
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
-from multiprocessing import Pool
 import os
 import signal
 import sys
+from multiprocessing import Pool
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
 
 def timeout_handler(signum, frame):
-    print('✗ TIMEOUT: Process hung')
+    print("✗ TIMEOUT: Process hung")
     sys.exit(1)
 
+
 def test_worker(args):
     rank, world_size, master_port = args
     try:
-        os.environ['MASTER_ADDR'] = '127.0.0.1'
-        os.environ['MASTER_PORT'] = str(master_port)
-        os.environ['RANK'] = str(rank)
-        os.environ['WORLD_SIZE'] = str(world_size)
-        
+        os.environ["MASTER_ADDR"] = "127.0.0.1"
+        os.environ["MASTER_PORT"] = str(master_port)
+        os.environ["RANK"] = str(rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+
         signal.signal(signal.SIGALRM, timeout_handler)
         signal.alarm(30)
-        
-        print(f'Rank {rank}: Init NCCL...')
-        dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+
+        print(f"Rank {rank}: Init NCCL...")
+        dist.init_process_group(
+            "nccl",
+            init_method="env://",
+            rank=rank,
+            world_size=world_size,
+            device_id=torch.device(f"cuda:{rank}"),
+        )
         signal.alarm(0)
-        
-        device = torch.device(f'cuda:{rank}')
+
+        device = torch.device(f"cuda:{rank}")
         tensor = torch.ones(100, device=device) * rank
-        
+
         signal.alarm(15)
         dist.all_reduce(tensor)
         signal.alarm(0)
-        
-        print(f'✓ Rank {rank}: sum = {tensor[0].item()}')
+
+        print(f"✓ Rank {rank}: sum = {tensor[0].item()}")
         dist.destroy_process_group()
         return True
-        
+
     except Exception as e:
         signal.alarm(0)
-        print(f'✗ Rank {rank}: {e}')
+        print(f"✗ Rank {rank}: {e}")
         return False
 
+
 def main():
     num_gpus = torch.cuda.device_count()
-    print(f'Testing {num_gpus} GPUs - 4 rounds')
-    
+    print(f"Testing {num_gpus} GPUs - 4 rounds")
+
     for round_num in range(4):
-        print(f'=== ROUND {round_num + 1} ===')
+        print(f"=== ROUND {round_num + 1} ===")
         master_port = 29500 + round_num
-        
-        mp.set_start_method('spawn', force=True)
-        
+
+        mp.set_start_method("spawn", force=True)
+
         # Prepare worker arguments
         worker_args = [(rank, num_gpus, master_port) for rank in range(num_gpus)]
-        
+
         with Pool(processes=num_gpus) as pool:
             try:
                 # Use map_async with timeout
                 result = pool.map_async(test_worker, worker_args)
                 results = result.get(timeout=60)
-                
+
                 # Check if all workers succeeded
                 if not all(results):
-                    print(f'✗ ROUND {round_num + 1} FAILED')
+                    print(f"✗ ROUND {round_num + 1} FAILED")
                     sys.exit(1)
-                    
+
             except mp.TimeoutError:
-                print(f'✗ ROUND {round_num + 1} HUNG')
+                print(f"✗ ROUND {round_num + 1} HUNG")
                 pool.terminate()
                 pool.join()
                 sys.exit(1)
             except Exception as e:
-                print(f'✗ ROUND {round_num + 1} ERROR: {e}')
+                print(f"✗ ROUND {round_num + 1} ERROR: {e}")
                 sys.exit(1)
-        
-        print(f'✓ ROUND {round_num + 1} PASSED')
-    
-    print('✓ ALL ROUNDS PASSED')
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+        print(f"✓ ROUND {round_num + 1} PASSED")
+
+    print("✓ ALL ROUNDS PASSED")
+
+
+if __name__ == "__main__":
+    main()

From 3317036d7e40ef5c1b6ccbbeeade5818d025bbb0 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 24 Sep 2025 08:58:50 -0700
Subject: [PATCH 14/17] Add 10 extra minutes to AMD workflow timeout

---
 src/libkernelbot/launchers/github.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 1c58f0d0..2c65c7e2 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -105,6 +105,10 @@ async def run_submission(  # noqa: C901
         logger.info("Waiting for workflow to start...")
 
         timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES
+        # AMD workflows need extra time for PyTorch ROCm installation
+        # Add 10 more minutes
+        if gpu_vendor == "AMD":
+            timeout += 10  
         logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
         await run.wait_for_completion(
             lambda x: self.wait_callback(x, status), timeout_minutes=timeout

From 8910a4bbbd01cc9992f594e0b4f24ff1190ae8db Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 24 Sep 2025 08:59:26 -0700
Subject: [PATCH 15/17] amd-health.yml: restrict push trigger to the main branch

---
 .github/workflows/amd-health.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/amd-health.yml b/.github/workflows/amd-health.yml
index 16d50ea1..d469e9d3 100644
--- a/.github/workflows/amd-health.yml
+++ b/.github/workflows/amd-health.yml
@@ -6,6 +6,7 @@ on:
     - cron: '0 2 * * *'
   workflow_dispatch:
   push:
+    branches: [main]
 
 jobs:
   health-check:

From 441b832a0c33c0175e9986b94f0d37363b64f5b6 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 24 Sep 2025 10:37:31 -0700
Subject: [PATCH 16/17] fix regression test

---
 src/libkernelbot/launchers/github.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py
index 2c65c7e2..4c1b1d5f 100644
--- a/src/libkernelbot/launchers/github.py
+++ b/src/libkernelbot/launchers/github.py
@@ -105,10 +105,7 @@ async def run_submission(  # noqa: C901
         logger.info("Waiting for workflow to start...")
 
         timeout = get_timeout(config) + TIMEOUT_BUFFER_MINUTES
-        # AMD workflows need extra time for PyTorch ROCm installation
-        # Add 10 more minutes
-        if gpu_vendor == "AMD":
-            timeout += 10  
+
         logger.info(f"Waiting for workflow to complete... (timeout: {timeout} minutes)")
         await run.wait_for_completion(
             lambda x: self.wait_callback(x, status), timeout_minutes=timeout
@@ -354,7 +351,6 @@ async def wait_for_completion(
                 logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
                 raise  # Re-raise other exceptions
 
-
     def get_artifact_index(self) -> dict[str, GitHubArtifact]:
         logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()
@@ -372,7 +368,6 @@ def get_artifact_index(self) -> dict[str, GitHubArtifact]:
 
         return extracted
 
-
     async def download_artifact(self, artifact: GitHubArtifact) -> dict:
         logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
 
@@ -391,6 +386,5 @@ async def download_artifact(self, artifact: GitHubArtifact) -> dict:
             return artifact_dict
         else:
             raise RuntimeError(
-                f"Failed to download artifact {artifact.name}. "
-                f"Status code: {response.status_code}"
+                f"Failed to download artifact {artifact.name}. Status code: {response.status_code}"
             )

From d7f934a307bb57937a3d2b6af424abd83b20af2e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 24 Sep 2025 10:40:57 -0700
Subject: [PATCH 17/17] tests: use MI300x8 instead of MI300 in GitHub launcher integration test

---
 tests/test_github.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_github.py b/tests/test_github.py
index 327f75a3..d1eedfd7 100644
--- a/tests/test_github.py
+++ b/tests/test_github.py
@@ -83,7 +83,7 @@ def github_config():
 
 @pytest.mark.integration
 @pytest.mark.asyncio
-@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300])
+@pytest.mark.parametrize("gpu_type", [GitHubGPU.NVIDIA, GitHubGPU.MI300x8])
 async def test_github_launcher_python_script(project_root: Path, github_config: GitHubConfig, gpu_type: GitHubGPU):
     """
     Test GitHubLauncher with a real Python script using real GitHub Actions.