|
6 | 6 | - cron: '0 2 * * *' |
7 | 7 | workflow_dispatch: |
8 | 8 | push: |
9 | | - branches: [main] |
10 | 9 |
|
11 | 10 | jobs: |
12 | 11 | health-check: |
|
28 | 27 |
|
29 | 28 | - name: Distributed Health Check |
30 | 29 | run: | |
31 | | - # Check how many GPUs are available |
32 | 30 | python -c "import torch; print(f'Available GPUs: {torch.cuda.device_count()}')" |
33 | | - |
34 | | - # Test process group initialization in a loop to debug hanging issues |
35 | | - python -c " |
36 | | - import torch |
37 | | - import torch.distributed as dist |
38 | | - import os |
39 | | - import time |
40 | | - import signal |
41 | | - |
42 | | - def timeout_handler(signum, frame): |
43 | | - print('✗ Process group initialization timed out after 30 seconds') |
44 | | - exit(1) |
45 | | - |
46 | | - # Set timeout for process group initialization |
47 | | - signal.signal(signal.SIGALRM, timeout_handler) |
48 | | - |
49 | | - num_gpus = torch.cuda.device_count() |
50 | | - print(f'Testing process group initialization on {num_gpus} GPUs') |
51 | | - |
52 | | - for attempt in range(3): # Try 3 times |
53 | | - try: |
54 | | - print(f'Attempt {attempt + 1}: Initializing process group...') |
55 | | - |
56 | | - # Set environment variables |
57 | | - os.environ['MASTER_ADDR'] = '127.0.0.1' |
58 | | - os.environ['MASTER_PORT'] = str(12345 + attempt) |
59 | | - os.environ['WORLD_SIZE'] = '1' |
60 | | - os.environ['RANK'] = '0' |
61 | | - |
62 | | - # Set 30 second timeout |
63 | | - signal.alarm(30) |
64 | | - |
65 | | - # Test single-process initialization first |
66 | | - dist.init_process_group('nccl', rank=0, world_size=1) |
67 | | - |
68 | | - # Cancel timeout |
69 | | - signal.alarm(0) |
70 | | - |
71 | | - print(f'✓ Attempt {attempt + 1}: Process group initialized successfully') |
72 | | - |
73 | | - # Test basic tensor operations |
74 | | - device = torch.device('cuda:0') |
75 | | - tensor = torch.ones(10, device=device) |
76 | | - print(f'✓ Tensor operations work: {tensor.sum().item()}') |
77 | | - |
78 | | - dist.destroy_process_group() |
79 | | - print(f'✓ Attempt {attempt + 1}: Process group destroyed successfully') |
80 | | - break |
81 | | - |
82 | | - except Exception as e: |
83 | | - signal.alarm(0) # Cancel timeout |
84 | | - print(f'✗ Attempt {attempt + 1} failed: {type(e).__name__}: {e}') |
85 | | - if attempt == 2: # Last attempt |
86 | | - print('✗ All initialization attempts failed') |
87 | | - exit(1) |
88 | | - time.sleep(2) # Wait before retry |
89 | | - |
90 | | - print('✓ Distributed health check passed') |
91 | | - " |
| 31 | + python scripts/test_distributed.py |
0 commit comments