32 changes: 32 additions & 0 deletions .github/workflows/nvidia-new.yml
@@ -0,0 +1,32 @@
name: nvidia-arc

on:
  schedule:
    # Run nightly at 2 AM UTC
    - cron: '0 2 * * *'
  workflow_dispatch:
  push:
    branches: [main]

jobs:
  health-check:
    runs-on: [Nvidia-A100-8-x86-64]
    timeout-minutes: 5
    container:
      image: nvidia/cuda:12.4.0-devel-ubuntu22.04

    steps:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install PyTorch
        run: |
          pip install torch

      - name: GPU Health Check
        run: python -c "import torch; torch.randn(5, device='cuda')"
Copilot AI Oct 13, 2025
The GPU health check command should include error handling and more informative output. Consider adding a check to verify the tensor was created successfully and print GPU information for debugging purposes.

Suggested change
        run: python -c "import torch; torch.randn(5, device='cuda')"
        run: |
          python -c "
          import torch
          try:
              print('PyTorch version:', torch.__version__)
              print('CUDA available:', torch.cuda.is_available())
              if torch.cuda.is_available():
                  print('CUDA device count:', torch.cuda.device_count())
                  print('CUDA device name:', torch.cuda.get_device_name(0))
                  t = torch.randn(5, device='cuda')
                  print('Tensor:', t)
                  print('Tensor device:', t.device)
                  assert t.device.type == 'cuda', 'Tensor is not on CUDA device'
                  print('GPU health check PASSED')
              else:
                  raise RuntimeError('CUDA is not available')
          except Exception as e:
              print('GPU health check FAILED:', e)
              exit(1)
          "


    env:
      CUDA_VISIBLE_DEVICES: 0
Comment on lines +30 to +32
Copilot AI Oct 13, 2025
[nitpick] The environment variable CUDA_VISIBLE_DEVICES should be defined at the step level rather than at the job level, as it's only needed for the GPU health check step. This provides better clarity about which steps require GPU access.

Suggested change
    env:
      CUDA_VISIBLE_DEVICES: 0
        env:
          CUDA_VISIBLE_DEVICES: 0
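For context, a minimal sketch of the step-scoped placement this comment describes, with the env block attached to the GPU Health Check step rather than the job (indentation assumed to match the existing steps; not part of the suggestion itself):

```yaml
      - name: GPU Health Check
        run: python -c "import torch; torch.randn(5, device='cuda')"
        env:
          CUDA_VISIBLE_DEVICES: 0
```

With this layout, only the health-check step sees the restricted device visibility; the Setup Python and Install PyTorch steps run with the runner's default environment.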
