Skip to content

Commit 2b97f30

Browse files
committed
Add simplified Buildkite setup script with proper GPU isolation
1 parent 8db4754 commit 2b97f30

File tree

1 file changed

+121
-0
lines changed

1 file changed

+121
-0
lines changed
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
#!/bin/bash
#
# Buildkite GPU Node Setup - Simplified version
#
# Usage: sudo BUILDKITE_AGENT_TOKEN=xxx GPU_TYPE=test ./setup-node-simple.sh
#
# Environment:
#   BUILDKITE_AGENT_TOKEN  (required) agent registration token
#   GPU_TYPE               (required) GPU label, e.g. b200, mi300, h100, test
#   NODE_NAME              (optional) node label; defaults to `hostname` output

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# === CONFIGURATION ===
# Fail fast, with the messages below, when a required variable is missing or
# empty. The token is checked first, then the GPU type.
: "${BUILDKITE_AGENT_TOKEN:?Must set BUILDKITE_AGENT_TOKEN}"
: "${GPU_TYPE:?Must set GPU_TYPE (e.g., b200, mi300, h100, test)}"

BUILDKITE_TOKEN="${BUILDKITE_AGENT_TOKEN}"
NODE_NAME="${NODE_NAME:-$(hostname)}"

# Auto-detect GPU count

#######################################
# Determine how many GPUs this node has and store it in GPU_COUNT.
#
# When nvidia-smi is available its answer always wins; otherwise the
# caller-supplied GPU_COUNT is used, defaulting to 1.
# The value is validated so that a bad count fails here with a clear message
# instead of later inside an arithmetic expansion.
#
# Globals:   GPU_COUNT (read as fallback, written on success)
# Arguments: none
# Outputs:   error message on stderr when the count cannot be determined
# Returns:   0 on success, 1 if nvidia-smi fails or the count is not a
#            non-negative integer
#######################################
detect_gpu_count() {
  local raw_count
  # Kept in a variable so the regex is not subject to quoting surprises.
  local -r count_re='^[[:space:]]*([0-9]+)[[:space:]]*$'

  if command -v nvidia-smi &> /dev/null; then
    # Capture the full output and take the first line in-shell rather than
    # piping through head, so a failure of nvidia-smi itself is reported.
    if ! raw_count=$(nvidia-smi --query-gpu=count --format=csv,noheader); then
      echo "ERROR: nvidia-smi failed; cannot determine GPU count" >&2
      return 1
    fi
    raw_count="${raw_count%%$'\n'*}"
  else
    raw_count="${GPU_COUNT:-1}"
  fi

  if [[ ! "${raw_count}" =~ ${count_re} ]]; then
    echo "ERROR: invalid GPU count '${raw_count}' (expected a non-negative integer)" >&2
    return 1
  fi

  # Force base 10 so a value such as "08" is not rejected as invalid octal.
  GPU_COUNT=$((10#${BASH_REMATCH[1]}))
}

detect_gpu_count || exit 1

# Summarise the resolved settings before anything on the system is changed.
printf '%s\n' \
  "=== Buildkite GPU Node Setup ===" \
  "Node: ${NODE_NAME}" \
  "GPU Type: ${GPU_TYPE}" \
  "GPU Count: ${GPU_COUNT}" \
  ""

# === INSTALL BUILDKITE AGENT ===
# Installs the agent from Buildkite's apt repository, but only when no
# buildkite-agent binary is already on PATH.
if ! command -v buildkite-agent &> /dev/null; then
  echo "Installing Buildkite Agent..."

  bk_keyring="/usr/share/keyrings/buildkite-agent-archive-keyring.gpg"
  bk_key_url="https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198"

  # DEBIAN_FRONTEND is set per command so it does not leak into the rest of
  # the script; it keeps apt from blocking on interactive prompts.
  DEBIAN_FRONTEND=noninteractive apt-get update
  # curl and ca-certificates are needed for the key download below and are
  # not guaranteed to be present on minimal images.
  DEBIAN_FRONTEND=noninteractive apt-get install -y \
    apt-transport-https ca-certificates curl gnupg

  # --batch --yes lets a re-run overwrite a keyring left behind by an earlier,
  # partially failed attempt instead of stopping on an overwrite prompt.
  curl -fsSL "${bk_key_url}" \
    | gpg --batch --yes --dearmor -o "${bk_keyring}"
  # The keyring must stay readable by apt even under a restrictive umask.
  chmod 0644 "${bk_keyring}"

  echo "deb [signed-by=${bk_keyring}] https://apt.buildkite.com/buildkite-agent stable main" \
    | tee /etc/apt/sources.list.d/buildkite-agent.list

  DEBIAN_FRONTEND=noninteractive apt-get update
  DEBIAN_FRONTEND=noninteractive apt-get install -y buildkite-agent
fi

# === STOP EXISTING AGENTS ===
echo "Stopping existing agents..."

# Errors are deliberately ignored throughout this section: a unit that does
# not exist or is already stopped/disabled is the normal case on a fresh node.

# The stock service is not pinned to a GPU. Stopping it is not enough: if it
# was enabled it would come back after a reboot and see every GPU, so it is
# disabled as well.
systemctl stop buildkite-agent 2>/dev/null || true
systemctl disable buildkite-agent 2>/dev/null || true

# Candidate per-GPU units: the fixed 0..15 range, plus any unit file already
# present in /etc/systemd/system so that higher indices are cleaned up too.
stale_units=()
for ((i = 0; i < 16; i++)); do
  stale_units+=("buildkite-agent-gpu${i}")
done
for unit_file in /etc/systemd/system/buildkite-agent-gpu*.service; do
  # Without nullglob an unmatched pattern stays literal; skip it.
  [[ -e "${unit_file}" ]] || continue
  unit_name="${unit_file##*/}"
  stale_units+=("${unit_name%.service}")
done

for unit_name in "${stale_units[@]}"; do
  systemctl stop "${unit_name}" 2>/dev/null || true
  systemctl disable "${unit_name}" 2>/dev/null || true
done

# === CREATE DIRECTORIES ===
# Root of all build checkouts; the whole agent state tree is handed to the
# buildkite-agent user.
mkdir -p /var/lib/buildkite-agent/builds
chown -R buildkite-agent:buildkite-agent /var/lib/buildkite-agent

# === CONFIGURE GIT TO USE HTTPS ===
# Rewrite SSH-style GitHub remotes to HTTPS for the agent user.
# -H makes sudo set HOME to the target user's home directory. Without it,
# some sudoers policies keep the caller's HOME, and `git config --global`
# would then write to (or fail on) the wrong user's ~/.gitconfig.
sudo -H -u buildkite-agent git config --global url."https://github.com/".insteadOf "git@github.com:"

# === CREATE AGENT FOR EACH GPU ===
# For every GPU index in [0, GPU_COUNT) this writes:
#   - a build directory owned by buildkite-agent,
#   - an agent config file (containing the registration token),
#   - a systemd unit that starts one agent with NVIDIA_VISIBLE_DEVICES and
#     CUDA_VISIBLE_DEVICES set to that index.
echo "Creating ${GPU_COUNT} agents..."

# The config files are written here; make sure the directory exists rather
# than relying on a particular installation method having created it.
mkdir -p /etc/buildkite-agent

for ((gpu_idx = 0; gpu_idx < GPU_COUNT; gpu_idx++)); do
  agent_name="${NODE_NAME}-gpu${gpu_idx}"
  config_file="/etc/buildkite-agent/buildkite-agent-gpu${gpu_idx}.cfg"
  build_dir="/var/lib/buildkite-agent/builds/gpu${gpu_idx}"

  mkdir -p "${build_dir}"
  chown buildkite-agent:buildkite-agent "${build_dir}"

  # Write config
  # The file holds the agent token, so it must not be world-readable.
  # Create (or reset) it empty with mode 0600 and the service user as owner
  # BEFORE the secret is written; the redirect below keeps mode and owner.
  install -m 0600 -o buildkite-agent -g buildkite-agent /dev/null "${config_file}"
  cat > "${config_file}" << EOF
token="${BUILDKITE_TOKEN}"
name="${agent_name}"
tags="queue=${GPU_TYPE},gpu=${GPU_TYPE},gpu-index=${gpu_idx},node=${NODE_NAME}"
build-path="${build_dir}"
hooks-path="/etc/buildkite-agent/hooks"
EOF

  # Write systemd service
  # The heredoc is unquoted on purpose: ${gpu_idx} and ${config_file} are
  # expanded now, so each unit is pinned to its own GPU and config file.
  cat > "/etc/systemd/system/buildkite-agent-gpu${gpu_idx}.service" << EOF
[Unit]
Description=Buildkite Agent (GPU ${gpu_idx})
Documentation=https://buildkite.com/docs/agent/v3
After=network.target

[Service]
Type=simple
User=buildkite-agent
Environment=NVIDIA_VISIBLE_DEVICES=${gpu_idx}
Environment=CUDA_VISIBLE_DEVICES=${gpu_idx}
ExecStart=/usr/bin/buildkite-agent start --config ${config_file}
RestartSec=5
Restart=on-failure
TimeoutStartSec=10
TimeoutStopSec=60

[Install]
WantedBy=multi-user.target
EOF

  echo " Created agent ${gpu_idx}: GPU=${gpu_idx}"
done

# === START AGENTS ===
echo "Starting agents..."

# Make systemd re-read unit files from disk before enabling or starting any.
systemctl daemon-reload

# Enable (start on boot) and start one unit per GPU index in [0, GPU_COUNT).
for ((gpu_idx = 0; gpu_idx < GPU_COUNT; gpu_idx++)); do
  gpu_unit="buildkite-agent-gpu${gpu_idx}"
  systemctl enable "${gpu_unit}"
  systemctl start "${gpu_unit}"
done

# Short pause before the status report that follows.
sleep 3

#######################################
# Print the systemd activity state of one per-GPU agent unit.
#
# `systemctl is-active` prints the state (active, inactive, failed, ...) and
# exits non-zero for every state other than active. The exit status is
# therefore ignored and only an EMPTY answer is replaced by "unknown";
# appending a fallback on non-zero exit would yield two lines such as
# "failed" + "unknown".
#
# Arguments: $1 - GPU index of the unit (buildkite-agent-gpu<index>)
# Outputs:   the state, or "unknown", on stdout
# Returns:   0 always
#######################################
agent_status() {
  local state
  state=$(systemctl is-active "buildkite-agent-gpu${1}" 2>/dev/null) || true
  printf '%s\n' "${state:-unknown}"
}

echo ""
echo "=== Agent Status ==="
for ((gpu_idx = 0; gpu_idx < GPU_COUNT; gpu_idx++)); do
  status=$(agent_status "${gpu_idx}")
  echo " GPU ${gpu_idx}: ${status}"
done

# Closing summary: how many agents were created, for which queue, and where
# to look for them in the Buildkite UI.
printf '%s\n' \
  "" \
  "=== Setup Complete ===" \
  "Created ${GPU_COUNT} agents for queue: ${GPU_TYPE}" \
  "Each agent sees only its assigned GPU via NVIDIA_VISIBLE_DEVICES" \
  "" \
  "Check agents at: https://buildkite.com/organizations/YOUR_ORG/agents"

0 commit comments

Comments
 (0)