From 6ddfcd099ee9128a3ff05ab6566bbd2faf0ffe18 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 11 Dec 2024 04:05:06 +0000 Subject: [PATCH] disable wandb for non 0 rank --- scripts/simulate_multi_node_diloco.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/simulate_multi_node_diloco.sh b/scripts/simulate_multi_node_diloco.sh index 2c0273ef..0b13fe3b 100755 --- a/scripts/simulate_multi_node_diloco.sh +++ b/scripts/simulate_multi_node_diloco.sh @@ -61,7 +61,7 @@ export GLOO_SOCKET_IFNAME=lo for i in $(seq 0 $(($N - 1 ))) do > logs/log$i.log - WANDB_MODE=$([ $i -eq 0 ] && echo "online" || echo "disable") GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ --data.data_rank $i --data.data_world_size $N > logs/log$i.log 2>&1 & + WANDB_MODE=$([ $i -eq 0 ] && echo "online" || echo "offline") GLOBAL_UNIQUE_ID=$i GLOBAL_RANK=$i CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) uv run torchrun --nproc_per_node=$NUM_GPU --node-rank 0 --rdzv-endpoint localhost:$((BASE_PORT + $i)) --nnodes=1 $@ --data.data_rank $i --data.data_world_size $N > logs/log$i.log 2>&1 & child_pids+=($!) done