[fine_tuning]: launch training with torchrun (fsdp)
albertoperdomo2 committed Dec 31, 2024
1 parent d92005e commit 0c40412
Showing 2 changed files with 19 additions and 14 deletions.
projects/fine_tuning/testing/config.yaml (3 additions & 3 deletions)

@@ -95,7 +95,7 @@ ci_presets:
 tests.fine_tuning.test_settings.dataset_name: alpaca_data.json
 tests.fine_tuning.test_settings.dataset_replication: 0.5

-tests.fine_tuning.test_settings.gpu: [2, 4]
+tests.fine_tuning.test_settings.gpu: 8

 tests.fine_tuning.test_settings.hyper_parameters.gradient_accumulation_steps: 1
 tests.fine_tuning.test_settings.hyper_parameters.per_device_train_batch_size: 4
@@ -107,9 +107,9 @@
 - model_name:
   # - meta-llama/Llama-2-13b-hf
   # - meta-llama/Meta-Llama-3.1-70B
-  # - ibm-granite/granite-3b-code-instruct
+  - ibm-granite/granite-3b-code-instruct
   # - instructlab/granite-7b-lab
-  - ibm-granite/granite-8b-code-base
+  # - ibm-granite/granite-8b-code-base
   # - meta-llama/Meta-Llama-3.1-8B
   # - mistralai/Mistral-7B-v0.3
   # - mistralai/Mixtral-8x7B-v0.1
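
For context: with the values above, and assuming the preset runs single-node with one training process per GPU, the effective global batch size works out to num_gpus × per_device_train_batch_size × gradient_accumulation_steps = 8 × 4 × 1 = 32 samples per optimizer step.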
(second changed file: 16 additions & 11 deletions)

@@ -16,19 +16,24 @@ if [[ $WORLD_SIZE == 1 ]]; then
 else
     echo "Running with $NUM_GPUS GPUs"
 fi
-time python /app/accelerate_launch.py
+time python -m torch.distributed.run \
+    --node_rank "$RANK" \
+    --nnodes "$WORLD_SIZE" \
+    --nproc_per_node "$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)" \
+    --master_addr "$MASTER_ADDR" \
+    --master_port "$MASTER_PORT" \
+    launch_training.py
 exit 0
 fi
 echo "Running on $WORLD_SIZE machines with $NUM_GPUS GPUs each."

-time accelerate launch \
-    --debug \
-    --machine_rank $RANK \
-    --num_machines $WORLD_SIZE \
-    --num_processes $WORLD_SIZE \
-    --main_process_ip $MASTER_ADDR \
-    --main_process_port $MASTER_PORT \
-    --mixed_precision no \
-    --dynamo_backend no \
-    --multi_gpu \
+time python -m torch.distributed.run \
+    --node_rank "$RANK" \
+    --nnodes "$WORLD_SIZE" \
+    --nproc_per_node "$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)" \
+    --master_addr "$MASTER_ADDR" \
+    --master_port "$MASTER_PORT" \
     launch_training.py
+
+# --mixed_precision no --> disabled by default with FSDP
+# --dynamo_backend no  --> torch.compile is disabled by default with FSDP
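
For reference, a minimal sketch of how the new launch command renders on one worker, assuming the PyTorchJob-style environment variables (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) are injected by the job launcher; the concrete values below are illustrative only:

    # Sketch only: env values are hypothetical; a real job gets them from the launcher.
    export RANK=0                      # this node's index among the workers
    export WORLD_SIZE=2                # number of nodes in the job
    export MASTER_ADDR=worker-0.local  # hypothetical rendezvous host
    export MASTER_PORT=29500           # common default torch.distributed port

    # One process per visible GPU, counted the same way as in the script above.
    NUM_GPUS="$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l)"

    # torchrun is the console-script alias for `python -m torch.distributed.run`,
    # so this is equivalent to the command added in this commit.
    torchrun \
        --node_rank "$RANK" \
        --nnodes "$WORLD_SIZE" \
        --nproc_per_node "$NUM_GPUS" \
        --master_addr "$MASTER_ADDR" \
        --master_port "$MASTER_PORT" \
        launch_training.py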
