forked from pytorch/torchtitan
-
Notifications
You must be signed in to change notification settings - Fork 0
/
submitit_train.py
39 lines (36 loc) · 1.33 KB
/
submitit_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import submitit
import datetime
import yaml
import os
if __name__ == "__main__":
    # Entry point: queue a single-node torch.distributed.run launch of
    # train.py on Slurm through submitit.

    # Resource sizing; the CPU count is derived from the GPU count.
    n_gpus = 8
    cpus_per_gpu = 4
    three_days_in_minutes = 3 * 24 * 60
    num_submissions = 1

    # Config handed to train.py. Alternatives kept in the repo:
    #   ./train_configs/chemlactica_125m.toml
    #   ./train_configs/chemlactica_1.3b.toml
    #   ./train_configs/debug_model.toml
    train_config = './train_configs/llama3.2_1b.toml'

    # "%j" in the folder is a submitit placeholder (replaced by the job id
    # per the submitit docs), so each job gets its own log directory.
    executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
    resource_request = {
        "name": "titan",
        "timeout_min": three_days_in_minutes,
        "gpus_per_node": n_gpus,
        "nodes": 1,
        "mem_gb": 80,
        "cpus_per_task": n_gpus * cpus_per_gpu,
        "slurm_additional_parameters": {"partition": "h100"},
    }
    executor.update_parameters(**resource_request)

    jobs = []
    # Submissions made inside batch() are collected and handed to the
    # scheduler together when the context exits.
    with executor.batch():
        for _ in range(num_submissions):
            launcher = submitit.helpers.CommandFunction(
                [
                    'python3', '-m', 'torch.distributed.run',
                    '--nproc_per_node', str(n_gpus),
                    '--rdzv_backend', 'c10d',
                    '--rdzv_endpoint', 'localhost:0',
                    '--local-ranks-filter', '0',
                    '--role', 'rank',
                    '--tee', '3',
                    'train.py',
                    '--job.config_file', train_config,
                ]
            )
            # Echo the exact command line so it can be copied and run by
            # hand (e.g. via subprocess.run) when debugging without Slurm.
            print(' '.join(launcher.command))
            jobs.append(executor.submit(launcher))