-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.slurm
24 lines (20 loc) · 1.03 KB
/
run.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/bash
#SBATCH --job-name=ddp_imagenet # Job name
#SBATCH --nodes=1 # Number of nodes (can not be more than 1 on triton)
#SBATCH --ntasks=2 # Number of tasks (usually equals number of GPUs)
#SBATCH --gres=gpu:2
#SBATCH --partition=gpu-debug
#SBATCH --cpus-per-task=8 # Number of CPU cores per task
#SBATCH --mem=100G # Memory per node
#SBATCH --time=00:30:00 # Maximum time for the job
#SBATCH --output=ddp_training_%j.out # Standard output log (%j inserts the job ID)
#SBATCH --error=ddp_training_%j.err # Standard error log (%j inserts the job ID)
#
# Slurm batch job: loads the Python environment, runs ./extract_images.sh,
# then launches ddp_imagenet.py with srun (one process per Slurm task).
# Each preparation step is checked so that a failure stops the job before
# GPU time is spent on training.

# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf 'run.slurm: %s\n' "$*" >&2
  exit 1
}

# Load necessary modules
module load scicomp-python-env || die "failed to load module scicomp-python-env"
module load parallel || die "failed to load module parallel"

# Extract the ImageNet dataset to the local disk.
# `time` applies only to the extraction pipeline; the `||` branch runs when
# the extraction script exits non-zero.
time ./extract_images.sh || die "extract_images.sh failed"

# Set the data directory.
# NOTE(review): presumably this is where extract_images.sh writes its output;
# confirm against that script.
DATA_DIR="/tmp/ILSVRC2012_extracted"
[[ -d "$DATA_DIR" ]] || die "data directory not found: $DATA_DIR"

# Launch the training using srun
srun python ddp_imagenet.py --data_dir "$DATA_DIR" --batch_size 512 --epochs 2