tf_train.srun
#!/bin/bash
#SBATCH --job-name=tftrain
#SBATCH --output=slurm_logs/tf_train_%j.out
#SBATCH --error=slurm_logs/tf_train_%j.err
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --nodes=1-1
#SBATCH --gres=gpu:tesla:2
#SBATCH --partition=taylor,kamiak,cahnrs_gpu,free_gpu,vcea,cahnrs,cahnrs_bigmem
#SBATCH --time=3-00:00:00
#SBATCH --mem=20G
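#
# Usage (the architecture name below is only an example; it must match an
# existing tf_<arch>_<amount>.config file in the dataset folder):
#   sbatch tf_train.srun faster_rcnn_resnet101 100
#
# config.py is sourced as a shell file and is expected to define the variables
# used by this script: remotedir, datasetFolder, datasetTFtrain, datasetTFvalid,
# datasetTFlabels, and datasetTFtrainlogs.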
. config.py
#
# ---
#
TFArch=$1
if [[ -z $TFArch ]]; then
    echo "Specify which model to train."
    exit 1
else
    echo "TFArch: $TFArch"
fi
# If specified, use this percentage (e.g. 100) of the training data for the
# learning curve. The training record filename must then have .100 appended
# (or .90, etc.).
Amount=$2
suffix="" # Default no learning curve
if [[ -z $Amount ]]; then
    echo "Amount: none - not generating learning curve"
    echo "Not handled yet. Use learning curve 100."
    # Note: ideally we'd handle both the learning-curve and non-learning-curve
    # cases, but for now use 100 for normal use and ignore the other files
    exit 1
else
    echo "Amount: $Amount (learning curve)"
    suffix=".$Amount" # Appended to training record filename
fi
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
module load cuda/8.0.44 cudnn/6.0_cuda8.0 python3/3.5.0
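# Permanent data location on the shared filesystem (remotedir is set in config.py)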
data="$remotedir"
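# Remove the local scratch workspace; also registered on the EXIT trap below so
# cleanup runs even if the job fails partway through.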
function clean_up
{
    echo "Deleting workspace: started"
    rmworkspace -a -f --name="$SCRATCHDIR"
    echo "Deleting workspace: done"
    trap - EXIT # don't run this handler a second time when exiting
    exit
}
# Create a scratch workspace
SCRATCHDIR="$(mkworkspace -q -t 7-00:00 -b /local)" # 7 days
trap 'clean_up' EXIT
echo "Scratch space: $SCRATCHDIR"
echo "SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "SLURM_JOB_GPUS: $SLURM_JOB_GPUS"
echo "Getting data: started"
cd "$SCRATCHDIR"
echo " - dataset"
# The files must be datasets/YourDataSet/tftrain.record, etc. since the config
# file specifies that directory
mkdir -p "$datasetFolder"
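# Copies: training record (with the learning-curve suffix), validation record,
# label file, pipeline config, and the ${TFArch}_model_${Amount}.ckpt* files
# (e.g. a checkpoint to fine-tune from)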
cp -a "$data/$datasetFolder/$datasetTFtrain$suffix" \
    "$data/$datasetFolder/$datasetTFvalid" \
    "$data/$datasetFolder/$datasetTFlabels" \
    "$data/$datasetFolder/tf_${TFArch}_${Amount}.config" \
    "$data/$datasetFolder/${TFArch}_model_${Amount}".ckpt* \
    "$datasetFolder"
echo " - TF models"
cp -ra "$data/models" .
echo "Getting data: done"
echo "Making sure TensorFlow installed: starting"
pip install --user tensorflow-gpu pillow lxml jupyter matplotlib
echo "Making sure TensorFlow installed: done"
echo "Training network: started"
mkdir -p "$data/${datasetFolder}/${datasetTFtrainlogs}/$TFArch$suffix" # log dir, rsync this to view with TensorBoard
cd models/research
export PYTHONPATH="$PYTHONPATH:$(pwd):$(pwd)/slim"
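# train_dir receives checkpoints and TensorBoard event files; the pipeline
# config path is relative to models/research inside the scratch workspace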
python3 object_detection/train.py \
    --train_dir="$data/${datasetFolder}/${datasetTFtrainlogs}/$TFArch$suffix" \
    --pipeline_config_path="../../$datasetFolder/tf_${TFArch}_${Amount}.config"
echo "Training network: done"
echo "Deleting workspace: started"
clean_up
echo "Deleting workspace: done"