tf_train.srun
#!/bin/bash
#SBATCH --job-name=tftrain
#SBATCH --output=slurm_logs/tf_train_%j.out
#SBATCH --error=slurm_logs/tf_train_%j.err
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --nodes=1-1
#SBATCH --gres=gpu:tesla:2
#SBATCH --partition=taylor,kamiak,cahnrs_gpu,free_gpu,vcea,cahnrs,cahnrs_bigmem
#SBATCH --time=3-00:00:00
#SBATCH --mem=20G
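#
# Usage (the architecture name below is only an example; it must match an
# existing tf_<arch>_<amount>.config file in the dataset folder):
#   sbatch tf_train.srun faster_rcnn_resnet101 100
#
# config.py is sourced as a shell file and is expected to define the variables
# used by this script: remotedir, datasetFolder, datasetTFtrain, datasetTFvalid,
# datasetTFlabels, and datasetTFtrainlogs.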
. config.py
#
# ---
#
TFArch=$1
if [[ -z $TFArch ]]; then
    echo "Specify which model to train."
    exit 1
else
    echo "TFArch: $TFArch"
fi
# If specified, use this percentage (e.g. 100) of the training data for the
# learning curve. The training record filename must then have .100 appended
# (or .90, etc.).
Amount=$2
suffix="" # Default no learning curve
if [[ -z $Amount ]]; then
    echo "Amount: none - not generating learning curve"
    echo "Not handled yet. Use learning curve 100."
    # Note: ideally we'd handle both the learning-curve and non-learning-curve
    # cases, but for now use 100 for normal use and ignore the other files
    exit 1
else
    echo "Amount: $Amount (learning curve)"
    suffix=".$Amount" # Appended to training record filename
fi
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
module load cuda/8.0.44 cudnn/6.0_cuda8.0 python3/3.5.0
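# Permanent data location on the shared filesystem (remotedir is set in config.py)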
data="$remotedir"
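# Remove the local scratch workspace; also registered on the EXIT trap below so
# cleanup runs even if the job fails partway through.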
function clean_up
{
    echo "Deleting workspace: started"
    rmworkspace -a -f --name="$SCRATCHDIR"
    echo "Deleting workspace: done"
    trap - EXIT # don't run this handler a second time when exiting
    exit
}
# Create a scratch workspace
SCRATCHDIR="$(mkworkspace -q -t 7-00:00 -b /local)" # 7 days
trap 'clean_up' EXIT
echo "Scratch space: $SCRATCHDIR"
echo "SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "SLURM_JOB_GPUS: $SLURM_JOB_GPUS"
echo "Getting data: started"
cd "$SCRATCHDIR"
echo " - dataset"
# The files must be datasets/YourDataSet/tftrain.record, etc. since the config
# file specifies that directory
mkdir -p "$datasetFolder"
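# Copies: training record (with the learning-curve suffix), validation record,
# label file, pipeline config, and the ${TFArch}_model_${Amount}.ckpt* files
# (e.g. a checkpoint to fine-tune from)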
cp -a "$data/$datasetFolder/$datasetTFtrain$suffix" \
    "$data/$datasetFolder/$datasetTFvalid" \
    "$data/$datasetFolder/$datasetTFlabels" \
    "$data/$datasetFolder/tf_${TFArch}_${Amount}.config" \
    "$data/$datasetFolder/${TFArch}_model_${Amount}".ckpt* \
    "$datasetFolder"
echo " - TF models"
cp -ra "$data/models" .
echo "Getting data: done"
echo "Making sure TensorFlow installed: starting"
pip install --user tensorflow-gpu pillow lxml jupyter matplotlib
echo "Making sure TensorFlow installed: done"
echo "Training network: started"
mkdir -p "$data/${datasetFolder}/${datasetTFtrainlogs}/$TFArch$suffix" # log dir, rsync this to view with TensorBoard
cd models/research
export PYTHONPATH="$PYTHONPATH:$(pwd):$(pwd)/slim"
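# train_dir receives checkpoints and TensorBoard event files; the pipeline
# config path is relative to models/research inside the scratch workspace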
python3 object_detection/train.py \
    --train_dir="$data/${datasetFolder}/${datasetTFtrainlogs}/$TFArch$suffix" \
    --pipeline_config_path="../../$datasetFolder/tf_${TFArch}_${Amount}.config"
echo "Training network: done"
echo "Deleting workspace: started"
clean_up
echo "Deleting workspace: done"