Skip to content

Commit

Permalink
use correct launcher in scr_run on different platforms
Browse files Browse the repository at this point in the history
  • Loading branch information
adammoody committed Oct 23, 2017
1 parent 6bb59ac commit 544fdca
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 82 deletions.
37 changes: 19 additions & 18 deletions scripts/LSF/scr_run.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,33 @@

# requires: mpirun

launcher="mpirun"
prog="scr_${launcher}"

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# Print usage
if [ -z "$1" ]; then
echo USAGE:
echo "scr_srun [srun args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [srun args]"
echo "scr_$launcher [$launcher args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [$launcher args]"
echo "<run_command>: The command to run when no restart file is present"
echo "<restart_command>: The command to run when a restart file is present"
echo ""
echo "The invoked command will be \`srun [srun_args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`srun [srun_args] [restart_command]\` when a restart file is present"
echo "The invoked command will be \`$launcher [$launcher args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`$launcher [$launcher args] [restart_command]\` when a restart file is present"
echo "If the string \"SCR_CKPT_NAME\" appears in the restart command, it will be replaced by the name "
echo "presented to SCR when the most recent checkpoint was written."
echo ""
echo "If no restart command is specified, the run command will always be used"
echo "If no commands are specified, the srun arguments will be passed directly to srun in all circumstances"
echo "If no commands are specified, the $launcher arguments will be passed directly to $launcher in all circumstances"
echo "If no run command is specified, but a restart command is specified,"
echo "then the restart command will be appended to the srun arguments when a restart file is present."
echo "then the restart command will be appended to the $launcher arguments when a restart file is present."
exit 0
fi

# capture restart and run commands if specified
srun_args=""
launcher_args=""
while [ ${1:+x} ]; do
case $1 in
--restart-cmd|-rs)
Expand All @@ -40,14 +46,14 @@ while [ ${1:+x} ]; do
if [ -z "$run_cmd" ]; then shift; run_cmd=$1; fi
shift ;;
*)
srun_args="$srun_args $1"
launcher_args="$launcher_args $1"
shift ;;
esac
done

# if SCR is disabled, just do a normal run and exit
if [ "$SCR_ENABLE" == "0" ] ; then
mpirun $srun_args $run_cmd
$launcher $launcher_args $run_cmd
exit $?
fi

Expand All @@ -58,11 +64,6 @@ if [ -n "$SCR_DEBUG" ]; then
fi
fi

prog=scr_mpirun

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# make a record of start time
timestamp=`date`
echo "$prog: Started: $timestamp"
Expand Down Expand Up @@ -216,12 +217,12 @@ while [ 1 ] ; do
timestamp=`date`
echo "$prog: RUN $attempts: $timestamp"

launch_cmd="$srun_args $run_cmd"
launch_cmd="$launcher_args $run_cmd"
if [ ${restart_cmd:+x} ]; then
restart_name=`srun $srun_args $bindir/scr_have_restart`
restart_name=`$launcher $launcher_args $bindir/scr_have_restart`
if [ ${restart_name:+x} ]; then
my_restart_cmd=`echo $restart_cmd | sed "s#SCR_CKPT_NAME#${restart_name}#g"`
launch_cmd="$srun_args $my_restart_cmd"
launch_cmd="$launcher_args $my_restart_cmd"
fi
fi

Expand All @@ -231,8 +232,8 @@ while [ 1 ] ; do

if [ $use_scr_watchdog -eq 0 ]; then
echo "$target_hosts" > $hostfile
mpirun --hostfile $hostfile $launch_cmd
#mpirun --host $target_hosts $launch_cmd
$launcher --hostfile $hostfile $launch_cmd
#$launcher --host $target_hosts $launch_cmd
# else
# echo "$prog: Attempting to start watchdog process."
# # need to get job step id of the srun command
Expand Down
33 changes: 14 additions & 19 deletions scripts/PMIX/scr_run.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,21 @@
# note: this does not support the watchdog process yet since killing a jobstep isn't
# supported in pmix yet

prog=scr_pmix_run_bash

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"
__have_libcppr="@HAVE_LIBCPPR@"

# Print usage
if [ -z "$1" ]; then
echo USAGE:
echo "scr_pmix_run [spawn args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [spawn args]"
echo "<run_command>: The command to run when no restart file is present"
echo "<restart_command>: The command to run when a restart file is present"
echo ""
echo "The invoked command will be \`scr_pmix_spawn [spawn_args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`scr_pmix_spawn [spawn_args] [restart_command]\` when a restart file is present"
echo "The invoked command will be \`scr_pmix_spawn [spawn args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`scr_pmix_spawn [spawn args] [restart_command]\` when a restart file is present"
echo "If the string \"SCR_CKPT_NAME\" appears in the restart command, it will be replaced by the name "
echo "presented to SCR when the most recent checkpoint was written."
echo ""
Expand All @@ -24,7 +30,7 @@ if [ -z "$1" ]; then
fi

# capture restart and run commands if specified
spawn_args=""
launcher_args=""
while [ ${1:+x} ]; do
case $1 in
--restart-cmd|-rs)
Expand All @@ -42,18 +48,14 @@ while [ ${1:+x} ]; do
if [ -z "$run_cmd" ]; then shift; run_cmd=$1; fi
shift ;;
*)
spawn_args="$spawn_args $1"
launcher_args="$launcher_args $1"
shift ;;
esac
done

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"
__have_libcppr="@HAVE_LIBCPPR@"

# if SCR is disabled, just do a normal run and exit
if [ "$SCR_ENABLE" == "0" ] ; then
$bindir/scr_pmix_spawn $spawn_args $run_cmd
$bindir/scr_pmix_spawn $launcher_args $run_cmd
exit $?
fi

Expand All @@ -64,8 +66,6 @@ if [ -n "$SCR_DEBUG" ]; then
fi
fi

prog=scr_pmix_run_bash

# make a record of start time
timestamp=`date`
echo "$prog: Started: $timestamp"
Expand Down Expand Up @@ -111,11 +111,6 @@ if [ $? -ne 0 ] ; then
exit 1
fi

# NOP srun to force every node to run prolog to delete files from cache
# TODO: remove this if admins find a better place to clear cache
#srun /bin/hostname > /dev/null
#don't nop in pmix machine type?

# make a record of time prerun is started
timestamp=`date`
echo "$prog: prerun: $timestamp"
Expand Down Expand Up @@ -214,12 +209,12 @@ while [ 1 ] ; do
timestamp=`date`
echo "$prog: RUN $attempts: $timestamp"

launch_cmd="$spawn_args $run_cmd"
launch_cmd="$launcher_args $run_cmd"
if [ ${restart_cmd:+x} ]; then
restart_name=`srun $spawn_args $bindir/scr_have_restart`
restart_name=`$bindir/scr_pmix_spawn $launcher_args $bindir/scr_have_restart`
if [ ${restart_name:+x} ]; then
my_restart_cmd=`echo $restart_cmd | sed "s#SCR_CKPT_NAME#${restart_name}#g"`
launch_cmd="$spawn_args $my_restart_cmd"
launch_cmd="$launcher_args $my_restart_cmd"
fi
fi

Expand Down
37 changes: 19 additions & 18 deletions scripts/TLCC/scr_run.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,33 @@

# requires: srun

launcher="srun"
prog="scr_${launcher}"

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# Print usage
if [ -z "$1" ]; then
echo USAGE:
echo "scr_srun [srun args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [srun args]"
echo "scr_$launcher [$launcher args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [$launcher args]"
echo "<run_command>: The command to run when no restart file is present"
echo "<restart_command>: The command to run when a restart file is present"
echo ""
echo "The invoked command will be \`srun [srun_args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`srun [srun_args] [restart_command]\` when a restart file is present"
echo "The invoked command will be \`$launcher [${launcher} args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`$launcher [${launcher} args] [restart_command]\` when a restart file is present"
echo "If the string \"SCR_CKPT_NAME\" appears in the restart command, it will be replaced by the name "
echo "presented to SCR when the most recent checkpoint was written."
echo ""
echo "If no restart command is specified, the run command will always be used"
echo "If no commands are specified, the srun arguments will be passed directly to srun in all circumstances"
echo "If no commands are specified, the $launcher arguments will be passed directly to $launcher in all circumstances"
echo "If no run command is specified, but a restart command is specified,"
echo "then the restart command will be appended to the srun arguments when a restart file is present."
echo "then the restart command will be appended to the $launcher arguments when a restart file is present."
exit 0
fi

# capture restart and run commands if specified
srun_args=""
launcher_args=""
while [ ${1:+x} ]; do
case $1 in
--restart-cmd|-rs)
Expand All @@ -40,14 +46,14 @@ while [ ${1:+x} ]; do
if [ -z "$run_cmd" ]; then shift; run_cmd=$1; fi
shift ;;
*)
srun_args="$srun_args $1"
launcher_args="$launcher_args $1"
shift ;;
esac
done

# if SCR is disabled, just do a normal run and exit
if [ "$SCR_ENABLE" == "0" ] ; then
srun $srun_args $run_cmd
$launcher $launcher_args $run_cmd
exit $?
fi

Expand All @@ -58,11 +64,6 @@ if [ -n "$SCR_DEBUG" ]; then
fi
fi

prog=scr_srun

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# make a record of start time
timestamp=`date`
echo "$prog: Started: $timestamp"
Expand Down Expand Up @@ -203,12 +204,12 @@ while [ 1 ] ; do
timestamp=`date`
echo "$prog: RUN $attempts: $timestamp"

launch_cmd="$srun_args $run_cmd"
launch_cmd="$launcher_args $run_cmd"
if [ ${restart_cmd:+x} ]; then
restart_name=`srun $srun_args $bindir/scr_have_restart`
restart_name=`$launcher $launcher_args $bindir/scr_have_restart`
if [ ${restart_name:+x} ]; then
my_restart_cmd=`echo $restart_cmd | sed "s#SCR_CKPT_NAME#${restart_name}#g"`
launch_cmd="$srun_args $my_restart_cmd"
launch_cmd="$launcher_args $my_restart_cmd"
fi
fi

Expand All @@ -217,11 +218,11 @@ while [ 1 ] ; do
$bindir/scr_log_event -T "RUN STARTED" -N "Job=$jobid, Run=$attempts" -S $start_secs

if [ $use_scr_watchdog -eq 0 ]; then
srun $exclude $launch_cmd
$launcher $exclude $launch_cmd
else
echo "$prog: Attempting to start watchdog process."
# need to get job step id of the srun command
srun $exclude $launch_cmd &
$launcher $exclude $launch_cmd &
srun_pid=$!;
sleep 10; # sleep a bit to wait for the job to show up in squeue
echo "$bindir/scr_get_jobstep_id $srun_pid";
Expand Down
46 changes: 19 additions & 27 deletions scripts/cray_xt/scr_run.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,33 @@

# requires: aprun

launcher="aprun"
prog="scr_$launcher"

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# Print usage
if [ -z "$1" ]; then
echo USAGE:
echo "scr_aprun [aprun args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [aprun args]"
echo "scr_$launcher [$launcher args] [-rc|--run-cmd=<run_command>] [-rs|--restart-cmd=<restart_command>] [$launcher args]"
echo "<run_command>: The command to run when no restart file is present"
echo "<restart_command>: The command to run when a restart file is present"
echo ""
echo "The invoked command will be \`aprun [aprun_args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`aprun [aprun_args] [restart_command]\` when a restart file is present"
echo "The invoked command will be \`$launcher [$launcher args] [run_command]\` when no restart file is present"
echo "The invoked command will be \`$launcher [$launcher args] [restart_command]\` when a restart file is present"
echo "If the string \"SCR_CKPT_NAME\" appears in the restart command, it will be replaced by the name "
echo "presented to SCR when the most recent checkpoint was written."
echo ""
echo "If no restart command is specified, the run command will always be used"
echo "If no commands are specified, the aprun arguments will be passed directly to aprun in all circumstances"
echo "If no commands are specified, the $launcher arguments will be passed directly to $launcher in all circumstances"
echo "If no run command is specified, but a restart command is specified,"
echo "then the restart command will be appended to the aprun arguments when a restart file is present."
echo "then the restart command will be appended to the $launcher arguments when a restart file is present."
exit 0
fi

# capture restart and run commands if specified
aprun_args=""
launcher_args=""
while [ ${1:+x} ]; do
case $1 in
--restart-cmd|-rs)
Expand All @@ -40,14 +46,14 @@ while [ ${1:+x} ]; do
if [ -z "$run_cmd" ]; then shift; run_cmd=$1; fi
shift ;;
*)
aprun_args="$aprun_args $1"
launcher_args="$launcher_args $1"
shift ;;
esac
done

# if SCR is disabled, just do a normal run and exit
if [ "$SCR_ENABLE" == "0" ] ; then
aprun $aprun_args $run_cmd
$launcher $launcher_args $run_cmd
exit $?
fi

Expand All @@ -58,11 +64,6 @@ if [ -n "$SCR_DEBUG" ]; then
fi
fi

prog=scr_run

libdir="@X_LIBDIR@"
bindir="@X_BINDIR@"

# make a record of start time
timestamp=`date`
echo "$prog: Started: $timestamp"
Expand Down Expand Up @@ -101,15 +102,6 @@ if [ "$SCR_WATCHDOG" == "1" ] ; then
use_scr_watchdog=1
fi

# normally we would check that this script is running on a node in the job's
# allocated nodeset, but on crays, this script runs on MOM node
script_node=`hostname`
#intersection=`$bindir/scr_glob_hosts --intersection $script_node:$SCR_NODELIST`
#if [ -z "$intersection" ] ; then
#echo "$prog: ERROR: scr_run is executing on $script_node, which is not part of the job's nodeset $SCR_NODELIST."
#exit 1
#fi

# get the control directory
cntldir=`$bindir/scr_list_dir control`
if [ $? -ne 0 ] ; then
Expand Down Expand Up @@ -249,21 +241,21 @@ while [ 1 ] ; do
tmpupnodes=${upnodes%]}
tmpupnodes=${tmpupnodes:1}

launch_cmd="$aprun_args $run_cmd"
launch_cmd="$launcher_args $run_cmd"
if [ ${restart_cmd:+x} ]; then
restart_name=`srun $srun_args $bindir/scr_have_restart`
restart_name=`$launcher $launcher_args $bindir/scr_have_restart`
if [ ${restart_name:+x} ]; then
my_restart_cmd=`echo $restart_cmd | sed "s#SCR_CKPT_NAME#${restart_name}#g"`
launch_cmd="$aprun_args $my_restart_cmd"
launch_cmd="$launcher_args $my_restart_cmd"
fi
fi

if [ $use_scr_watchdog -eq 0 ]; then
aprun -L $tmpupnodes $launch_cmd
$launcher -L $tmpupnodes $launch_cmd
else
echo "$prog: Attempting to start watchdog process."
# need to get apid of the aprun command
aprun -L $tmpupnodes $launch_cmd &
$launcher -L $tmpupnodes $launch_cmd &
aprun_pid=$!;
sleep 10; # sleep a bit to wait for the job to show up in apstat
echo "$bindir/scr_get_jobstep_id $aprun_pid";
Expand Down

0 comments on commit 544fdca

Please sign in to comment.