Skip to content

Commit

Permalink
Address Comments
Browse files Browse the repository at this point in the history
  • Loading branch information
henryleberre committed Jan 14, 2024
1 parent 6c2d61a commit aa2eac8
Show file tree
Hide file tree
Showing 10 changed files with 93 additions and 57 deletions.
4 changes: 2 additions & 2 deletions docs/documentation/running.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ several supercomputer clusters, both interactively and through batch submission.
>
> Adding a new template file or modifying an existing one will most likely be required if:
> - You are on a cluster that does not have a template yet.
> - Your cluster is configured with SLURM but fails when interactive jobs are
> launched with `mpirun`.
> - Your cluster is configured with SLURM but interactive job launches fail when
> using `srun`. You might need to invoke `mpirun` instead.
> - Something in the existing default or computer template file is incompatible with
> your system or does not provide a feature you need.
>
Expand Down
2 changes: 1 addition & 1 deletion misc/run-phoenix-release-cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ cd "$SLURM_SUBMIT_DIR"
echo "Running in $(pwd):"

. ./mfc.sh load -c p -m gpu
./mfc.sh test -j $(nproc) -a -- -b mpirun
./mfc.sh test -j $(nproc) -a -- -c phoenix
2 changes: 1 addition & 1 deletion misc/run-phoenix-release-gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ set -x
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1

./mfc.sh test -a -j 2 --gpu -g $gpu_ids -- -b mpirun
./mfc.sh test -a -j 2 --gpu -g $gpu_ids -- -c phoenix
7 changes: 5 additions & 2 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ def add_common_arguments(p, mask = None):
run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional positional arguments to pass to the case file.")
run.add_argument("-e", "--engine", choices=["interactive", "batch"], type=str, default="interactive", help="Job execution/submission engine choice.")
run.add_argument("--output-summary", type=str, default=None, help="(Interactive) Output a YAML summary file.")
run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.")
run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
Expand All @@ -111,13 +110,14 @@ def add_common_arguments(p, mask = None):
run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
run.add_argument("-b", "--binary", choices=["mpirun", "jsrun", "srun", "mpiexec"], type=str, default=None, help="(Interactive) Override MPI execution binary")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="Arguments to forward to the MPI invocation.")
run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default="default", help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")

# === BENCH ===
Expand Down Expand Up @@ -154,6 +154,9 @@ def add_common_arguments(p, mask = None):
if args["command"] == "build":
if (args["input"] is not None) ^ args["case_optimization"] :
raise MFCException("./mfc.sh build's --case-optimization and --input must be used together.")
if args["command"] == "run":
if args["binary"] is not None and args["engine"] != "interactive":
raise MFCException("./mfc.sh run's --binary can only be used with --engine=interactive.")

# Input files to absolute paths
for e in ["input", "input1", "input2"]:
Expand Down
1 change: 1 addition & 0 deletions toolchain/mfc/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def __generate_job_script(targets):
ARG=ARG,
env=env,
rootdir=MFC_ROOTDIR,
qsystem=queues.get_system(),
binpaths=[target.get_install_binpath() for target in targets],
profiler=__profiler_prepend(),
)
Expand Down
15 changes: 9 additions & 6 deletions toolchain/templates/bridges2.mako
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,30 @@

<%include file="prologue.mako"/>

echo -e ":) Loading modules:\n"
ok ":) Loading modules:\n"
cd "${rootdir}"
. ./mfc.sh load -c b -m ${'g' if gpu else 'c'}
cd - > /dev/null
echo

% for binpath in binpaths:
echo -e ":) Running ${binpath.split('/')[-1]}:\n"
ok -e ":) Running ${binpath.split('/')[-1]}:\n"

% if not mpi:
${' '.join([f"'{x}'" for x in profiler ])} "${binpath}"
% else:
mpirun -np ${nodes*tasks_per_node} \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
${' '.join([f"'{x}'" for x in profiler ])} \
mpirun -np ${nodes*tasks_per_node} \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
% endif

% if engine == 'interactive':
code=$?
if [ $code -ne 0 ]; then
echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n"
echo
error ":( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET."
echo
exit 1
fi
% endif
Expand Down
59 changes: 38 additions & 21 deletions toolchain/templates/default.mako
Original file line number Diff line number Diff line change
Expand Up @@ -12,38 +12,55 @@
warn "This is the$MAGENTA default$COLOR_RESET template."
warn "It is not intended to support all systems and execution engines."
warn "Please use a different template via the $MAGENTA--computer$COLOR_RESET option."
echo

% if mpi:
# Find a suitable MPI launcher and store it in the variable "binary".
for binary in ${binary or ''} jsrun srun mpirun mpiexec; do
if command -v $binary > /dev/null; then
break
fi
done

if ! command -v $binary > /dev/null; then
error ":( Could not find a suitable MPI launcher.\n"
exit 1
else
ok ":) Selected MPI launcher $MAGENTA$binary$COLOR_RESET. Use$MAGENTA --binary$COLOR_RESET to override."
fi
% endif

% for binpath in binpaths:
echo -e ":) Running $MAGENTA${binpath}$COLOR_RESET:\n"
ok ":) Running $MAGENTA${binpath}$COLOR_RESET:\n"

% if not mpi:
${' '.join([f"'{x}'" for x in profiler ])} "${binpath}"
% else:
if command -v jsrun > /dev/null; then
jsrun --nrs ${tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs ${1 if gpu else 0} \
--tasks_per_rs 1 \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
elif command -v srun > /dev/null; then
srun --ntasks-per-node ${tasks_per_node} \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
elif command -v mpirun > /dev/null; then
mpirun -np ${nodes*tasks_per_node} \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
else
echo -e "\n:( Could not find a suitable MPI launcher.\n"
exit 1
if [ "$binary" == "jsrun" ]; then
${' '.join([f"'{x}'" for x in profiler ])} \
jsrun --nrs ${tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs ${1 if gpu else 0} \
--tasks_per_rs 1 \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
elif [ "$binary" == "srun" ]; then
${' '.join([f"'{x}'" for x in profiler ])} \
srun --ntasks-per-node ${tasks_per_node} \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
elif [ "$binary" == "mpirun" ] || [ "$binary" == "mpiexec" ]; then
${' '.join([f"'{x}'" for x in profiler ])} \
$binary -np ${nodes*tasks_per_node} \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
fi
% endif

code=$?
if [ $code -ne 0 ]; then
echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n"
echo
error ":( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET."
echo
exit 1
fi

Expand Down
11 changes: 6 additions & 5 deletions toolchain/templates/include/prologue.mako
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ TABLE_HEADER="+-----------------------------------------------------------------
TABLE_FOOTER="+-----------------------------------------------------------------------------------------------------------+ \\n"
TABLE_TITLE_FORMAT="| %-105s |\\n"
TABLE_CONTENT=$(cat <<-END
$(printf "$TABLE_FORMAT_LINE" "Start-time" "$(date +%T)" "Start-date" "$(date +%T)")
$(printf "$TABLE_FORMAT_LINE" "Partition" "${partition}" "Walltime" "${walltime}")
$(printf "$TABLE_FORMAT_LINE" "Account" "${account}" "Nodes" "${nodes}")
$(printf "$TABLE_FORMAT_LINE" "Job Name" "${name}" "Engine" "${engine}")
$(printf "$TABLE_FORMAT_LINE" "Queue System" "{qsystem.name}" "Email" "${email}")
$(printf "$TABLE_FORMAT_LINE" "Start-time" "$(date +%T)" "Start-date" "$(date +%T)")
$(printf "$TABLE_FORMAT_LINE" "Partition" "${partition or 'N/A'}" "Walltime" "${walltime}")
$(printf "$TABLE_FORMAT_LINE" "Account" "${account or 'N/A'}" "Nodes" "${nodes}")
$(printf "$TABLE_FORMAT_LINE" "Job Name" "${name}" "Engine" "${engine}")
$(printf "$TABLE_FORMAT_LINE" "QoS" "${quality_of_service or 'N/A'}" "Binary" "${binary or 'N/A'}")
$(printf "$TABLE_FORMAT_LINE" "Queue System" "${qsystem.name}" "Email" "${email or 'N/A'}")
END
)

Expand Down
20 changes: 13 additions & 7 deletions toolchain/templates/phoenix.mako
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,31 @@

<%include file="prologue.mako"/>

echo -e ":) Loading modules:\n"
cd "${rootdir}" && . ./mfc.sh load -c p -m ${'g' if gpu else 'c'} && cd -
ok ":) Loading modules:\n"
cd "${rootdir}"
. ./mfc.sh load -c p -m ${'g' if gpu else 'c'}
cd - > /dev/null
echo

% for binpath in binpaths:
echo -e ":) Running ${binpath.split('/')[-1]}:\n"
ok ":) Running ${binpath.split('/')[-1]}:\n"

% if not mpi:
${' '.join([f"'{x}'" for x in profiler ])} "${binpath}"
% else:
mpirun -np ${nodes*tasks_per_node} \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
${' '.join([f"'{x}'" for x in profiler ])} \
mpirun -np ${nodes*tasks_per_node} \
--bind-to none \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
% endif

% if engine == 'interactive':
code=$?
if [ $code -ne 0 ]; then
echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n"
echo
error ":( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET."
echo
exit 1
fi
% endif
Expand Down
29 changes: 17 additions & 12 deletions toolchain/templates/summit.mako
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,35 @@

<%include file="prologue.mako"/>

echo -e ":) Loading modules:\n"
cd "${rootdir}" && . ./mfc.sh load -c s -m ${'g' if gpu else 'c'} && cd -
ok ":) Loading modules:\n"
cd "${rootdir}"
. ./mfc.sh load -c s -m ${'g' if gpu else 'c'}
cd - > /dev/null
echo

% for binpath in binpaths:
echo -e ":) Running ${binpath}:\n"
ok ":) Running ${binpath}:\n"

% if not mpi:
${' '.join([f"'{x}'" for x in profiler ])} "${binpath}"
% else:
jsrun \
${'--smpiargs="-gpu"' if gpu else ''} \
--nrs ${tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs ${1 if gpu else 0} \
--tasks_per_rs 1 \
${' '.join([f"'{x}'" for x in profiler ])} \
"${binpath}"
${' '.join([f"'{x}'" for x in profiler ])} \
jsrun \
${'--smpiargs="-gpu"' if gpu else ''} \
--nrs ${tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs ${1 if gpu else 0} \
--tasks_per_rs 1 \
${' '.join([f"'{x}'" for x in ARG('--') ])} \
"${binpath}"
% endif

% if engine == 'interactive':
code=$?
if [ $code -ne 0 ]; then
echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n"
echo
error ":( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET."
echo
exit 1
fi
% endif
Expand Down

0 comments on commit aa2eac8

Please sign in to comment.