Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add slurm 22.05.6 #4

Merged
merged 1 commit into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/cscs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ jobs:
- 21.08.8-2
- 22.05.2
- 22.05.5
- 22.05.6

env:
tag: ghcr.io/eth-cscs/slurm-container-${{ matrix.version }}:latest
Expand Down
70 changes: 70 additions & 0 deletions slurm-22.05.6/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Base image: openSUSE Leap 15.3, pinned to linux/amd64 regardless of build host.
FROM --platform=linux/amd64 opensuse/leap:15.3

# Build-time settings (override with --build-arg).
#   SLURM_VERSION  Slurm release passed to install_slurm.sh
#   SLURM_ROOT     install prefix
#   SLURM_CONFDIR  directory that receives cgroup.conf and slurm.conf.in
ARG SLURM_VERSION=22.05.6
ARG SLURM_ROOT=/opt/slurm-${SLURM_VERSION}
ARG SLURM_CONFDIR=${SLURM_ROOT}/etc

# Persist the same values into the runtime environment: entrypoint.sh reads
# SLURM_ROOT and SLURM_CONFDIR when the container starts.
ENV SLURM_VERSION=${SLURM_VERSION} \
    SLURM_ROOT=${SLURM_ROOT} \
    SLURM_CONFDIR=${SLURM_CONFDIR}


# Toolchain, libraries and utilities used to build Slurm and by entrypoint.sh
# (munge, dbus-1, sudo, mpich). Installed in a single layer, one package per
# line in alphabetical order, and the zypper caches are removed in the same
# layer so they do not end up in the image.
RUN zypper install -y \
        bzip2 \
        ca-certificates \
        curl \
        dbus-1 \
        dbus-1-devel \
        freeipmi \
        freeipmi-devel \
        gcc \
        gcc-c++ \
        hdf5 \
        hdf5-devel \
        hwloc \
        hwloc-devel \
        less \
        liblz4-devel \
        libmount-devel \
        libnuma-devel \
        libnuma1 \
        librrd8 \
        libz1 \
        lua53 \
        lua53-devel \
        lz4 \
        make \
        mpich \
        mpich-devel \
        munge \
        munge-devel \
        python3 \
        readline-devel \
        sudo \
        tar \
        vim \
        zlib-devel \
    && zypper clean --all

# Create the "slurm" account (no home directory), the log directory, and the
# two spool directories, which are owned by slurm with owner rwx permission.
RUN useradd -M slurm \
    && mkdir -p /var/log/slurm \
    && for d in /var/spool/slurmctld /var/spool/slurmd; do \
           mkdir -p "$d" && chown slurm "$d" && chmod u+rwx "$d" || exit 1; \
       done


# Download, build and install Slurm. Absolute paths are used so nothing here
# depends on the current working directory (no WORKDIR is set, so "/" matches
# the location the relative paths resolved to before).
COPY install_slurm.sh /install_slurm.sh

RUN /install_slurm.sh "${SLURM_VERSION}" "${SLURM_ROOT}" "${SLURM_CONFDIR}" --enable-multiple-slurmd

# Configuration inputs. The trailing slash marks the destination as a
# directory; COPY creates it if the install step did not.
COPY cgroup.conf slurm.conf.in ${SLURM_CONFDIR}/

# entrypoint.sh generates slurm.conf, starts the daemons and finally exec's the
# command (default: bash). An absolute ENTRYPOINT keeps working when the
# container is started with a different working directory (docker run -w ...).
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["bash"]

#COPY run_slurm_examples example.job mpi_example.job plugin.cpp mpi_hello.c .

5 changes: 5 additions & 0 deletions slurm-22.05.6/cgroup.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# cgroup.conf for the Slurm 22.05.6 container image; the Dockerfile copies it
# into ${SLURM_CONFDIR}.
# NOTE(review): slurm.conf.in selects proctrack/linuxproc and task/affinity, so
# no cgroup-based plugin appears to be enabled by default - confirm which of
# these settings actually take effect.
CgroupAutomount=yes
# Core and RAM constraints are both switched off.
ConstrainCores=no
ConstrainRAMSpace=no
CgroupMountpoint=/sys/fs/cgroup
# Selects the cgroup v1 plugin.
CgroupPlugin=cgroup/v1
88 changes: 88 additions & 0 deletions slurm-22.05.6/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash
#
# Container entrypoint: generates slurm.conf from the template, starts
# slurmctld plus SLURM_NUMNODES slurmd instances on this host, then exec's the
# command given to the container.
#
# Reads SLURM_ROOT and SLURM_CONFDIR from the environment (set by the
# Dockerfile). Every other setting below can be overridden via the environment.

# Helper daemons started before Slurm.
# NOTE(review): the output of dbus-launch is not captured - confirm that
# nothing needs the session address it prints.
dbus-launch
sudo -u munge munged

# Load the MPICH environment script.
. /usr/lib64/mpi/gcc/mpich/bin/mpivars.sh

# The ': "${VAR=default}"' form assigns the default only when VAR is unset, so
# values passed in with "docker run -e VAR=..." are kept.

# Template that is read and configuration file that is written
: "${SLURM_CONF_IN=$SLURM_CONFDIR/slurm.conf.in}"
: "${SLURM_CONF=$SLURM_CONFDIR/slurm.conf}"

# Default number of slurm nodes
: "${SLURM_NUMNODES=3}"

# Default slurm controller
: "${SLURMCTLD_HOST=$HOSTNAME}"
: "${SLURMCTLD_ADDR=127.0.0.1}"

# Default node info
: "${NODE_HOST=$HOSTNAME}"
: "${NODE_ADDR=127.0.0.1}"
: "${NODE_BASEPORT=6001}"

# Default hardware profile
: "${NODE_HW=CPUs=4}"

# Generate node names and associated ports: nd[00001-0000N] and one port per
# node starting at NODE_BASEPORT. Expansions are quoted so the values are
# always passed to printf as single words.
NODE_NAMES=$(printf "nd[%05i-%05i]" 1 "$SLURM_NUMNODES")
NODE_PORTS=$(printf "%i-%i" "$NODE_BASEPORT" "$((NODE_BASEPORT + SLURM_NUMNODES - 1))")


# Print the effective settings. Each group of KEY=VALUE lines is piped through
# "column -t" so the fields line up.
printf '%s\n' \
    "INFO:" \
    "INFO: Creating $SLURM_CONF with" \
    "INFO: "
printf '%s\n' \
    "INFO: SLURMCTLD_HOST=$SLURMCTLD_HOST SLURMCTLD_ADDR=$SLURMCTLD_ADDR" \
    "INFO: NODE_HOST=$NODE_HOST NODE_ADDR=$NODE_ADDR NODE_BASEPORT=$NODE_BASEPORT" \
    "INFO: NODE_HW=$NODE_HW" \
    "INFO: SLURM_NUMNODES=$SLURM_NUMNODES" \
    | column -t
printf '%s\n' \
    "INFO: " \
    "INFO: Derived values:" \
    "INFO:"
printf '%s\n' \
    "INFO: NODE_NAMES=$NODE_NAMES" \
    "INFO: NODE_PORTS=$NODE_PORTS" \
    | column -t
printf '%s\n' \
    "INFO:" \
    "INFO: Override any of the non-derived values by setting the respective environment variable" \
    "INFO: when starting Docker." \
    "INFO:"

# Make the Slurm installation visible to this script and to the command that
# is exec'ed at the end.
export PATH=$SLURM_ROOT/bin:$PATH
# Append the previous value only when there is one: an empty element in
# LD_LIBRARY_PATH makes the dynamic loader search the current directory.
export LD_LIBRARY_PATH=$SLURM_ROOT/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
# Left unchanged: for MANPATH a trailing ":" asks man to append its default
# search path, so it must not be stripped.
export MANPATH=$SLURM_ROOT/man:$MANPATH

# Build slurm.conf: sed reads the template first and then the two generated
# lines from stdin ("-"), so the node and partition definitions are appended
# at the end. The controller placeholders of the template are substituted on
# the way through.
{
    echo "NodeName=${NODE_NAMES} NodeHostname=${NODE_HOST} NodeAddr=${NODE_ADDR} Port=${NODE_PORTS} State=UNKNOWN ${NODE_HW}"
    echo "PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP"
} \
    | sed -e "s/SLURMCTLDHOST/${SLURMCTLD_HOST}/" \
          -e "s/SLURMCTLDADDR/${SLURMCTLD_ADDR}/" \
          "$SLURM_CONF_IN" - \
    > "$SLURM_CONF"

# Expand the bracketed node range into individual host names. NODE_NAMES is
# quoted because it contains "[...]", which the shell would otherwise treat as
# a pathname pattern.
NODE_NAME_LIST=$(scontrol show hostnames "$NODE_NAMES")

# Map every node name to NODE_ADDR. Runs before the daemons are started.
# NODE_NAME_LIST is left unquoted on purpose: it is split into one name per
# iteration.
for n in $NODE_NAME_LIST
do
    echo "$NODE_ADDR $n" >> /etc/hosts
done

echo
echo "Starting Slurm services..."
echo

# Controller first, then one slurmd per node name.
"$SLURM_ROOT/sbin/slurmctld"

for n in $NODE_NAME_LIST
do
    "$SLURM_ROOT/sbin/slurmd" -N "$n"
done

# Show the resulting cluster state.
echo
sinfo
echo
echo

# Replace this shell with the command passed to the container.
exec "$@"
66 changes: 66 additions & 0 deletions slurm-22.05.6/install_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash -x
#
# Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconf-dir> [configure-args]
#
# Downloads slurm-<slurm-version>.tar.bz2 into /opt/src (unless it is already
# there), unpacks it, and builds it out of tree in
# /opt/build/slurm-<slurm-version> with --prefix=<install-prefix> and
# --sysconfdir=<sysconf-dir>. Any further arguments are passed to configure.
# If the only extra argument is NO_BUILD, the script stops after unpacking.
#
# Exits with status 1 if the arguments are incomplete or if the download,
# unpack, configure, build or install step fails.
#

SLURM_VERSION=$1
SLURM_ROOT=$2
SLURM_CONFDIR=$3

if [ -z "$SLURM_VERSION" ] || [ -z "$SLURM_ROOT" ] || [ -z "$SLURM_CONFDIR" ];
then
    echo "Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconf-dir> [configure-args]"
    echo "No Slurm version or install-prefix specified on command line. Aborting."
    exit 1
fi

shift 3
# Remaining arguments, flattened into one string. It is expanded unquoted at
# the configure call so that it splits back into separate options.
ARGS=$*

slurm_tar_file=slurm-${SLURM_VERSION}.tar.bz2
slurm_url=https://download.schedmd.com/slurm/${slurm_tar_file}
src_dir=/opt/src/slurm-${SLURM_VERSION}
build_dir=/opt/build/slurm-${SLURM_VERSION}

#
# Download slurm tarball and unpack it
#
mkdir -p /opt/src || exit 1
(
    cd /opt/src || exit 1

    if ! stat "$slurm_tar_file"; then
        echo "=== downloading slurm ${SLURM_VERSION} from ${slurm_url}"
        curl --fail --output "${slurm_tar_file}" "${slurm_url}" || exit 1
    fi

    echo "=== unpacking $slurm_tar_file"
    tar -xjf "${slurm_tar_file}" || exit 1
) || exit 1   # "exit" inside ( ... ) only leaves the subshell; propagate it

if [ "$ARGS" = "NO_BUILD" ];
then
    exit 0
fi

#
# Remove any old build directory.
# Run configure, make, make install
#
rm -rf "$build_dir"
mkdir -p "$build_dir" || exit 1
(
    cd "$build_dir" || exit 1
    # Print the available configure options into the build log.
    "$src_dir/configure" --help
    # shellcheck disable=SC2086  # ARGS is split into words on purpose
    "$src_dir/configure" \
        --prefix="${SLURM_ROOT}" \
        --sysconfdir="${SLURM_CONFDIR}" \
        --disable-dependency-tracking \
        $ARGS || exit 1

    make -j4 && make install
) || exit 1

155 changes: 155 additions & 0 deletions slurm-22.05.6/slurm.conf.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
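#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
# NOTE: in this container the file is a template. entrypoint.sh fills in the
# placeholders on the SlurmctldHost line, appends the NodeName and
# PartitionName lines, and writes the result to slurm.conf.
#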
ClusterName=cluster
SlurmctldHost=SLURMCTLDHOST(SLURMCTLDADDR)
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=pmi2
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#ProctrackType=proctrack/cgroup
ProctrackType=proctrack/linuxproc
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.%n.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd.%n
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=debug2
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurm/slurmd.%n.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
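# COMPUTE NODES
# The active NodeName and PartitionName lines are generated by entrypoint.sh
# and appended after this template. The commented lines below only illustrate
# the format; the generated node names have the form nd[00001-0000N].
#NodeName=nd[1-3] NodeHostname=DOCKER_HOSTNAME NodeAddr=127.0.0.1 Port=[6001-6003] CPUs=4 State=UNKNOWN
#PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP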