diff --git a/.github/workflows/cscs.yml b/.github/workflows/cscs.yml
index ce81990..18ab062 100644
--- a/.github/workflows/cscs.yml
+++ b/.github/workflows/cscs.yml
@@ -15,6 +15,7 @@ jobs:
           - 21.08.8-2
           - 22.05.2
           - 22.05.5
+          - 22.05.6
     env:
       tag: ghcr.io/eth-cscs/slurm-container-${{ matrix.version }}:latest
 
diff --git a/slurm-22.05.6/Dockerfile b/slurm-22.05.6/Dockerfile
new file mode 100644
index 0000000..635e42f
--- /dev/null
+++ b/slurm-22.05.6/Dockerfile
@@ -0,0 +1,79 @@
+FROM --platform=linux/amd64 opensuse/leap:15.3
+
+# Build-time parameters: Slurm release, install prefix and configuration directory.
+ARG SLURM_VERSION=22.05.6
+ARG SLURM_ROOT=/opt/slurm-${SLURM_VERSION}
+ARG SLURM_CONFDIR=${SLURM_ROOT}/etc
+
+# Persist the build arguments into the container environment; entrypoint.sh
+# reads SLURM_ROOT and SLURM_CONFDIR at start-up.
+ENV SLURM_VERSION=${SLURM_VERSION}
+ENV SLURM_ROOT=${SLURM_ROOT}
+ENV SLURM_CONFDIR=${SLURM_CONFDIR}
+
+
+# Libraries, headers and tools needed to build Slurm and run it in the container.
+RUN zypper install -y \
+    munge \
+    munge-devel \
+    libnuma1 \
+    libnuma-devel \
+    librrd8 \
+    readline-devel \
+    hwloc \
+    hwloc-devel \
+    hdf5 \
+    hdf5-devel \
+    lz4 \
+    liblz4-devel \
+    libz1 \
+    zlib-devel \
+    freeipmi \
+    freeipmi-devel \
+    dbus-1 \
+    dbus-1-devel \
+    make \
+    gcc \
+    gcc-c++ \
+    curl \
+    tar \
+    bzip2 \
+    python3 \
+    vim \
+    ca-certificates \
+    less \
+    mpich \
+    mpich-devel \
+    sudo
+
+RUN zypper install -y \
+    lua53 \
+    lua53-devel \
+    libmount-devel
+
+# Account named by SlurmUser in slurm.conf.in; -M skips creating a home directory.
+RUN useradd -M slurm
+
+# Log and spool directories for the Slurm daemons.
+RUN mkdir -p /var/log/slurm
+RUN mkdir -p /var/spool/slurmctld && chown slurm /var/spool/slurmctld && chmod u+rwx /var/spool/slurmctld
+RUN mkdir -p /var/spool/slurmd && chown slurm /var/spool/slurmd && chmod u+rwx /var/spool/slurmd
+
+
+# Download, build and install Slurm; arguments after the third are passed to configure.
+COPY install_slurm.sh .
+
+RUN ./install_slurm.sh ${SLURM_VERSION} ${SLURM_ROOT} ${SLURM_CONFDIR} --enable-multiple-slurmd
+
+# Static configuration inputs; entrypoint.sh generates slurm.conf from slurm.conf.in.
+RUN mkdir -p ${SLURM_CONFDIR}
+COPY cgroup.conf ${SLURM_CONFDIR}
+COPY slurm.conf.in ${SLURM_CONFDIR}
+
+# Start the Slurm daemons, then exec the requested command (bash by default).
+COPY entrypoint.sh .
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["bash"]
+
+#COPY run_slurm_examples example.job mpi_example.job plugin.cpp mpi_hello.c .
+
diff --git a/slurm-22.05.6/cgroup.conf b/slurm-22.05.6/cgroup.conf
new file mode 100644
index 0000000..102e318
--- /dev/null
+++ b/slurm-22.05.6/cgroup.conf
@@ -0,0 +1,5 @@
+CgroupAutomount=yes
+ConstrainCores=no
+ConstrainRAMSpace=no
+CgroupMountpoint=/sys/fs/cgroup
+CgroupPlugin=cgroup/v1
diff --git a/slurm-22.05.6/entrypoint.sh b/slurm-22.05.6/entrypoint.sh
new file mode 100755
index 0000000..3a9ef4b
--- /dev/null
+++ b/slurm-22.05.6/entrypoint.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+#
+# Container entrypoint: start dbus and munge, generate slurm.conf from the
+# template, start slurmctld and one slurmd per emulated node, then exec the
+# command passed to the container.
+#
+
+dbus-launch
+sudo -u munge munged
+
+. /usr/lib64/mpi/gcc/mpich/bin/mpivars.sh
+
+: "${SLURM_CONF_IN=$SLURM_CONFDIR/slurm.conf.in}"
+: "${SLURM_CONF=$SLURM_CONFDIR/slurm.conf}"
+
+# Default number of slurm nodes
+: "${SLURM_NUMNODES=3}"
+
+# Default slurm controller
+: "${SLURMCTLD_HOST=$HOSTNAME}"
+: "${SLURMCTLD_ADDR=127.0.0.1}"
+
+# Default node info
+: "${NODE_HOST=$HOSTNAME}"
+: "${NODE_ADDR=127.0.0.1}"
+: "${NODE_BASEPORT=6001}"
+
+# Default hardware profile
+: "${NODE_HW=CPUs=4}"
+
+# Generate node names and associated ports
+NODE_NAMES=$(printf "nd[%05i-%05i]" 1 $SLURM_NUMNODES)
+NODE_PORTS=$(printf "%i-%i" $NODE_BASEPORT $(($NODE_BASEPORT+$SLURM_NUMNODES-1)))
+
+
+echo "INFO:"
+echo "INFO: Creating $SLURM_CONF with"
+echo "INFO: "
+column -t <<-EOF
+INFO: SLURMCTLD_HOST=$SLURMCTLD_HOST SLURMCTLD_ADDR=$SLURMCTLD_ADDR
+INFO: NODE_HOST=$NODE_HOST NODE_ADDR=$NODE_ADDR NODE_BASEPORT=$NODE_BASEPORT
+INFO: NODE_HW=$NODE_HW
+INFO: SLURM_NUMNODES=$SLURM_NUMNODES
+EOF
+echo "INFO: "
+echo "INFO: Derived values:"
+echo "INFO:"
+column -t <<-EOF
+INFO: NODE_NAMES=$NODE_NAMES
+INFO: NODE_PORTS=$NODE_PORTS
+EOF
+echo "INFO:"
+echo "INFO: Override any of the non-derived values by setting the respective environment variable"
+echo "INFO: when starting Docker."
+echo "INFO:"
+
+export PATH=$SLURM_ROOT/bin:$PATH
+# Append the previous value only when it is non-empty, to avoid a trailing empty entry.
+export LD_LIBRARY_PATH=$SLURM_ROOT/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+export MANPATH=$SLURM_ROOT/man:$MANPATH
+
+# sed reads the template followed by the generated node/partition lines (stdin, '-')
+# and substitutes the controller placeholders while writing slurm.conf.
+(
+  echo "NodeName=${NODE_NAMES} NodeHostname=${NODE_HOST} NodeAddr=${NODE_ADDR} Port=${NODE_PORTS} State=UNKNOWN ${NODE_HW}"
+  echo "PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP"
+) \
+| sed -e "s/SLURMCTLDHOST/${SLURMCTLD_HOST}/" \
+      -e "s/SLURMCTLDADDR/${SLURMCTLD_ADDR}/" \
+      "$SLURM_CONF_IN" - \
+> "$SLURM_CONF"
+
+# Quoted so the brackets of the node range are not treated as a glob pattern.
+NODE_NAME_LIST=$(scontrol show hostnames "$NODE_NAMES")
+
+# Map every emulated node name to NODE_ADDR.
+for n in $NODE_NAME_LIST
+do
+  echo "$NODE_ADDR $n" >> /etc/hosts
+done
+
+echo
+echo "Starting Slurm services..."
+echo
+
+$SLURM_ROOT/sbin/slurmctld
+
+# One slurmd per emulated node name.
+for n in $NODE_NAME_LIST
+do
+  "$SLURM_ROOT/sbin/slurmd" -N "$n"
+done
+
+echo
+sinfo
+echo
+echo
+
+exec "$@"
diff --git a/slurm-22.05.6/install_slurm.sh b/slurm-22.05.6/install_slurm.sh
new file mode 100755
index 0000000..425f1f4
--- /dev/null
+++ b/slurm-22.05.6/install_slurm.sh
@@ -0,0 +1,72 @@
+#!/bin/bash -x
+#
+# Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconfdir> [configure-args]
+#
+# Downloads the Slurm tarball into /opt/src (unless already present), unpacks it,
+# then configures, builds and installs it from /opt/build/slurm-<slurm-version>.
+# If the configure arguments are exactly NO_BUILD, the script stops after unpacking.
+#
+
+SLURM_VERSION=$1
+SLURM_ROOT=$2
+SLURM_CONFDIR=$3
+shift; shift; shift
+ARGS=$*
+
+slurm_tar_file=slurm-${SLURM_VERSION}.tar.bz2
+slurm_url=https://download.schedmd.com/slurm/${slurm_tar_file}
+
+
+if [ -z "$SLURM_VERSION" ] || [ -z "$SLURM_ROOT" ] || [ -z "$SLURM_CONFDIR" ];
+then
+  echo "Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconfdir> [configure-args]"
+  echo "No Slurm version or install-prefix specified on command line. Aborting."
+  exit 1
+fi
+
+#
+# Download slurm tarball and unpack it
+#
+if true; then
+
+  mkdir -p /opt/src || exit 1
+  # Work in a subshell so the cd stays local. 'exit' inside ( ... ) only ends the
+  # subshell, so its failure is propagated with '|| exit 1' below.
+  (
+    cd /opt/src || exit 1
+
+    if ! stat "$slurm_tar_file"; then
+      echo "=== downloading slurm ${SLURM_VERSION} from ${slurm_url}"
+      curl --fail --output "${slurm_tar_file}" "${slurm_url}" || exit 1
+    fi
+
+    echo "=== unpacking $slurm_tar_file"
+    tar -xjf "${slurm_tar_file}" || exit 1
+  ) || exit 1
+
+fi
+
+if [ "$ARGS" = "NO_BUILD" ];
+then
+  exit 0
+fi
+
+#
+# Remove any old build directory.
+# Run configure, make, make install
+#
+
+stat /opt/build/slurm-${SLURM_VERSION} && rm -rf /opt/build/slurm-${SLURM_VERSION}
+mkdir -p /opt/build/slurm-${SLURM_VERSION} || exit 1
+(
+  cd /opt/build/slurm-${SLURM_VERSION} || exit 1
+  /opt/src/slurm-${SLURM_VERSION}/configure --help
+  /opt/src/slurm-${SLURM_VERSION}/configure \
+    --prefix=${SLURM_ROOT} \
+    --sysconfdir=${SLURM_CONFDIR} \
+    --disable-dependency-tracking \
+    $ARGS || exit 1
+
+  make -j4 && make install
+)
+
diff --git a/slurm-22.05.6/slurm.conf.in b/slurm-22.05.6/slurm.conf.in
new file mode 100644
index 0000000..471ae3f
--- /dev/null
+++ b/slurm-22.05.6/slurm.conf.in
@@ -0,0 +1,155 @@
+#
+# Example slurm.conf file. Please run configurator.html
+# (in doc/html) to build a configuration file customized
+# for your environment.
+#
+#
+# slurm.conf file generated by configurator.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+ClusterName=cluster
+SlurmctldHost=SLURMCTLDHOST(SLURMCTLDADDR)
+#SlurmctldHost=
+#
+#DisableRootJobs=NO
+#EnforcePartLimits=NO
+#Epilog=
+#EpilogSlurmctld=
+#FirstJobId=1
+#MaxJobId=67043328
+#GresTypes=
+#GroupUpdateForce=0
+#GroupUpdateTime=600
+#JobFileAppend=0
+#JobRequeue=1
+#JobSubmitPlugins=lua
+#KillOnBadExit=0
+#LaunchType=launch/slurm
+#Licenses=foo*4,bar
+#MailProg=/bin/mail
+#MaxJobCount=10000
+#MaxStepCount=40000
+#MaxTasksPerNode=512
+MpiDefault=pmi2
+#MpiParams=ports=#-#
+#PluginDir=
+#PlugStackConfig=
+#PrivateData=jobs
+#ProctrackType=proctrack/cgroup
+ProctrackType=proctrack/linuxproc
+#Prolog=
+#PrologFlags=
+#PrologSlurmctld=
+#PropagatePrioProcess=0
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#RebootProgram=
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.%n.pid
+SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd.%n
+SlurmUser=slurm
+#SlurmdUser=root
+#SrunEpilog=
+#SrunProlog=
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+#TaskEpilog=
+TaskPlugin=task/affinity
+#TaskProlog=
+#TopologyPlugin=topology/tree
+#TmpFS=/tmp
+#TrackWCKey=no
+#TreeWidth=
+#UnkillableStepProgram=
+#UsePAM=0
+#
+#
+# TIMERS
+#BatchStartTimeout=10
+#CompleteWait=0
+#EpilogMsgTime=2000
+#GetEnvTimeout=2
+#HealthCheckInterval=0
+#HealthCheckProgram=
+InactiveLimit=0
+KillWait=30
+#MessageTimeout=10
+#ResvOverRun=0
+MinJobAge=300
+#OverTimeLimit=0
+SlurmctldTimeout=120
+SlurmdTimeout=300
+#UnkillableStepTimeout=60
+#VSizeFactor=0
+Waittime=0
+#
+#
+# SCHEDULING
+#DefMemPerCPU=0
+#MaxMemPerCPU=0
+#SchedulerTimeSlice=30
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+SelectTypeParameters=CR_CPU
+#
+#
+# JOB PRIORITY
+#PriorityFlags=
+#PriorityType=priority/basic
+#PriorityDecayHalfLife=
+#PriorityCalcPeriod=
+#PriorityFavorSmall=
+#PriorityMaxAge=
+#PriorityUsageResetPeriod=
+#PriorityWeightAge=
+#PriorityWeightFairshare=
+#PriorityWeightJobSize=
+#PriorityWeightPartition=
+#PriorityWeightQOS=
+#
+#
+# LOGGING AND ACCOUNTING
+#AccountingStorageEnforce=0
+#AccountingStorageHost=
+#AccountingStoragePass=
+#AccountingStoragePort=
+AccountingStorageType=accounting_storage/none
+#AccountingStorageUser=
+#AccountingStoreFlags=
+#JobCompHost=
+#JobCompLoc=
+#JobCompPass=
+#JobCompPort=
+JobCompType=jobcomp/none
+#JobCompUser=
+#JobContainerType=job_container/none
+JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+SlurmctldDebug=debug2
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=debug2
+SlurmdLogFile=/var/log/slurm/slurmd.%n.log
+#SlurmSchedLogFile=
+#SlurmSchedLogLevel=
+#DebugFlags=
+#
+#
+# POWER SAVE SUPPORT FOR IDLE NODES (optional)
+#SuspendProgram=
+#ResumeProgram=
+#SuspendTimeout=
+#ResumeTimeout=
+#ResumeRate=
+#SuspendExcNodes=
+#SuspendExcParts=
+#SuspendRate=
+#SuspendTime=
+#
+#
+# COMPUTE NODES
+#NodeName=nd[1-3] NodeHostname=DOCKER_HOSTNAME NodeAddr=127.0.0.1 Port=[6001-6003] CPUs=4 State=UNKNOWN
+#PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP