From 09295ade16be114c50b359624d3ee00718fc48fd Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Aug 2023 11:51:32 +0000 Subject: [PATCH 1/7] slurmd nodes add themselves with appropriate config on startup --- image/docker-entrypoint.sh | 5 +++++ slurm-cluster-chart/files/slurm.conf | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index f2b0bbc..8846b7a 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -77,6 +77,11 @@ then done echo "-- slurmctld is now active ..." + echo "---> Adding myself to node definitions ..." + SLURMD_CONFIG=$(slurmd -C | head --lines 1) + echo ${SLURMD_CONFIG} + scontrol create ${SLURMD_CONFIG} State=FUTURE + echo "---> Starting the Slurm Node Daemon (slurmd) ..." exec /usr/sbin/slurmd -D "${@:2}" diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index a10c12b..711ce71 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -52,7 +52,6 @@ CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=FUTURE CPUs=4 # PARTITIONS PartitionName=all Default=yes Nodes=ALL From 4ed0854e731481bb1266a01ac0e151386564402a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Aug 2023 14:37:03 +0000 Subject: [PATCH 2/7] set TreeWidth --- slurm-cluster-chart/files/slurm.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 711ce71..a84dbe8 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -52,6 +52,8 @@ CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 +NodeName=slurmd-[0-9] State=FUTURE +TreeWidth=65533 # PARTITIONS PartitionName=all Default=yes Nodes=ALL From 0abc891a80c15d912823d2bea91be32afbd67fe8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Aug 2023 14:37:27 +0000 Subject: [PATCH 3/7] automatically delete/create node with correct info --- image/docker-entrypoint.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 8846b7a..3e0df92 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -77,10 +77,9 @@ then done echo "-- slurmctld is now active ..." - echo "---> Adding myself to node definitions ..." - SLURMD_CONFIG=$(slurmd -C | head --lines 1) - echo ${SLURMD_CONFIG} - scontrol create ${SLURMD_CONFIG} State=FUTURE + echo "---> Updating node definitions ..." + scontrol delete node=$HOSTNAME + scontrol create $(slurmd -C | head -n1) State=FUTURE echo "---> Starting the Slurm Node Daemon (slurmd) ..." exec /usr/sbin/slurmd -D "${@:2}" From 0048754363beb39874005bc2b565ae16bba00b73 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Aug 2023 15:09:16 +0000 Subject: [PATCH 4/7] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index b20a2b3..afb70ac 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7c0e2d9 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:0abc891 login: # Deployment resource name From 23aecda6e4d644aac53920f9daa290b431c2a131 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 13:08:16 +0000 Subject: [PATCH 5/7] fix slurmd entrypoint now hostname!=pod name --- image/docker-entrypoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index d740393..2f7c716 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -78,8 +78,8 @@ then echo "-- slurmctld is now active ..." echo "---> Updating node definitions ..." - scontrol delete node=$HOSTNAME - scontrol create $(slurmd -C | head -n1) State=FUTURE + scontrol delete node=${POD_NAME} + scontrol create NodeName=${POD_NAME} $(slurmd -C | head -n1 | cut -d ' ' -f 2-) State=FUTURE echo "---> Starting the Slurm Node Daemon (slurmd) ..." exec /usr/sbin/slurmd -D "${@:2}" From e200093ea2a35b82e872a7dfe53de3299580fc94 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 14:24:07 +0000 Subject: [PATCH 6/7] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 3d41248..272fc4c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:23aecda login: # Deployment resource name From 1a3082ae2def612f409c08991cc4323a6c529add Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 18 Aug 2023 10:23:58 +0000 Subject: [PATCH 7/7] update comments --- slurm-cluster-chart/templates/slurmd.yaml | 2 +- slurm-cluster-chart/values.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index bec55ce..66638f2 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -36,7 +36,7 @@ spec: name: slurmd ports: - containerPort: 6818 - hostPort: 6818 + hostPort: 6818 # used to ensure only a single pod per k8s node resources: {} volumeMounts: - mountPath: /etc/slurm/ diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 272fc4c..2acf0dd 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -7,8 +7,8 @@ login: slurmd: # StatefulSet resource name - name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf - replicas: 2 + name: slurmd # NB must match prefix of NodeName= definition in slurm-cluster-chart/files/slurm.conf + replicas: 2 # NB must be <= number of nodes in NodeName= definition in slurm-cluster-chart/files/slurm.conf slurmctld: # StatefulSet resource name