From 57bdbed46da2ee2f56f9c3b47f6bf13afb586666 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 14 Jul 2023 10:14:45 +0000 Subject: [PATCH 001/103] fix NFS setup for paths --- README.md | 2 +- nfs/deploy-nfs.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 92183f5..34abe12 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ This generates a set of secrets. If these need to be regenerated, see "Reconfigu A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. You can deploy this by running ```console -/nfs/deploy-nfs.sh +./nfs/deploy-nfs.sh ``` and leaving `nfs.claimName` as the provided value. diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh index d46b50f..b2d2f75 100755 --- a/nfs/deploy-nfs.sh +++ b/nfs/deploy-nfs.sh @@ -3,9 +3,9 @@ # Based on https://rook.io/docs/nfs/v1.7/quickstart.html # Manifests listed explicitly here to guarantee ordering -kubectl create -f crds.yaml -kubectl create -f operator.yaml -kubectl create -f rbac.yaml -kubectl create -f nfs.yaml -kubectl create -f sc.yaml -kubectl create -f pvc.yaml +kubectl create -f nfs/crds.yaml +kubectl create -f nfs/operator.yaml +kubectl create -f nfs/rbac.yaml +kubectl create -f nfs/nfs.yaml +kubectl create -f nfs/sc.yaml +kubectl create -f nfs/pvc.yaml From 577d97511961991a0d5d7be9bc86476401b239fc Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 09:51:01 +0100 Subject: [PATCH 002/103] Changed state directory in image and extended list of job states preventing upgrades --- docker-entrypoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index ee12fab..0bad2f8 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -47,7 +47,7 @@ then echo "-- slurmdbd is now active ..." 
echo "---> Setting permissions for state directory ..." - chown slurm:slurm /var/lib/slurmd + chown slurm:slurm /var/spool/slurmctld echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." if /usr/sbin/slurmctld -V | grep -q '17.02' ; then @@ -110,7 +110,7 @@ then gosu munge /usr/sbin/munged echo "---> MUNGE Complete" - RUNNING_JOBS=$(squeue -t pd,r,cg -h -r | wc -l) + RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --no-header --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then From 70d549777ebcfebc5e1aad062bc9ac82f69cef00 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 10:01:55 +0100 Subject: [PATCH 003/103] Fixed typo --- docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 0bad2f8..affee77 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -110,7 +110,7 @@ then gosu munge /usr/sbin/munged echo "---> MUNGE Complete" - RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --no-header --array | wc --lines) + RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then From 9210d649fff1493f136d9dc12fc669e4800cc74b Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 17 Jul 2023 10:08:26 +0100 Subject: [PATCH 004/103] Changed details for mounting slurmctld state directory --- slurm-cluster-chart/files/slurm.conf | 2 +- slurm-cluster-chart/templates/slurmctld-deployment.yaml | 8 ++++---- ...md-pvcclaim.yaml => var-spool-slurmctld-pvcclaim.yaml} | 4 ++-- slurm-cluster-chart/values.yaml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) rename slurm-cluster-chart/templates/{var-lib-slurmd-pvcclaim.yaml => var-spool-slurmctld-pvcclaim.yaml} (82%) diff --git a/slurm-cluster-chart/files/slurm.conf 
b/slurm-cluster-chart/files/slurm.conf index eda002f..fb9e61d 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -9,7 +9,7 @@ SlurmUser=slurm SlurmctldPort=6817 SlurmdPort=6818 AuthType=auth/munge -StateSaveLocation=/var/lib/slurmd +StateSaveLocation=/var/spool/slurmctld SlurmdSpoolDir=/var/spool/slurmd SwitchType=switch/none MpiDefault=pmix diff --git a/slurm-cluster-chart/templates/slurmctld-deployment.yaml b/slurm-cluster-chart/templates/slurmctld-deployment.yaml index d42e425..cfa0cac 100644 --- a/slurm-cluster-chart/templates/slurmctld-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmctld-deployment.yaml @@ -36,17 +36,17 @@ spec: - mountPath: /tempmounts/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /var/lib/slurmd - name: var-lib-slurmd + - mountPath: /var/spool/slurmctld + name: slurmctld-state hostname: slurmctld restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: claimName: {{ .Values.nfs.claimName }} - - name: var-lib-slurmd + - name: slurmctld-state persistentVolumeClaim: - claimName: var-lib-slurmd + claimName: var-spool-slurmctld - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml b/slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml similarity index 82% rename from slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml rename to slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml index 5879b34..de733a0 100644 --- a/slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml +++ b/slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml @@ -5,10 +5,10 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: var-lib-slurmd + name: var-spool-slurmctld spec: accessModes: - ReadWriteOnce resources: requests: - storage: 1Gi + storage: 100Mi diff --git a/slurm-cluster-chart/values.yaml 
b/slurm-cluster-chart/values.yaml index 92e5088..ae4b7a1 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:05bbb87 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:70d5497 replicas: slurmd: 2 From 876670d414106ea47eade84721d886379a888a40 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 18 Jul 2023 10:50:39 +0100 Subject: [PATCH 005/103] Update publish-helm-chart workflow for master branch change --- .github/workflows/publish-helm-chart.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 7ad4374..8ce0698 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -3,7 +3,7 @@ name: Release Charts on: push: branches: - - master + - main jobs: release: From 3227860aa3be10e09a0aace777c42cb92b76901b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Jul 2023 10:38:18 +0000 Subject: [PATCH 006/103] remove slurmd service --- .../templates/slurmd-service.yaml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 slurm-cluster-chart/templates/slurmd-service.yaml diff --git a/slurm-cluster-chart/templates/slurmd-service.yaml b/slurm-cluster-chart/templates/slurmd-service.yaml deleted file mode 100644 index bec3d90..0000000 --- a/slurm-cluster-chart/templates/slurmd-service.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - creationTimestamp: null - labels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd - name: slurmd -spec: - ports: - - name: "6818" - port: 6818 - targetPort: 6818 - selector: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd From 4aa1d7df9f05b937d0bfb9ce7f4002557f00f7d1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Jul 2023 10:38:34 +0000 Subject: [PATCH 
007/103] remove slurmctld hostname --- slurm-cluster-chart/templates/slurmctld-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/templates/slurmctld-deployment.yaml b/slurm-cluster-chart/templates/slurmctld-deployment.yaml index cfa0cac..6f87b75 100644 --- a/slurm-cluster-chart/templates/slurmctld-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmctld-deployment.yaml @@ -38,7 +38,7 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - hostname: slurmctld + # hostname: slurmctld restartPolicy: Always volumes: - name: slurm-jobdir From a34001b835b9c296cf186bd2162dce6603a01436 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Jul 2023 10:38:51 +0000 Subject: [PATCH 008/103] use statefulset for slurmd --- slurm-cluster-chart/templates/slurmd-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index 55f0a5e..1d5d64d 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -1,5 +1,5 @@ apiVersion: apps/v1 -kind: Deployment +kind: StatefulSet metadata: creationTimestamp: null labels: From b09c6c24cf955b549d7ccf010d69ceaa9b01ca37 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Jul 2023 10:39:14 +0000 Subject: [PATCH 009/103] use dynamic future nodes --- docker-entrypoint.sh | 2 +- slurm-cluster-chart/files/slurm.conf | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index affee77..4ec1026 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -78,7 +78,7 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." 
- exec /usr/sbin/slurmd -Z -Dvvv + exec /usr/sbin/slurmd -F -Dvvv fi if [ "$1" = "login" ] diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index fb9e61d..6cd77ad 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -47,8 +47,13 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -# +SlurmctldParameters=cloud_reg_addrs +#CommunicationParameters=NoAddrCache + +# NODES MaxNodeCount=10 +NodeName=slurmd-[0-1] State=FUTURE + # PARTITIONS PartitionName=all Default=yes Nodes=ALL TreeWidth=65533 From 009ecee123b7a2139d796fb4ad34ce25b3d578a5 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 15:45:04 +0100 Subject: [PATCH 010/103] Removed invisible debug messages from job container --- docker-entrypoint.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index affee77..467fb9d 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -114,10 +114,8 @@ then if [[ $RUNNING_JOBS -eq 0 ]] then - echo "No Slurm jobs in queue, can safely upgrade" exit 0 else - echo "Error: cannot upgrade chart - there are still Slurm jobs in the queue" exit 1 fi fi From 1d9b1802d16dd5a191150a61f1eaf0a9b2ff69ed Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 18 Jul 2023 15:47:53 +0100 Subject: [PATCH 011/103] Updated image, removed unnescessary field from slurmctld and renamed --- .../{slurmctld-deployment.yaml => slurmctld-statefulset.yaml} | 1 - slurm-cluster-chart/values.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) rename slurm-cluster-chart/templates/{slurmctld-deployment.yaml => slurmctld-statefulset.yaml} (98%) diff --git a/slurm-cluster-chart/templates/slurmctld-deployment.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml similarity index 98% rename from slurm-cluster-chart/templates/slurmctld-deployment.yaml rename to 
slurm-cluster-chart/templates/slurmctld-statefulset.yaml index cfa0cac..5bce036 100644 --- a/slurm-cluster-chart/templates/slurmctld-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -38,7 +38,6 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - hostname: slurmctld restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index ae4b7a1..481644c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:70d5497 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:009ecee replicas: slurmd: 2 From 3c2ba621610aa3e3a171017a8831bf72941beed5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 18 Jul 2023 19:12:35 +0000 Subject: [PATCH 012/103] enable dns on slurmd statefulset and use for slurm addressing --- slurm-cluster-chart/files/slurm.conf | 5 +++-- .../templates/login-deployment.yaml | 3 +++ .../templates/slurmctld-deployment.yaml | 4 +++- .../templates/slurmd-deployment.yaml | 6 ++++-- .../templates/slurmd-service.yaml | 17 +++++++++++++++++ slurm-cluster-chart/values.yaml | 2 +- 6 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 slurm-cluster-chart/templates/slurmd-service.yaml diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 6cd77ad..2c3980d 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -47,8 +47,9 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmctldParameters=cloud_reg_addrs -#CommunicationParameters=NoAddrCache +SlurmdTimeout=5 +SlurmctldParameters=cloud_dns,cloud_reg_addrs +CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml 
index 2b49536..ecf4d3a 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -45,6 +45,9 @@ spec: subPath: authorized_keys resources: {} hostname: login + dnsConfig: + searches: + - slurmd.default.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmctld-deployment.yaml b/slurm-cluster-chart/templates/slurmctld-deployment.yaml index 6f87b75..e46dd7b 100644 --- a/slurm-cluster-chart/templates/slurmctld-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmctld-deployment.yaml @@ -38,7 +38,9 @@ spec: subPath: munge.key - mountPath: /var/spool/slurmctld name: slurmctld-state - # hostname: slurmctld + dnsConfig: + searches: + - slurmd.default.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index 1d5d64d..e973e3b 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -12,8 +12,7 @@ spec: matchLabels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - strategy: - type: Recreate + serviceName: slurmd template: metadata: creationTimestamp: null @@ -48,6 +47,9 @@ spec: subPath: munge.key securityContext: privileged: true + dnsConfig: + searches: + - slurmd.default.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmd-service.yaml b/slurm-cluster-chart/templates/slurmd-service.yaml new file mode 100644 index 0000000..a182ffd --- /dev/null +++ b/slurm-cluster-chart/templates/slurmd-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + creationTimestamp: null + labels: + app.kubernetes.io/name: slurm + app.kubernetes.io/component: slurmd + name: slurmd +spec: + ports: + - name: "6818" + port: 6818 + targetPort: 6818 + 
selector: + app.kubernetes.io/name: slurm + app.kubernetes.io/component: slurmd + clusterIP: None diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index ae4b7a1..6e2eb61 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:70d5497 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:b09c6c2 replicas: slurmd: 2 From 1345a581f1fb5f8c1cf83315b9f8247d4a0fce06 Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 19 Jul 2023 16:12:43 +0100 Subject: [PATCH 013/103] Added https (fixes job composer) --- slurm-cluster-chart/files/ood_portal.yaml | 3 +++ slurm-cluster-chart/templates/login-deployment.yaml | 1 + slurm-cluster-chart/templates/login-service.yaml | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 4eee040..9be3295 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -29,6 +29,9 @@ # - 'SSLCertificateKeyFile "/etc/pki/tls/private/www.example.com.key"' # Default: null (no SSL support) #ssl: null +ssl: +- 'SSLCertificateFile "/etc/pki/tls/certs/localhost.crt"' +- 'SSLCertificateKeyFile "/etc/pki/tls/private/localhost.key"' # Root directory of log files (can be relative ServerRoot) # Example: diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 5d15550..1f24e8a 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -35,6 +35,7 @@ spec: ports: - containerPort: 22 - containerPort: 80 + - containerPort: 443 volumeMounts: - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index fee3480..fcc3e49 100644 --- 
a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -15,6 +15,10 @@ spec: port: 80 targetPort: 80 protocol: TCP + - name: "https" + port: 443 + targetPort: 443 + protocol: TCP type: LoadBalancer selector: app.kubernetes.io/name: slurm From f8808dba70a847d8879f47e912d56d7ec2e771f9 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 12:03:10 +0100 Subject: [PATCH 014/103] slurm.conf updates --- slurm-cluster-chart/files/slurm.conf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 2c3980d..4c072a7 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -20,7 +20,7 @@ ReturnToService=2 # # TIMERS SlurmctldTimeout=300 -SlurmdTimeout=300 +SlurmdTimeout=30 InactiveLimit=0 MinJobAge=300 KillWait=30 @@ -47,16 +47,14 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmdTimeout=5 SlurmctldParameters=cloud_dns,cloud_reg_addrs CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 -NodeName=slurmd-[0-1] State=FUTURE +NodeName=slurmd-[0-9] State=FUTURE # PARTITIONS PartitionName=all Default=yes Nodes=ALL -TreeWidth=65533 PropagateResourceLimitsExcept=MEMLOCK From d5f7f563649dbb0d46dd7072f7ca3cab4d962d14 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 13:25:46 +0100 Subject: [PATCH 015/103] Updated with rebuilt image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 66d9dd6..cd9d34d 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:b09c6c2 #OLD CONTAINER NEEDS REBUILD +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e replicas: slurmd: 2 From 
0f286ed3c67afd6dc4cce590d803767eb6316e18 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 15:47:20 +0100 Subject: [PATCH 016/103] Now generates keys for rocky to self-ssh if don't already exist (in image) --- docker-entrypoint.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 75be39a..cba2464 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -84,9 +84,16 @@ fi if [ "$1" = "login" ] then + echo "---> Setting up ssh for user" mkdir -p /home/rocky/.ssh cp tempmounts/authorized_keys /home/rocky/.ssh/authorized_keys + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + fi + echo "---> Setting permissions for user home directories" cd /home for DIR in */; From c0947542a68868591b9ae19f7f58c9385ac81b90 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 15:52:59 +0100 Subject: [PATCH 017/103] Updated image tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 93c606c..93964f0 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:a89e584 +sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:0f286ed replicas: slurmd: 2 From a5b71c24f4c57b939a1c37bea034d6bf0a8a2f80 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 20 Jul 2023 16:26:21 +0100 Subject: [PATCH 018/103] Updated image after merge --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index a0e5fdc..1bef86e 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:0f286ed #CHANGE AFTER REBUILD +sdcImage: 
ghcr.io/stackhpc/slurm-docker-cluster:3daa29f replicas: slurmd: 2 From 5de571412f8e59100250bba69ceffc82ff4e9ec9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 10:28:45 +0000 Subject: [PATCH 019/103] change service port names to name of service --- slurm-cluster-chart/templates/login-service.yaml | 2 +- slurm-cluster-chart/templates/mysql-service.yaml | 2 +- slurm-cluster-chart/templates/slurmctld-service.yaml | 2 +- slurm-cluster-chart/templates/slurmd-service.yaml | 2 +- slurm-cluster-chart/templates/slurmdbd-service.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index 0a38ba4..c6f93c9 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -8,7 +8,7 @@ metadata: name: login spec: ports: - - name: "ssh" + - name: ssh port: 22 targetPort: 22 type: LoadBalancer diff --git a/slurm-cluster-chart/templates/mysql-service.yaml b/slurm-cluster-chart/templates/mysql-service.yaml index 349dfee..a7d58cc 100644 --- a/slurm-cluster-chart/templates/mysql-service.yaml +++ b/slurm-cluster-chart/templates/mysql-service.yaml @@ -8,7 +8,7 @@ metadata: name: mysql spec: ports: - - name: "3306" + - name: mysql port: 3306 targetPort: 3306 selector: diff --git a/slurm-cluster-chart/templates/slurmctld-service.yaml b/slurm-cluster-chart/templates/slurmctld-service.yaml index 9bfc40b..001bcab 100644 --- a/slurm-cluster-chart/templates/slurmctld-service.yaml +++ b/slurm-cluster-chart/templates/slurmctld-service.yaml @@ -8,7 +8,7 @@ metadata: name: slurmctld-0 spec: ports: - - name: "6817" + - name: slurmctld port: 6817 targetPort: 6817 selector: diff --git a/slurm-cluster-chart/templates/slurmd-service.yaml b/slurm-cluster-chart/templates/slurmd-service.yaml index a182ffd..b5884fc 100644 --- a/slurm-cluster-chart/templates/slurmd-service.yaml +++ 
b/slurm-cluster-chart/templates/slurmd-service.yaml @@ -8,7 +8,7 @@ metadata: name: slurmd spec: ports: - - name: "6818" + - name: slurmd port: 6818 targetPort: 6818 selector: diff --git a/slurm-cluster-chart/templates/slurmdbd-service.yaml b/slurm-cluster-chart/templates/slurmdbd-service.yaml index 400dcda..fc0ec8f 100644 --- a/slurm-cluster-chart/templates/slurmdbd-service.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-service.yaml @@ -8,7 +8,7 @@ metadata: name: slurmdbd spec: ports: - - name: "6819" + - name: slurmdbd port: 6819 targetPort: 6819 selector: From ae1f071ac0de5d1087ae7d063e664846ee1eb520 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 10:30:12 +0000 Subject: [PATCH 020/103] make slurm image value name clearer --- slurm-cluster-chart/templates/check-jobs-finished-hook.yaml | 2 +- slurm-cluster-chart/templates/login-deployment.yaml | 2 +- slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 2 +- slurm-cluster-chart/templates/slurmd-deployment.yaml | 2 +- slurm-cluster-chart/templates/slurmdbd-deployment.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml index be70975..f1908f8 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml @@ -15,7 +15,7 @@ spec: restartPolicy: Never containers: - name: check-jobs-finished-hook - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} args: - check-queue-hook volumeMounts: diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index ecf4d3a..781588d 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -24,7 +24,7 @@ spec: containers: - args: - login - image: {{ 
.Values.sdcImage }} + image: {{ .Values.slurmImage }} name: login ports: - containerPort: 22 diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index e46dd7b..27128cd 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -22,7 +22,7 @@ spec: containers: - args: - slurmctld - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} name: slurmctld ports: - containerPort: 6817 diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index e973e3b..81ae1c3 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -31,7 +31,7 @@ spec: containers: - args: - slurmd - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} name: slurmd ports: - containerPort: 6818 diff --git a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml index 2842de0..d9ebf4a 100644 --- a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml @@ -24,7 +24,7 @@ spec: containers: - args: - slurmdbd - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} name: slurmdbd ports: - containerPort: 6819 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index cd9d34d..9baeb26 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e replicas: slurmd: 2 From bd524f02b85d82ddfc08bb2844b51a28da923d6b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 12:51:54 +0000 Subject: [PATCH 021/103] make container image runnable via docker/podman for debugging --- docker-entrypoint.sh | 31 
++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 23ad303..fe97f57 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -1,17 +1,19 @@ #!/bin/bash set -euo pipefail -chown root:root /home -chmod 755 /home +function start_munge(){ -cp /tempmounts/munge.key /etc/munge/munge.key -chown munge:munge /etc/munge/munge.key -chmod 600 /etc/munge/munge.key + echo "--> Copying MUNGE key ..." + cp /tempmounts/munge.key /etc/munge/munge.key + chown munge:munge /etc/munge/munge.key + chmod 600 /etc/munge/munge.key + + echo "---> Starting the MUNGE Authentication service (munged) ..." + gosu munge /usr/sbin/munged "$@" +} if [ "$1" = "slurmdbd" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." @@ -34,8 +36,8 @@ fi if [ "$1" = "slurmctld" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged + + start_munge echo "---> Waiting for slurmdbd to become active before starting slurmctld ..." @@ -65,8 +67,7 @@ then ulimit -n 131072 ulimit -a - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged + start_munge echo "---> Waiting for slurmctld to become active before starting slurmd..." @@ -99,16 +100,12 @@ then ssh-keygen -A /usr/sbin/sshd - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged -F - echo "---> MUNGE Complete" + start_munge --foreground fi if [ "$1" = "check-queue-hook" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." 
- gosu munge /usr/sbin/munged - echo "---> MUNGE Complete" + start_munge RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) From 9297a3e836ffb9ef7d8799e191aa6b835253e76c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 12:57:44 +0000 Subject: [PATCH 022/103] change munge key mount mode --- docker-entrypoint.sh | 1 - slurm-cluster-chart/templates/login-deployment.yaml | 1 + slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 1 + slurm-cluster-chart/templates/slurmd-deployment.yaml | 1 + slurm-cluster-chart/templates/slurmdbd-deployment.yaml | 2 +- 5 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index fe97f57..5ac9621 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -6,7 +6,6 @@ function start_munge(){ echo "--> Copying MUNGE key ..." cp /tempmounts/munge.key /etc/munge/munge.key chown munge:munge /etc/munge/munge.key - chmod 600 /etc/munge/munge.key echo "---> Starting the MUNGE Authentication service (munged) ..." 
gosu munge /usr/sbin/munged "$@" diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 781588d..da20e58 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -62,6 +62,7 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 - name: authorized-keys configMap: name: {{ .Values.configmaps.authorizedKeys }} diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 27128cd..2ba90dd 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -55,3 +55,4 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index 81ae1c3..9a04f18 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -61,3 +61,4 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 diff --git a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml index d9ebf4a..67e6bd8 100644 --- a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml @@ -57,4 +57,4 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} - \ No newline at end of file + defaultMode: 0400 From 8e15262474157c7f44a36512bf77310acf26b6b6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 13:05:46 +0000 Subject: [PATCH 023/103] remove nonstandard tempmounts directory --- docker-entrypoint.sh | 6 +++--- 
slurm-cluster-chart/templates/check-jobs-finished-hook.yaml | 2 +- slurm-cluster-chart/templates/login-deployment.yaml | 4 ++-- slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 2 +- slurm-cluster-chart/templates/slurmd-deployment.yaml | 2 +- slurm-cluster-chart/templates/slurmdbd-deployment.yaml | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 5ac9621..680eab2 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -4,7 +4,7 @@ set -euo pipefail function start_munge(){ echo "--> Copying MUNGE key ..." - cp /tempmounts/munge.key /etc/munge/munge.key + cp /tmp/munge.key /etc/munge/munge.key chown munge:munge /etc/munge/munge.key echo "---> Starting the MUNGE Authentication service (munged) ..." @@ -16,7 +16,7 @@ then echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." - cp /tempmounts/slurmdbd.conf /etc/slurm/slurmdbd.conf + cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf chown slurm:slurm /etc/slurm/slurmdbd.conf chmod 600 /etc/slurm/slurmdbd.conf @@ -85,7 +85,7 @@ if [ "$1" = "login" ] then mkdir -p /home/rocky/.ssh - cp tempmounts/authorized_keys /home/rocky/.ssh/authorized_keys + cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys echo "---> Setting permissions for user home directories" cd /home diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml index f1908f8..58cac40 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml @@ -19,7 +19,7 @@ spec: args: - check-queue-hook volumeMounts: - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - mountPath: /etc/slurm/slurm.conf diff --git a/slurm-cluster-chart/templates/login-deployment.yaml 
b/slurm-cluster-chart/templates/login-deployment.yaml index da20e58..758d69c 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -34,14 +34,14 @@ spec: - mountPath: /etc/slurm/slurm.conf name: slurm-config-volume subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - mountPath: /etc/ssh/sshd_config subPath: sshd_config name: sshd-config-configmap - name: authorized-keys - mountPath: /tempmounts/authorized_keys + mountPath: /tmp/authorized_keys subPath: authorized_keys resources: {} hostname: login diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 2ba90dd..98e68cd 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -33,7 +33,7 @@ spec: - mountPath: /etc/slurm/slurm.conf name: slurm-config-volume subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - mountPath: /var/spool/slurmctld diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index 9a04f18..71ddc94 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -42,7 +42,7 @@ spec: subPath: slurm.conf - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key securityContext: diff --git a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml index 67e6bd8..b505888 100644 --- a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml @@ -33,10 +33,10 @@ spec: - 
mountPath: /etc/slurm/slurm.conf name: slurm-config-volume subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /tempmounts/slurmdbd.conf + - mountPath: /tmp/slurmdbd.conf name: dbd-config-volume subPath: slurmdbd.conf env: From 589ee8f6fc7154c77bd38b53fd31c989d274c46e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 13:17:15 +0000 Subject: [PATCH 024/103] fix slurmdbd munge startup --- docker-entrypoint.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 680eab2..4bf4d32 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -3,7 +3,7 @@ set -euo pipefail function start_munge(){ - echo "--> Copying MUNGE key ..." + echo "---> Copying MUNGE key ..." cp /tmp/munge.key /etc/munge/munge.key chown munge:munge /etc/munge/munge.key @@ -14,6 +14,8 @@ function start_munge(){ if [ "$1" = "slurmdbd" ] then + start_munge + echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." 
cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf From 5301249a957e5104cf48cf2dafde9b9c7de3aa44 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 13:24:21 +0000 Subject: [PATCH 025/103] bump slurm image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 9baeb26..d0296be 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:c12d04e +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:589ee8f replicas: slurmd: 2 From 7b3fc7dff1d347f5b60fe794f18369588fdd01f7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 13:33:51 +0000 Subject: [PATCH 026/103] tidy home dir ownership/perms --- docker-entrypoint.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 4bf4d32..a4ee0bf 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -90,14 +90,16 @@ then cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys echo "---> Setting permissions for user home directories" - cd /home - for DIR in */; - do USER_TO_SET=$( echo $DIR | sed "s/.$//" ) && (chown -R $USER_TO_SET:$USER_TO_SET $USER_TO_SET || echo "Failed to take ownership of $USER_TO_SET") \ - && (chmod 700 /home/$USER_TO_SET/.ssh || echo "Couldn't set permissions for .ssh directory for $USER_TO_SET") \ - && (chmod 600 /home/$USER_TO_SET/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $USER_TO_SET"); + pushd /home > /dev/null + for DIR in * + do + chown -R $DIR:$DIR $DIR || echo "Failed to change ownership of $DIR" + chmod 700 $DIR/.ssh || echo "Couldn't set permissions for .ssh/ directory of $DIR" + chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $USER_TO_SET" done - echo "---> Complete" - echo "Starting 
sshd" + popd > /dev/null + + echo "---> Starting sshd" ssh-keygen -A /usr/sbin/sshd From ac20b7b0f0ae6659271d1fe054736ee3d2336223 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 13:44:18 +0000 Subject: [PATCH 027/103] bump slurm image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index d0296be..cdc6186 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:589ee8f +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7b3fc7d replicas: slurmd: 2 From 1a450cbd64eedc99ef58d928d72fd041050736f6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Jul 2023 15:04:00 +0000 Subject: [PATCH 028/103] fix hook mungekey --- slurm-cluster-chart/templates/check-jobs-finished-hook.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml index 58cac40..8687814 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml @@ -29,6 +29,7 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} From 0e766042d3afd4c3c621082f77f4705809531a30 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 09:41:22 +0000 Subject: [PATCH 029/103] make slurm.conf auto-updating --- slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 98e68cd..89ca613 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ 
b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -30,9 +30,8 @@ spec: volumeMounts: - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key From 88c2c51a68db7139ccaa9e0f3af2342959b3df88 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 10:34:05 +0000 Subject: [PATCH 030/103] debug: remove slurmdbd startup code --- docker-entrypoint.sh | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index a4ee0bf..be699e2 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -16,23 +16,23 @@ then start_munge - echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." - - cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf - echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf - chown slurm:slurm /etc/slurm/slurmdbd.conf - chmod 600 /etc/slurm/slurmdbd.conf - { - . /etc/slurm/slurmdbd.conf - until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null - do - echo "-- Waiting for database to become active ..." - sleep 2 - done - } - echo "-- Database is now active ..." - - exec gosu slurm /usr/sbin/slurmdbd -Dvvv + # echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." + + # cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf + # echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf + # chown slurm:slurm /etc/slurm/slurmdbd.conf + # chmod 600 /etc/slurm/slurmdbd.conf + # { + # . /etc/slurm/slurmdbd.conf + # until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null + # do + # echo "-- Waiting for database to become active ..." + # sleep 2 + # done + # } + # echo "-- Database is now active ..." 
+ + # exec gosu slurm /usr/sbin/slurmdbd -Dvvv fi if [ "$1" = "slurmctld" ] From cb6ea9fd0f498388b1b3b5b1ef3f4e6d9ef0b69a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 10:41:52 +0000 Subject: [PATCH 031/103] guard exec-ing container arg --- docker-entrypoint.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index be699e2..f335846 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -33,9 +33,8 @@ then # echo "-- Database is now active ..." # exec gosu slurm /usr/sbin/slurmdbd -Dvvv -fi -if [ "$1" = "slurmctld" ] +elif [ "$1" = "slurmctld" ] then start_munge @@ -58,9 +57,8 @@ then else exec gosu slurm /usr/sbin/slurmctld -i -Dvvv fi -fi -if [ "$1" = "slurmd" ] +elif [ "$1" = "slurmd" ] then echo "---> Set shell resource limits ..." ulimit -l unlimited @@ -81,9 +79,8 @@ then echo "---> Starting the Slurm Node Daemon (slurmd) ..." exec /usr/sbin/slurmd -F -Dvvv -fi -if [ "$1" = "login" ] +elif [ "$1" = "login" ] then mkdir -p /home/rocky/.ssh @@ -104,9 +101,8 @@ then /usr/sbin/sshd start_munge --foreground -fi -if [ "$1" = "check-queue-hook" ] +elif [ "$1" = "check-queue-hook" ] then start_munge @@ -118,6 +114,7 @@ then else exit 1 fi -fi -exec "$@" +else: + exec "$@" +fi From 1f53be944156f13b5e20548349ad505d0966a365 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 10:46:24 +0000 Subject: [PATCH 032/103] provide debug CMD --- docker-entrypoint.sh | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index f335846..a9f2e8f 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -16,23 +16,23 @@ then start_munge - # echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." 
- - # cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf - # echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf - # chown slurm:slurm /etc/slurm/slurmdbd.conf - # chmod 600 /etc/slurm/slurmdbd.conf - # { - # . /etc/slurm/slurmdbd.conf - # until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null - # do - # echo "-- Waiting for database to become active ..." - # sleep 2 - # done - # } - # echo "-- Database is now active ..." - - # exec gosu slurm /usr/sbin/slurmdbd -Dvvv + echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." + + cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf + echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf + chown slurm:slurm /etc/slurm/slurmdbd.conf + chmod 600 /etc/slurm/slurmdbd.conf + { + . /etc/slurm/slurmdbd.conf + until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null + do + echo "-- Waiting for database to become active ..." + sleep 2 + done + } + echo "-- Database is now active ..." + + exec gosu slurm /usr/sbin/slurmdbd -Dvvv elif [ "$1" = "slurmctld" ] then @@ -115,6 +115,10 @@ then exit 1 fi -else: +elif [ "$1" = "debug" ] +then + start_munge --foreground + +else exec "$@" fi From 2d509af699dba16910d2dd40669d59704a56ce7e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 11:44:59 +0000 Subject: [PATCH 033/103] make slurm.conf autoupdating for all except slurmdbd --- README.md | 16 +++++++++++++--- .../templates/check-jobs-finished-hook.yaml | 3 +-- .../templates/login-deployment.yaml | 3 +-- .../templates/slurmd-deployment.yaml | 3 +-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 34abe12..7870457 100644 --- a/README.md +++ b/README.md @@ -132,11 +132,21 @@ Note: The mpirun script assumes you are running as user 'rocky'. 
If you are runn ### Changes to config files -To guarantee changes to config files are propagated to the cluster, use +Changes to the Slurm configuration in `slurm-cluster-chart/files/slurm.conf` will be propagated (it may take a few seconds) to `/etc/slurm/slurm.conf` for all pods except the `slurmdbd` pod by running + +```console +helm upgrade slurm-cluster-chart/ +``` + +The new Slurm configuration can then be read by running `scontrol reconfigure` as root inside a Slurm pod. The [slurm.conf documentation](https://slurm.schedmd.com/slurm.conf.html) notes that some changes require a restart of all daemons, which here requires redeploying the Slurm pods as described below. + +Changes to other configuration files (e.g. Munge key etc) require a redeploy of the appropriate pods. + +To redeploy pods use: ```console -kubectl rollout restart deployment +kubectl rollout restart deployment ``` -Generally restarts to `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required +Generally restarts to `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required. 
### Changes to secrets diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml index 8687814..79e93eb 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml @@ -22,9 +22,8 @@ spec: - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf volumes: - name: munge-key-secret secret: diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 758d69c..48f8f17 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -31,9 +31,8 @@ spec: volumeMounts: - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index 71ddc94..e5858d5 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -37,9 +37,8 @@ spec: - containerPort: 6818 resources: {} volumeMounts: - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: {{ .Values.nfs.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key From 69e848d93bae1ef06cd63a2454f6c20ebedd7249 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 26 Jul 2023 11:47:07 +0000 Subject: [PATCH 034/103] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 
cdc6186..d1dd881 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7b3fc7d +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:aa80f98 replicas: slurmd: 2 From 60b7b3a4779b820888babdd2d4e9be8ef3fc5377 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 14:22:13 +0000 Subject: [PATCH 035/103] move docker build into directory --- .github/workflows/build-containers.yml | 6 +++--- Dockerfile => image/Dockerfile | 0 docker-entrypoint.sh => image/docker-entrypoint.sh | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename Dockerfile => image/Dockerfile (100%) rename docker-entrypoint.sh => image/docker-entrypoint.sh (100%) diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index db15721..7ca4c39 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -3,9 +3,8 @@ on: push: paths: - .github/workflows/build-containers.yml - - Dockerfile - - docker-entrypoint.sh - workflow_dispatch: + - image/** + workflow_dispatch: jobs: build_push_api: @@ -49,6 +48,7 @@ jobs: with: provenance: false push: true + context: image/ tags: ${{ steps.image-meta.outputs.tags }} labels: ${{ steps.image-meta.outputs.labels }} cache-from: type=local,src=/tmp/.buildx-cache diff --git a/Dockerfile b/image/Dockerfile similarity index 100% rename from Dockerfile rename to image/Dockerfile diff --git a/docker-entrypoint.sh b/image/docker-entrypoint.sh similarity index 100% rename from docker-entrypoint.sh rename to image/docker-entrypoint.sh From 7aa77f743d2abb8bd88e09386a67b472444a3efa Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 14:29:31 +0000 Subject: [PATCH 036/103] pass slurm daemon options from container args --- image/docker-entrypoint.sh | 8 ++++---- slurm-cluster-chart/templates/slurmctld-statefulset.yaml | 1 + slurm-cluster-chart/templates/slurmd-deployment.yaml | 2 ++ 
slurm-cluster-chart/templates/slurmdbd-deployment.yaml | 1 + 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index a9f2e8f..9bfa05c 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -32,7 +32,7 @@ then } echo "-- Database is now active ..." - exec gosu slurm /usr/sbin/slurmdbd -Dvvv + exec gosu slurm /usr/sbin/slurmdbd -D "${@:2}" elif [ "$1" = "slurmctld" ] then @@ -53,9 +53,9 @@ then echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." if /usr/sbin/slurmctld -V | grep -q '17.02' ; then - exec gosu slurm /usr/sbin/slurmctld -Dvvv + exec gosu slurm /usr/sbin/slurmctld -D "${@:2}" else - exec gosu slurm /usr/sbin/slurmctld -i -Dvvv + exec gosu slurm /usr/sbin/slurmctld -i -D "${@:2}" fi elif [ "$1" = "slurmd" ] @@ -78,7 +78,7 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." - exec /usr/sbin/slurmd -F -Dvvv + exec /usr/sbin/slurmd -D "${@:2}" elif [ "$1" = "login" ] then diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml index 89ca613..dc0bf90 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld-statefulset.yaml @@ -22,6 +22,7 @@ spec: containers: - args: - slurmctld + - -vvv image: {{ .Values.slurmImage }} name: slurmctld ports: diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-deployment.yaml index e5858d5..4c2396e 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-deployment.yaml @@ -31,6 +31,8 @@ spec: containers: - args: - slurmd + - -F + - -vvv image: {{ .Values.slurmImage }} name: slurmd ports: diff --git a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml index b505888..db6bdb5 100644 --- 
a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml @@ -24,6 +24,7 @@ spec: containers: - args: - slurmdbd + - -vvv image: {{ .Values.slurmImage }} name: slurmdbd ports: From d66c52b358454e6e0e7bfe3d9bc7def3b3b1825a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 14:30:30 +0000 Subject: [PATCH 037/103] fix NFS-mounted /home permissions --- image/docker-entrypoint.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 9bfa05c..76b3989 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -83,6 +83,9 @@ then elif [ "$1" = "login" ] then + chown root:root /home + chmod 755 /home + mkdir -p /home/rocky/.ssh cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys From 6527cdafeaa3a0ab65c55edc1d19632c6265d5c5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 27 Jul 2023 14:45:27 +0000 Subject: [PATCH 038/103] bump image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index d1dd881..772e125 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:aa80f98 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d66c52b replicas: slurmd: 2 From abc57f51b6a6cff1e58f4ea53124c2004a9a6ee9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 28 Jul 2023 07:56:39 +0000 Subject: [PATCH 039/103] don't default to 1x CPU --- slurm-cluster-chart/files/slurm.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index 4c072a7..a10c12b 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -52,7 +52,7 @@ CommunicationParameters=NoAddrCache # NODES MaxNodeCount=10 
-NodeName=slurmd-[0-9] State=FUTURE +NodeName=slurmd-[0-9] State=FUTURE CPUs=4 # PARTITIONS PartitionName=all Default=yes Nodes=ALL From d3f01e4ff482f9f1d3cface1f18f2cd02d886abe Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:18:16 +0100 Subject: [PATCH 040/103] Update README.md Co-authored-by: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 7870457..200fa60 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,13 @@ To redeploy pods use: ```console kubectl rollout restart deployment ``` +for the `slurmdbd`, `login` and `mysql` pods and + +``` +kubectl rollout restart statefulset +``` +for the `slurmd` and `slurmctld` pods +``` Generally restarts to `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required. ### Changes to secrets From 2775fae80ca75d31d701965c646798a9bcd09bf2 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:46:50 +0100 Subject: [PATCH 041/103] Removed suggestion typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 200fa60..2edf8a0 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ for the `slurmdbd`, `login` and `mysql` pods and kubectl rollout restart statefulset ``` for the `slurmd` and `slurmctld` pods -``` + Generally restarts to `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required. 
### Changes to secrets From 7f4d64e4ac9d8fc3e9f53a3fc69ca8ab5182034f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 8 Aug 2023 13:59:26 +0000 Subject: [PATCH 042/103] fix refactor of USER_TO_SET --- image/docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 76b3989..132c554 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -95,7 +95,7 @@ then do chown -R $DIR:$DIR $DIR || echo "Failed to change ownership of $DIR" chmod 700 $DIR/.ssh || echo "Couldn't set permissions for .ssh/ directory of $DIR" - chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $USER_TO_SET" + chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $DIR" done popd > /dev/null From 1e42e753ef119074c6d05f60e2735384db05c388 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 15:23:47 +0100 Subject: [PATCH 043/103] Updated tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 772e125..7873e5c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d66c52b +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e replicas: slurmd: 2 From f52e91848584aa261dda5ed5cb2c0b4211d0dc7a Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 16:47:25 +0100 Subject: [PATCH 044/103] Fixed munge --- image/docker-entrypoint.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 9e6b085..14b511c 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -106,7 +106,7 @@ then chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $DIR" 
done popd > /dev/null - + echo "---> Complete" echo "---> Starting sshd" cp /tempmounts/etc/ssh/* /etc/ssh/ @@ -116,9 +116,7 @@ then chmod 600 /etc/ssh/ssh_host_rsa_key /usr/sbin/sshd - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged - echo "---> MUNGE Complete" + start_munge echo "---> Setting up self ssh capabilities for OOD" ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From 303e6f0de44ba692c2b01d621596e4cb4cb7029b Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 8 Aug 2023 16:51:13 +0100 Subject: [PATCH 045/103] Updated tag --- slurm-cluster-chart/values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index f40ee09..6a394cb 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,5 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -#OUTDATED, CHANGE AFTER REBUILD +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 replicas: slurmd: 2 From 7ca06682d58e3bd05ff32bbbd5e719036c686f2a Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 11:59:23 +0100 Subject: [PATCH 046/103] Moved database auth to helm templating --- generate-secrets.sh | 6 ------ slurm-cluster-chart/templates/database-auth-secret.yaml | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 slurm-cluster-chart/templates/database-auth-secret.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index e98b97e..b4cc01c 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,11 +1,5 @@ #!/bin/bash -kubectl create secret generic database-auth-secret \ ---dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml new file mode 100644 index 0000000..27c4e3f --- /dev/null +++ 
b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Secret +metadata: + name: database-auth-secret +data: + password: {{ randAlphaNum 32 | b64enc }} \ No newline at end of file From 656aa6c058ca0cc337ed72ac83879098110f4a34 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 12:16:08 +0100 Subject: [PATCH 047/103] Moved munge key generation to helm --- generate-secrets.sh | 6 ------ slurm-cluster-chart/templates/munge-key-secret.yaml | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 slurm-cluster-chart/templates/munge-key-secret.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index b4cc01c..f64c116 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,11 +1,5 @@ #!/bin/bash -kubectl create secret generic munge-key-secret \ ---dry-run=client \ ---from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ --o yaml | \ -kubectl apply -f - - mkdir -p ./temphostkeys/etc/ssh ssh-keygen -A -f ./temphostkeys kubectl create secret generic host-keys-secret \ diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml new file mode 100644 index 0000000..153b5fe --- /dev/null +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Secret +metadata: + name: munge-key-secret +data: + munge.key: {{ randAscii 128 | b64enc }} \ No newline at end of file From a9003f7a17aba22cc1c0e373f8fbfa6e73c6f742 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 13:37:40 +0100 Subject: [PATCH 048/103] Moved OOD password to values/yaml --- generate-secrets.sh | 13 ------------- slurm-cluster-chart/templates/login-deployment.yaml | 5 +---- slurm-cluster-chart/values.yaml | 5 ++++- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index f64c116..b6d4267 100755 --- a/generate-secrets.sh +++ 
b/generate-secrets.sh @@ -8,16 +8,3 @@ kubectl create secret generic host-keys-secret \ -o yaml | \ kubectl apply -f - rm -rf ./temphostkeys - -OOD_PASS=$(tr -dc 'A-Za-z0-9' Date: Thu, 10 Aug 2023 14:15:09 +0100 Subject: [PATCH 049/103] Random secrets now generated pre-install only --- slurm-cluster-chart/templates/database-auth-secret.yaml | 5 ++++- slurm-cluster-chart/templates/munge-key-secret.yaml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml index 27c4e3f..6133576 100644 --- a/slurm-cluster-chart/templates/database-auth-secret.yaml +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -2,5 +2,8 @@ apiVersion: v1 kind: Secret metadata: name: database-auth-secret + annotations: + helm.sh/hook: pre-install + helm.sh/resource-policy: keep data: - password: {{ randAlphaNum 32 | b64enc }} \ No newline at end of file + password: {{ randAlphaNum 32 | b64enc }} diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml index 153b5fe..65825d6 100644 --- a/slurm-cluster-chart/templates/munge-key-secret.yaml +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -2,5 +2,8 @@ apiVersion: v1 kind: Secret metadata: name: munge-key-secret + annotations: + helm.sh/hook: pre-install + helm.sh/resource-policy: keep data: - munge.key: {{ randAscii 128 | b64enc }} \ No newline at end of file + munge.key: {{ randAscii 128 | b64enc }} From e0514f6c47bfc000264708d2be82151a805a16c1 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 14:31:46 +0100 Subject: [PATCH 050/103] Added kubectl to image --- image/Dockerfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/image/Dockerfile b/image/Dockerfile index 855a1cc..dceaeeb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,6 +18,14 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && 
yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ + && cat < Date: Thu, 10 Aug 2023 14:35:29 +0100 Subject: [PATCH 051/103] Fixed Dockerfile --- image/Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index dceaeeb..bcc3fdb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,13 +18,13 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 14:46:31 +0100 Subject: [PATCH 052/103] Testing with separate command --- image/Dockerfile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index bcc3fdb..14ad93b 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,6 +9,15 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +RUN cat < Date: Thu, 10 Aug 2023 14:52:37 +0100 Subject: [PATCH 053/103] Revert "Testing with separate command" This reverts commit cd0d1afb5cfaae3bd234dccb26c673435da21fd0. 
--- image/Dockerfile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/image/Dockerfile b/image/Dockerfile index 14ad93b..bcc3fdb 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,15 +9,6 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 -RUN cat < Date: Thu, 10 Aug 2023 14:53:58 +0100 Subject: [PATCH 054/103] Removed sudo from dockerfile --- image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/Dockerfile b/image/Dockerfile index bcc3fdb..ee14ea3 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -18,7 +18,7 @@ RUN set -ex \ && yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 15:00:29 +0100 Subject: [PATCH 055/103] Moved kubernetes repo to separate file --- image/Dockerfile | 9 ++------- image/kubernetes.repo | 6 ++++++ 2 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 image/kubernetes.repo diff --git a/image/Dockerfile b/image/Dockerfile index ee14ea3..9874e58 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo + RUN set -ex \ && yum makecache \ && yum -y update \ @@ -19,13 +21,6 @@ RUN set -ex \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ && cat < Date: Thu, 10 Aug 2023 15:02:45 +0100 Subject: [PATCH 056/103] Fixed leftover commands --- image/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/image/Dockerfile b/image/Dockerfile index 9874e58..0d00a6a 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -20,7 +20,6 @@ RUN set -ex \ && 
yum -y module enable ruby:2.7 nodejs:14 \ && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ && yum -y module install ruby nodejs \ - && cat < Date: Thu, 10 Aug 2023 16:18:53 +0100 Subject: [PATCH 057/103] Updated tag and created service account to modify host-keys-secret --- .../templates/secret-generator-role.yaml | 22 +++++++++++++++++++ .../secret-generator-serviceaccount.yaml | 10 +++++++++ slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 slurm-cluster-chart/templates/secret-generator-role.yaml create mode 100644 slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml new file mode 100644 index 0000000..67de05e --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -0,0 +1,22 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: secret-generator-role +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["secrets"] + verbs: ["get","apply","create", "patch"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: secret-generator-rolebinding +subjects: + - kind: ServiceAccount + name: secret-generator-account +roleRef: + kind: Role + name: secret-generator-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml new file mode 100644 index 0000000..6510cb9 --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: secret-generator-account + annotations: + "kubernetes.io/enforce-mountable-secrets": "true" +automountServiceAccountToken: True +secrets: + - name: 
host-keys-secret + \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index e2aed84..c0b0360 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:763de73 replicas: slurmd: 2 From d58f819e1ed9e46f9cd71e3432c20928f2922887 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:24:38 +0100 Subject: [PATCH 058/103] Added entrypoint for host key generation hook --- image/docker-entrypoint.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 14b511c..01d3519 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -148,6 +148,18 @@ then exit 1 fi +elif [ "$1" = "generate-keys-hook" ] +then + mkdir -p ./temphostkeys/etc/ssh + ssh-keygen -A -f ./temphostkeys + kubectl create secret generic host-keys-secret \ + --dry-run=client \ + --from-file=./temphostkeys/etc/ssh \ + -o yaml | \ + kubectl apply -f - + + exit 0 + elif [ "$1" = "debug" ] then start_munge --foreground From 16ee05dbb6ef16c234fe1b491a904e4190693a14 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:44:42 +0100 Subject: [PATCH 059/103] Added pre-install hook to generate host keys --- .../templates/generate-keys-hook.yaml | 22 +++++++++++++++++++ .../templates/secret-generator-role.yaml | 6 +++++ .../secret-generator-serviceaccount.yaml | 3 ++- slurm-cluster-chart/values.yaml | 2 +- 4 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 slurm-cluster-chart/templates/generate-keys-hook.yaml diff --git a/slurm-cluster-chart/templates/generate-keys-hook.yaml b/slurm-cluster-chart/templates/generate-keys-hook.yaml new file mode 100644 index 0000000..c05e7f2 --- /dev/null +++ b/slurm-cluster-chart/templates/generate-keys-hook.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job 
+metadata: + name: generate-keys-hook + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "3" +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: generate-keys-hook + spec: + serviceAccountName: secret-generator-account + restartPolicy: Never + containers: + - name: generate-keys-hook + image: {{ .Values.slurmImage }} + args: + - generate-keys-hook diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml index 67de05e..da914be 100644 --- a/slurm-cluster-chart/templates/secret-generator-role.yaml +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -2,6 +2,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: secret-generator-role + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "1" rules: - apiGroups: [""] # "" indicates the core API group resources: ["secrets"] @@ -13,6 +16,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: secret-generator-rolebinding + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "2" subjects: - kind: ServiceAccount name: secret-generator-account diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml index 6510cb9..ce860b0 100644 --- a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -4,7 +4,8 @@ metadata: name: secret-generator-account annotations: "kubernetes.io/enforce-mountable-secrets": "true" + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "0" automountServiceAccountToken: True secrets: - name: host-keys-secret - \ No newline at end of file diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index c0b0360..0421371 100644 --- 
a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:763de73 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d58f819 replicas: slurmd: 2 From 15b07a671b8e49a4e0d73bd6899a4290510bc065 Mon Sep 17 00:00:00 2001 From: Will Date: Thu, 10 Aug 2023 16:47:52 +0100 Subject: [PATCH 060/103] Removed generate-secrets.sh --- generate-secrets.sh | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100755 generate-secrets.sh diff --git a/generate-secrets.sh b/generate-secrets.sh deleted file mode 100755 index b6d4267..0000000 --- a/generate-secrets.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -mkdir -p ./temphostkeys/etc/ssh -ssh-keygen -A -f ./temphostkeys -kubectl create secret generic host-keys-secret \ ---dry-run=client \ ---from-file=./temphostkeys/etc/ssh \ --o yaml | \ -kubectl apply -f - -rm -rf ./temphostkeys From 4b8e114aed5468a9cc74b68b0118272c194279cc Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 09:52:16 +0100 Subject: [PATCH 061/103] Now option to give public key explicitly through values.yaml --- .../templates/helm-authorized-keys-configmap.yaml | 9 +++++++++ slurm-cluster-chart/templates/login-deployment.yaml | 6 +++++- slurm-cluster-chart/values.yaml | 4 +++- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml diff --git a/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml new file mode 100644 index 0000000..75ad249 --- /dev/null +++ b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml @@ -0,0 +1,9 @@ +#Only applied if sshPublicKey provided in values.yaml, if not assumes you have run publish-keys.sh prior to helm release +{{ if .Values.sshPublicKey }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: helm-authorized-keys-configmap +data: + authorized_keys: {{ 
.Values.sshPublicKey }} +{{ end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 37fb46b..0984560 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -80,7 +80,11 @@ spec: defaultMode: 0400 - name: authorized-keys configMap: - name: {{ .Values.configmaps.authorizedKeys }} + {{ if .Values.sshPublicKey }} + name: helm-authorized-keys-configmap + {{ else }} + name: authorized-keys-configmap + {{ end }} - name: cluster-config configMap: name: cluster-config diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 0421371..d7fc033 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -13,11 +13,13 @@ sqlImage: mariadb:10.10 databaseStorage: 100Mi configmaps: - authorizedKeys: authorized-keys-configmap slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap +# If let undefined, assumes you have run publish-keys.sh to publish your public key prior to deployment +sshPublicKey: + secrets: databaseAuth: database-auth-secret mungeKey: munge-key-secret From c7a724886ba2aaea3b65c4d228d3391717d8c0d7 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 10:42:00 +0100 Subject: [PATCH 062/103] Added custom packaging to workflow --- .github/workflows/publish-helm-chart.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..1806817 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,9 +1,6 @@ name: Release Charts -on: - push: - branches: - - main +on: push jobs: release: @@ -17,6 +14,11 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + submodules: true + + - name: Get SemVer version for current commit + id: semver + uses: 
stackhpc/github-actions/semver@master - name: Configure Git run: | @@ -28,10 +30,15 @@ jobs: env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: "Package Chart" + run: | + helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} + - name: Run chart-releaser uses: helm/chart-releaser-action@v1.5.0 with: charts_dir: . + skip_packaging: True env: CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" From 69122f7b503dfd55c21bad56d077e6eec0b957a9 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 10:46:19 +0100 Subject: [PATCH 063/103] Trying adding charts to cr packages --- .github/workflows/publish-helm-chart.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 1806817..8a6f4f7 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -32,7 +32,8 @@ jobs: - name: "Package Chart" run: | - helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} + mkdir -p .cr-release-packages + helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} --destination .cr-release-packages - name: Run chart-releaser uses: helm/chart-releaser-action@v1.5.0 From ca27405f537d3eff24f9d6201f0c614961ddaa7b Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:54:02 +0100 Subject: [PATCH 064/103] Added source in slurm-cluster-chart/files/httpd.conf Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/files/httpd.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/files/httpd.conf b/slurm-cluster-chart/files/httpd.conf index 6d3783a..248afb2 100644 --- a/slurm-cluster-chart/files/httpd.conf +++ b/slurm-cluster-chart/files/httpd.conf @@ -1,4 +1,4 @@ -# +# Modified from file installed by httpd package # This is the main Apache HTTP server 
configuration file. It contains the # configuration directives that give the server its instructions. # See for detailed information. From 1a3c3adb269f6a2161942fe3000d5eeadf30b022 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:54:25 +0100 Subject: [PATCH 065/103] Added source in slurm-cluster-chart/files/ood_portal.yaml Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/files/ood_portal.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 9be3295..d5227b2 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -1,3 +1,4 @@ +# Modified from file installed by ondemand package --- # # Portal configuration From 09d25127aa6a4825d772c444e74b08020497943d Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 11:29:43 +0100 Subject: [PATCH 066/103] Add Known Issues heading to start documenting these --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2edf8a0..11fe8b8 100644 --- a/README.md +++ b/README.md @@ -171,3 +171,5 @@ and then restart the other dependent deployments to propagate changes: ```console kubectl rollout restart deployment slurmd slurmctld login slurmdbd ``` + +# Known Issues From 9979627bbe7c4a5f993a23ce5ca3ba7aacf17f21 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 12:24:59 +0100 Subject: [PATCH 067/103] Convert Rook NFS to Helm chart - Adds Rook NFS Helm chart as dependency of Slurm cluster chart - Refactors main values file to allow additional customisation - Adds cleanup job as pre-delete hook to fix uninstall behaviour --- .gitignore | 3 + nfs/deploy-nfs.sh | 11 ---- nfs/pvc.yaml | 11 ---- nfs/sc.yaml | 13 ----- nfs/teardown-nfs.sh | 16 ------ rooknfs/Chart.yaml | 4 ++ rooknfs/README.md | 0 {nfs => 
rooknfs/crds}/crds.yaml | 0 {nfs => rooknfs/templates}/nfs.yaml | 18 +++--- {nfs => rooknfs/templates}/operator.yaml | 12 ++-- {nfs => rooknfs/templates}/rbac.yaml | 10 ++-- rooknfs/templates/sc.yaml | 17 ++++++ rooknfs/values.yaml | 30 ++++++++++ slurm-cluster-chart/Chart.yaml | 7 ++- .../templates/hooks/pre-delete.yaml | 55 +++++++++++++++++++ .../{login-deployment.yaml => login.yaml} | 8 +-- slurm-cluster-chart/templates/pvc.yaml | 14 +++++ ...rmctld-statefulset.yaml => slurmctld.yaml} | 6 +- .../{slurmd-deployment.yaml => slurmd.yaml} | 9 +-- slurm-cluster-chart/values.yaml | 50 +++++++++++++++-- 20 files changed, 211 insertions(+), 83 deletions(-) create mode 100644 .gitignore delete mode 100755 nfs/deploy-nfs.sh delete mode 100644 nfs/pvc.yaml delete mode 100644 nfs/sc.yaml delete mode 100755 nfs/teardown-nfs.sh create mode 100644 rooknfs/Chart.yaml create mode 100644 rooknfs/README.md rename {nfs => rooknfs/crds}/crds.yaml (100%) rename {nfs => rooknfs/templates}/nfs.yaml (61%) rename {nfs => rooknfs/templates}/operator.yaml (91%) rename {nfs => rooknfs/templates}/rbac.yaml (88%) create mode 100644 rooknfs/templates/sc.yaml create mode 100644 rooknfs/values.yaml create mode 100644 slurm-cluster-chart/templates/hooks/pre-delete.yaml rename slurm-cluster-chart/templates/{login-deployment.yaml => login.yaml} (90%) create mode 100644 slurm-cluster-chart/templates/pvc.yaml rename slurm-cluster-chart/templates/{slurmctld-statefulset.yaml => slurmctld.yaml} (91%) rename slurm-cluster-chart/templates/{slurmd-deployment.yaml => slurmd.yaml} (88%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index b2d2f75..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on 
https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f nfs/crds.yaml -kubectl create -f nfs/operator.yaml -kubectl create -f nfs/rbac.yaml -kubectl create -f nfs/nfs.yaml -kubectl create -f nfs/sc.yaml -kubectl create -f nfs/pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/sc.yaml b/nfs/sc.yaml deleted file mode 100644 index 6f9e3ae..0000000 --- a/nfs/sc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - labels: - app: rook-nfs - name: rook-nfs-share1 -parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs -provisioner: nfs.rook.io/rook-nfs-provisioner -reclaimPolicy: Delete -volumeBindingMode: Immediate diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml -kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..83a2a11 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: An packaged installation of Rook NFS for Kubernetes. 
\ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..e69de29 diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 61% rename from nfs/nfs.yaml rename to rooknfs/templates/nfs.yaml index 742fa34..6fde553 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,32 +1,36 @@ +{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName}} + namespace: {{ .Values.serverNamespace }} spec: accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. 
persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- +{{- end }} + diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 91% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..4a1d542 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,15 @@ +{{- if .Values.enabled }} +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +22,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +108,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +136,5 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- +{{- end}} diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 88% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..b327740 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,14 +1,15 @@ +{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +52,10 @@ metadata: subjects: - kind: 
ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml new file mode 100644 index 0000000..0ad75fe --- /dev/null +++ b/rooknfs/templates/sc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enabled }} +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + labels: + app: rook-nfs + name: {{ .Values.storageClassName }} +parameters: + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} +provisioner: nfs.rook.io/rook-nfs-provisioner +reclaimPolicy: Delete +volumeBindingMode: Immediate +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..1961fa6 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,30 @@ +# Global flag for enabling/disabling all chart resources +# This is useful for allowing charts which use this chart +# as a dependency to toggle usage of this chart based on +# values in the parent chart +enabled: true + +# Name for the NFSServer resource created by rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume +storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out namespaces as +# of various components in the same way 
as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system \ No newline at end of file diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..4dad59b 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,9 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: 0.0.1 + repository: file://../rooknfs \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..8cdb1f3 --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 90% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 48f8f17..ca63392 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ 
.Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -29,7 +29,7 @@ spec: ports: - containerPort: 22 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -51,7 +51,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..c5d5955 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.rooknfs.enabled }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} +{{- end }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 91% rename from slurm-cluster-chart/templates/slurmctld-statefulset.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index dc0bf90..f919c5f 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -29,7 +29,7 @@ spec: - containerPort: 6817 resources: {} volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -45,7 +45,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - 
claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurmctld-state persistentVolumeClaim: claimName: var-spool-slurmctld diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd.yaml similarity index 88% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd.yaml index 4c2396e..4775748 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd + name: {{ .Values.slurmd.name }} spec: - replicas: {{ .Values.replicas.slurmd }} + replicas: {{ .Values.slurmd.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -41,7 +41,8 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - - mountPath: {{ .Values.nfs.mountPath }} + subPath: slurm.conf + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key name: munge-key-secret @@ -55,7 +56,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..eb9501c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,12 +1,52 @@ slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # StatefulSet resource name + name: slurmd + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - 
claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: &storageclassname slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacite of the R-W-M volume + capacity: &capacity 10Gi + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + storageClassName: *storageclassname + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # than Rook will create to provide the actual storage to + # the NFS server. Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + sqlImage: mariadb:10.10 From a4727da91d175bf1a6a45264104a6b1045a8940f Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 13:51:58 +0100 Subject: [PATCH 068/103] Removed quotes --- slurm-cluster-chart/templates/login-service.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index f5f8aa3..df8892d 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -11,11 +11,11 @@ spec: - name: ssh port: 22 targetPort: 22 - - name: "apache" + - name: apache port: 80 targetPort: 80 protocol: TCP - - name: "https" + - name: https port: 443 targetPort: 443 protocol: TCP From 
62c6f3431740bb80d744666bc627ab0f5d738c43 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 13:56:50 +0100 Subject: [PATCH 069/103] Testing without env file for shell --- image/docker-entrypoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 14b511c..c0f854d 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -125,8 +125,8 @@ then echo "---> Starting Apache Server" - mkdir --parents /etc/ood/config/apps/shell - env > /etc/ood/config/apps/shell/env + # mkdir --parents /etc/ood/config/apps/shell + # env > /etc/ood/config/apps/shell/env /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal From 4d90e24398aa3c8ab53d1c46e7f3eb83c0c30f8e Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 14:16:31 +0100 Subject: [PATCH 070/103] Moved rocky ssh generation to make purpose clearer --- image/docker-entrypoint.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index c0f854d..55bc66d 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -91,12 +91,6 @@ then mkdir -p /home/rocky/.ssh cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys - if [ -f /home/rocky/.ssh/id_rsa.pub ]; then - echo "ssh keys already found" - else - ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" - fi - echo "---> Setting permissions for user home directories" pushd /home > /dev/null for DIR in * @@ -119,6 +113,13 @@ then start_munge echo "---> Setting up self ssh capabilities for OOD" + + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + fi + ssh-keyscan localhost > /etc/ssh/ssh_known_hosts echo "" >> /home/rocky/.ssh/authorized_keys #Adding newline to avoid breaking authorized_keys file cat /home/rocky/.ssh/id_rsa.pub >> 
/home/rocky/.ssh/authorized_keys From 1a4a3e44ba23f0c2a1b817edbb2e17fff7476f74 Mon Sep 17 00:00:00 2001 From: Will Date: Fri, 11 Aug 2023 14:19:53 +0100 Subject: [PATCH 071/103] Updated tag --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 6a394cb..0ca35c9 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:f52e918 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:4d90e24 replicas: slurmd: 2 From edfdd7c1fe8e14e889f7632249c16b3bb580dcf3 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:26:24 +0100 Subject: [PATCH 072/103] Fix storageClassName templating typo --- slurm-cluster-chart/templates/pvc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index c5d5955..5e934ef 100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -5,7 +5,7 @@ kind: PersistentVolumeClaim metadata: name: {{ .Values.storage.claimName }} spec: - storageClassName: {{ .Values.storageClassName }} + storageClassName: {{ .Values.storage.storageClassName }} accessModes: - ReadWriteMany resources: From 4407fbe486a3b78bda85f93ac39fc9adda94d0f6 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 11 Aug 2023 14:55:20 +0100 Subject: [PATCH 073/103] Remove broken subPath spec --- slurm-cluster-chart/templates/slurmd.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index 4775748..ff13019 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -41,7 +41,6 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - mountPath: {{ 
.Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key From f9d4f9a95ea28c999cc80076f949a712735f8b45 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 13:53:22 +0100 Subject: [PATCH 074/103] Changed OOD key names --- .gitignore | 3 +++ slurm-cluster-chart/templates/login-deployment.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login-deployment.yaml index 0984560..64a6469 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login-deployment.yaml @@ -28,7 +28,7 @@ spec: name: login env: - name: ROCKY_OOD_PASS - value: {{ .Values.openOndemand.password }} + value: {{ .Values.openOnDemand.password }} ports: - containerPort: 22 - containerPort: 80 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index d7fc033..c555b98 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -25,5 +25,5 @@ secrets: mungeKey: munge-key-secret #OOD username is rocky -openOndemand: +openOnDemand: password: password From 2ac2fd5aae4a3cd7fb824662e87fdf9b6071c384 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 14 Aug 2023 13:57:03 +0100 Subject: [PATCH 075/103] Working Helm chart publisher workflow (#25) * Added custom packaging to workflow * Trying adding charts to cr packages * Now publishes rook chart * Temporarily removed slurm chart from publisher to publish initial rook chart to repo * Trying with new workflow and temporarily removing dependency * Re-added rook dependency * Added upterm 
debugging * Changed rooknfs version * Removed debug --- .github/workflows/publish-helm-chart.yml | 47 +++++++------------ rooknfs/values.yaml | 2 +- slurm-cluster-chart/Chart.yaml | 4 +- .../{ => hooks}/check-jobs-finished-hook.yaml | 0 4 files changed, 21 insertions(+), 32 deletions(-) rename slurm-cluster-chart/templates/{ => hooks}/check-jobs-finished-hook.yaml (100%) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..516e388 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,37 +1,26 @@ -name: Release Charts - -on: - push: - branches: - - main - +name: Publish charts +# Run the tasks on every push +on: push jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 + submodules: true - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: Get SemVer version for current commit + id: semver + uses: stackhpc/github-actions/semver@master - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 1961fa6..00a3e7f 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -27,4 +27,4 @@ operatorImage: rook/nfs:master # state indefinitely, so here we separate out namespaces as # of various components in the same way as the Rook docs serverNamespace: rook-nfs -systemNamespace: rook-nfs-system \ No newline at end of file +systemNamespace: rook-nfs-system diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 4dad59b..0177e24 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -25,5 +25,5 @@ appVersion: "1.16.0" dependencies: - name: rooknfs - version: 0.0.1 - repository: file://../rooknfs \ No newline at end of file + version: ">=0-0" + repository: file://../rooknfs diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml similarity index 100% rename from slurm-cluster-chart/templates/check-jobs-finished-hook.yaml rename to slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml From f25fe6ec0df1c4df5ad79c744570a7d7a28fb447 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 14:29:19 +0100 Subject: [PATCH 076/103] Removed resource policies --- slurm-cluster-chart/templates/database-auth-secret.yaml | 1 - slurm-cluster-chart/templates/munge-key-secret.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml index 6133576..1a1d6ea 100644 --- a/slurm-cluster-chart/templates/database-auth-secret.yaml +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -4,6 +4,5 @@ metadata: name: database-auth-secret annotations: helm.sh/hook: 
pre-install - helm.sh/resource-policy: keep data: password: {{ randAlphaNum 32 | b64enc }} diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml index 65825d6..df97e19 100644 --- a/slurm-cluster-chart/templates/munge-key-secret.yaml +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -4,6 +4,5 @@ metadata: name: munge-key-secret annotations: helm.sh/hook: pre-install - helm.sh/resource-policy: keep data: munge.key: {{ randAscii 128 | b64enc }} From af39470ad767c002050cc6be9dce364f0da7eb2f Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:04 +0100 Subject: [PATCH 077/103] Fix typo Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- rooknfs/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml index 83a2a11..b8abd25 100644 --- a/rooknfs/Chart.yaml +++ b/rooknfs/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v2 name: rooknfs version: 0.0.1 -description: An packaged installation of Rook NFS for Kubernetes. \ No newline at end of file +description: A packaged installation of Rook NFS for Kubernetes. \ No newline at end of file From 336f95f01c26924faf2c51c8864f1b656df10dcc Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:31:52 +0100 Subject: [PATCH 078/103] Remove yaml anchor Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index eb9501c..e8e6e09 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -25,7 +25,7 @@ storage: # installed on the cluster as a dependency of this Slurm # chart. 
If you are using a separate RWM StorageClass, then # set rooknfs.enabled = false - storageClassName: &storageclassname slurm-rook-nfs + storageClassName: slurm-rook-nfs # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume From 5f121966277344c7fe0834c4895cf2ac4f50c9d3 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:32:29 +0100 Subject: [PATCH 079/103] Remove anchor ref and add explanatory comment Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index e8e6e09..98fe170 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,7 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: *storageclassname + storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* volume From 350d39b4b9a6fe56a7daa0c217156e026f4a16cd Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:06 +0100 Subject: [PATCH 080/103] Add yaml anchor explanation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 98fe170..2a9eaf8 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -29,7 +29,7 @@ storage: # Name for the R-W-M volume to provision claimName: slurm-shared-storage # Capacite of the R-W-M volume - capacity: &capacity 10Gi + capacity: &capacity 10Gi # NB yaml 
anchor used so this value is also set for `rooknfs.storageCapacity` if necessary. # Values to be passed to the rook-nfs sub-chart From 58a89d4b27e7cabf5d5203ab9b0d3294a08c1b15 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:33:36 +0100 Subject: [PATCH 081/103] Add comment about name constraints Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 2a9eaf8..b89ca85 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -7,7 +7,7 @@ login: slurmd: # StatefulSet resource name - name: slurmd + name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf replicas: 2 slurmctld: From 474450b7e68b0272a53e65bae9cb75ff8b30bb64 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 14 Aug 2023 14:53:49 +0100 Subject: [PATCH 082/103] Refactored and documented values.yaml --- .github/workflows/publish-helm-chart.yml | 46 ++++++------------- .../templates/mysql-deployment.yaml | 2 +- .../var-lib-mysql-persistentvolumeclaim.yaml | 2 +- slurm-cluster-chart/values.yaml | 19 +++++--- 4 files changed, 28 insertions(+), 41 deletions(-) diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8a6f4f7..99e4c45 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,18 +1,16 @@ -name: Release Charts - +name: Publish charts +# Run the tasks on every push on: push - jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + 
publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 submodules: true @@ -20,26 +18,10 @@ jobs: id: semver uses: stackhpc/github-actions/semver@master - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - - - name: "Package Chart" - run: | - mkdir -p .cr-release-packages - helm package slurm-cluster-chart --version ${{ steps.semver.outputs.version }} --destination .cr-release-packages - - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- skip_packaging: True - env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} + \ No newline at end of file diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index 8ffd49e..debf962 100644 --- a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.sqlImage }} + image: {{ .Values.mySQL.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 841bb0f..56fc7dd 100644 --- a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.databaseStorage }} + storage: {{ .Values.mySQL.storage }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index c555b98..63e3531 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -8,22 +8,27 @@ nfs: mountPath: /home claimName: rook-nfs-pv-claim -sqlImage: mariadb:10.10 - -databaseStorage: 100Mi - +# Values for Slurm's database container +mySQL: + #Database image to be used + image: mariadb:10.10 + #Storage requested by the var-lib-mysql volume backing the database + storage: 100Mi + +# Configmap resource names configmaps: slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap -# If let undefined, assumes you have run publish-keys.sh to publish your public key prior to deployment +# Public key used for ssh access to the login node +# If let undefined, assumes you 
have run the provided publish-keys.sh script to publish your public key prior to deployment sshPublicKey: +# Secret resource names secrets: - databaseAuth: database-auth-secret mungeKey: munge-key-secret -#OOD username is rocky openOnDemand: + #Password for default Open OnDemand user 'rocky' password: password From 908f808efd07c1c66653a53b78c8e2d1ca7d9a6a Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Mon, 14 Aug 2023 17:06:32 +0100 Subject: [PATCH 083/103] Add namespace as command line arg --- publish-keys.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/publish-keys.sh b/publish-keys.sh index d293e81..bdd4e0f 100755 --- a/publish-keys.sh +++ b/publish-keys.sh @@ -1,3 +1,8 @@ -kubectl create configmap authorized-keys-configmap \ +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi +echo Installing in namespace $NAMESPACE +kubectl -n $NAMESPACE create configmap authorized-keys-configmap \ "--from-literal=authorized_keys=$(cat ~/.ssh/*.pub)" --dry-run=client -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From 925ad806fe072878206310db0422f34039723b91 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:12:44 +0100 Subject: [PATCH 084/103] Add namespace as script arg --- generate-secrets.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index db64a53..10b7f98 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -1,13 +1,17 @@ #!/bin/bash +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi -kubectl create secret generic database-auth-secret \ +kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ --from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file From 
e6c5275179a62dbf7bb86d3d8bda8e60f3600d0b Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:35:10 +0100 Subject: [PATCH 085/103] Now gives ownership to rocky affter keygen --- image/docker-entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 55bc66d..2f87d39 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -118,6 +118,7 @@ then echo "ssh keys already found" else ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + chown rocky:rocky id_rsa id_rsa.pub fi ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From f32b4f1fdfeb830569ba63446de22fec6db3ac98 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 11:36:52 +0100 Subject: [PATCH 086/103] Fix dnsConfig namespace --- slurm-cluster-chart/templates/login.yaml | 2 +- slurm-cluster-chart/templates/slurmctld.yaml | 2 +- slurm-cluster-chart/templates/slurmd.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/login.yaml b/slurm-cluster-chart/templates/login.yaml index ca63392..d8a813c 100644 --- a/slurm-cluster-chart/templates/login.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -46,7 +46,7 @@ spec: hostname: login dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmctld.yaml b/slurm-cluster-chart/templates/slurmctld.yaml index f919c5f..1644463 100644 --- a/slurm-cluster-chart/templates/slurmctld.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -40,7 +40,7 @@ spec: name: slurmctld-state dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index ff13019..62646b7 100644 
--- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -50,7 +50,7 @@ spec: privileged: true dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir From 7c0e2d9a79b62be220da2808c20c887f6db0e3a8 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:45:54 +0100 Subject: [PATCH 087/103] Fixed path --- image/docker-entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 2f87d39..f2b0bbc 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -118,7 +118,7 @@ then echo "ssh keys already found" else ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" - chown rocky:rocky id_rsa id_rsa.pub + chown rocky:rocky /home/rocky/.ssh/id_rsa /home/rocky/.ssh/id_rsa.pub fi ssh-keyscan localhost > /etc/ssh/ssh_known_hosts From 171010d0a523a51d3efd6bbfee4672bbfd4a917e Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 11:51:19 +0100 Subject: [PATCH 088/103] Updated values.yaml --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 0ca35c9..56a5e38 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:4d90e24 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7c0e2d9 replicas: slurmd: 2 From a33790b2a35cf4b94aebd4bcaa977e69269d2d89 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:06:50 +0100 Subject: [PATCH 089/103] Use builtin Helm optional dependency feature --- rooknfs/templates/nfs.yaml | 3 --- rooknfs/templates/operator.yaml | 2 -- rooknfs/templates/rbac.yaml | 4 +--- rooknfs/templates/sc.yaml | 4 +--- rooknfs/values.yaml | 5 ----- slurm-cluster-chart/Chart.yaml | 1 + 
slurm-cluster-chart/templates/pvc.yaml | 4 +--- 7 files changed, 4 insertions(+), 19 deletions(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 6fde553..1da86bc 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 @@ -32,5 +31,3 @@ spec: annotations: rook: nfs --- -{{- end }} - diff --git a/rooknfs/templates/operator.yaml b/rooknfs/templates/operator.yaml index 4a1d542..56318f6 100644 --- a/rooknfs/templates/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -137,4 +136,3 @@ spec: fieldRef: fieldPath: metadata.namespace --- -{{- end}} diff --git a/rooknfs/templates/rbac.yaml b/rooknfs/templates/rbac.yaml index b327740..422a43b 100644 --- a/rooknfs/templates/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace @@ -57,5 +56,4 @@ roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml index 0ad75fe..505bd44 100644 --- a/rooknfs/templates/sc.yaml +++ b/rooknfs/templates/sc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} --- apiVersion: storage.k8s.io/v1 kind: StorageClass @@ -13,5 +12,4 @@ parameters: provisioner: nfs.rook.io/rook-nfs-provisioner reclaimPolicy: Delete volumeBindingMode: Immediate ---- -{{- end }} \ No newline at end of file +--- \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 00a3e7f..4150967 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -1,8 +1,3 @@ -# Global flag for enabling/disabling all chart resources -# This is useful for allowing charts which use this chart -# as a dependency to toggle usage of this chart 
based on -# values in the parent chart -enabled: true # Name for the NFSServer resource created by rook serverName: rook-nfs diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 0177e24..e3d003c 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -27,3 +27,4 @@ dependencies: - name: rooknfs version: ">=0-0" repository: file://../rooknfs + condition: rooknfs.enabled diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml index 5e934ef..aab0856 100644 --- a/slurm-cluster-chart/templates/pvc.yaml +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -1,4 +1,3 @@ -{{- if .Values.rooknfs.enabled }} --- apiVersion: v1 kind: PersistentVolumeClaim @@ -10,5 +9,4 @@ spec: - ReadWriteMany resources: requests: - storage: {{ .Values.storage.capacity }} -{{- end }} \ No newline at end of file + storage: {{ .Values.storage.capacity }} \ No newline at end of file From f86952f405ee251f212024a58d8ec6dd75e40314 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 12:22:21 +0100 Subject: [PATCH 090/103] Separate Rook cleanup into correct chart --- generate-secrets.sh | 4 +- rooknfs/templates/hooks/pre-delete.yaml | 50 +++++++++++++++++++ .../templates/hooks/pre-delete.yaml | 14 +++--- 3 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 rooknfs/templates/hooks/pre-delete.yaml diff --git a/generate-secrets.sh b/generate-secrets.sh index 10b7f98..5956181 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,12 +6,12 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ -o yaml | \ kubectl -n $NAMESPACE apply -f - \ No newline at end of file diff --git a/rooknfs/templates/hooks/pre-delete.yaml b/rooknfs/templates/hooks/pre-delete.yaml new file mode 100644 
index 0000000..2c75c89 --- /dev/null +++ b/rooknfs/templates/hooks/pre-delete.yaml @@ -0,0 +1,50 @@ +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Values.serverNamespace }} nfsservers {{ .Values.serverName }} --wait + restartPolicy: Never +--- \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml index 8cdb1f3..868cbbd 100644 --- a/slurm-cluster-chart/templates/hooks/pre-delete.yaml +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -9,17 +9,17 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup --- # TODO: Create a 
job-specific ClusterRole for the ServiceAccount # instead of using the cluster-admin role here apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: rook-nfs-cleanup + name: slurm-k8s-cleanup subjects: - kind: ServiceAccount - name: rook-nfs-cleanup + name: slurm-k8s-cleanup namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole @@ -28,16 +28,17 @@ roleRef: apiVersion: batch/v1 kind: Job metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup annotations: "helm.sh/hook": pre-delete "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "1" spec: template: metadata: - name: rook-nfs-pre-delete-cleanup + name: slurm-k8s-pre-delete-cleanup spec: - serviceAccountName: rook-nfs-cleanup + serviceAccountName: slurm-k8s-cleanup containers: - name: tester image: bitnami/kubectl @@ -49,7 +50,6 @@ spec: kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait - kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait restartPolicy: Never --- {{- end }} From 1371681210c766da9871ec90ba2e140f645522be Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:40:08 +0100 Subject: [PATCH 091/103] Update docs --- README.md | 30 +++++++++++++++++------------- rooknfs/README.md | 3 +++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 11fe8b8..7411656 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Slurm Docker Cluster -This is a multi-container Slurm cluster using Kubernetes. The Helm chart -creates a named volume for persistent storage of MySQL data files as well as -an NFS volume for shared storage. 
+This is a multi-container Slurm cluster using Kubernetes. The Slurm cluster Helm chart creates a named volume for persistent storage of MySQL data files. By default, it also installs the +RookNFS Helm chart (also in this repo) to provide shared storage across the Slurm cluster nodes. ## Dependencies @@ -27,12 +26,11 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the scripts in the `/nfs` directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") ## Configuring the Cluster -All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). -Additional parameters can be found in the `values.yaml` file, which will be applied on a Helm chart deployment. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). +All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). Additional parameters can be found in the `values.yaml` file for the Helm chart. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). ## Deploying the Cluster @@ -44,21 +42,20 @@ On initial deployment ONLY, run ``` This generates a set of secrets. 
If these need to be regenerated, see "Reconfiguring the Cluster" -### Connecting RWX Volume +### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. You can deploy this by running -```console -./nfs/deploy-nfs.sh -``` -and leaving `nfs.claimName` as the provided value. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. + +See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. ### Supplying Public Keys To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh +./publish-keys.sh <target-namespace> ``` +where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace.
### Deploying with Helm @@ -66,6 +63,12 @@ After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the c ```console helm install slurm-cluster-chart ``` + +NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart +```console +helm dependency update slurm-cluster-chart +``` + Subsequent releases can be deployed using: ```console @@ -128,6 +131,7 @@ srun singularity exec docker://ghcr.io/stackhpc/mpitests-container:${MPI_CONTAIN ``` Note: The mpirun script assumes you are running as user 'rocky'. If you are running as root, you will need to include the --allow-run-as-root argument + ## Reconfiguring the Cluster ### Changes to config files diff --git a/rooknfs/README.md b/rooknfs/README.md index e69de29..5b7ad6d 100644 --- a/rooknfs/README.md +++ b/rooknfs/README.md @@ -0,0 +1,3 @@ +# RookNFS Helm Chart + +See `values.yaml` for available config options. \ No newline at end of file From fe58891e7ccd9de6cb87f92083db59841f98e7e1 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:54:30 +0100 Subject: [PATCH 092/103] Make backing RWO storage class configurable --- rooknfs/templates/nfs.yaml | 1 + rooknfs/values.yaml | 3 +++ slurm-cluster-chart/values.yaml | 7 ++++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index 1da86bc..a88fb6f 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -6,6 +6,7 @@ metadata: name: {{ .Values.claimName}} namespace: {{ .Values.serverNamespace }} spec: + storageClassName: {{ .Values.backingStorageClass }} accessModes: - ReadWriteMany resources: diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml index 4150967..4ada627 100644 --- a/rooknfs/values.yaml +++ b/rooknfs/values.yaml @@ -8,6 +8,9 @@ storageClassName: rook-nfs # Name for the Read-Write-Once backing PVC created by Rook claimName: rook-nfs-backing-pv +# Storage class to use for the 
Read-Write-Once backing PVC +backingStorageClass: + # Name for the NFS share within the NFS Resource instance shareName: share-1 diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index b89ca85..1f59a5a 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -36,7 +36,9 @@ storage: # See rook-nfs sub-chart for full set of available config values rooknfs: enabled: true - storageClassName: slurm-rook-nfs # NB this must match storage.storageClassName when using rook + # Name given to the RWM StorageClass created by Rook + # NB this must match storage.storageClassName when using Rook + storageClassName: slurm-rook-nfs # Name for the NFSServer resource created by Rook serverName: rook-nfs # Capacity for the backing Read-Write-*Once* volume @@ -46,6 +48,9 @@ rooknfs: # this value to the requested storage capacity for the RWM # volume specified in storage.capacity storageCapacity: *capacity + # Storage class to use for the Read-Write-Once backing PVC + # backingStorageClass: + sqlImage: mariadb:10.10 From 303d156f78087eefa761aab7746fd6fafbab5399 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 13:57:00 +0100 Subject: [PATCH 093/103] Mention storage capacity config --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7411656..5ac48f2 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ This generates a set of secrets. If these need to be regenerated, see "Reconfigu ### Connecting a RWX Volume -A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. 
If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX capable Storage Class for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`. See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using the RookNFS to provide the shared storage volume. From 1debdedcd97a78f17ab5dc6884ce1c26400cf624 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:00:44 +0100 Subject: [PATCH 094/103] Add note on target namespace --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5ac48f2..4e21c3d 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh +./generate-secrets.sh <target-namespace> ``` -This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +This generates a set of secrets in the target namespace to be used by the Slurm cluster.
If these need to be regenerated, see "Reconfiguring the Cluster" ### Connecting a RWX Volume From 8818a94a30df63d312b645b0450e090c6f9f1587 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 15 Aug 2023 14:20:56 +0100 Subject: [PATCH 095/103] Revert to randomly generated DB password --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index 261f3be..dab0688 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -6,7 +6,7 @@ fi kubectl -n $NAMESPACE create secret generic database-auth-secret \ --dry-run=client \ ---from-literal=password=abcdefghijklmnopqrstuvwxyz123456 \ +--from-literal=password=$(tr -dc 'A-Za-z0-9' Date: Tue, 15 Aug 2023 14:49:06 +0100 Subject: [PATCH 096/103] Conditionally include backing storage class field --- rooknfs/templates/nfs.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rooknfs/templates/nfs.yaml b/rooknfs/templates/nfs.yaml index a88fb6f..cf7b1de 100644 --- a/rooknfs/templates/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -3,10 +3,12 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: {{ .Values.claimName}} + name: {{ .Values.claimName }} namespace: {{ .Values.serverNamespace }} spec: + {{- if .Values.backingStorageClass }} storageClassName: {{ .Values.backingStorageClass }} + {{- end }} accessModes: - ReadWriteMany resources: From 4c7f875813917e9753542d4e600591efef016e23 Mon Sep 17 00:00:00 2001 From: Will Date: Tue, 15 Aug 2023 16:03:08 +0100 Subject: [PATCH 097/103] Changed database template name --- slurm-cluster-chart/templates/mysql-deployment.yaml | 2 +- .../templates/var-lib-mysql-persistentvolumeclaim.yaml | 2 +- slurm-cluster-chart/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index debf962..96dc88f 100644 --- 
a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.mySQL.image }} + image: {{ .Values.database.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 56fc7dd..a5f4503 100644 --- a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.mySQL.storage }} + storage: {{ .Values.database.storage }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 63e3531..7c3a481 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -9,7 +9,7 @@ nfs: claimName: rook-nfs-pv-claim # Values for Slurm's database container -mySQL: +database: #Database image to be used image: mariadb:10.10 #Storage requested by the var-lib-mysql volume backing the database From 50e728515c3f2416a508d121ca6ed180278cab43 Mon Sep 17 00:00:00 2001 From: Scott Davidson <49713135+sd109@users.noreply.github.com> Date: Wed, 16 Aug 2023 10:13:36 +0100 Subject: [PATCH 098/103] Punctuation Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2458e39..aad9b4b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also 
expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster"). ## Configuring the Cluster From 729e43c0f07aad5f114131be8dbc5e0096b0cb76 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:17:10 +0100 Subject: [PATCH 099/103] Clarify namespace arg as optional --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aad9b4b..c0b7d61 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain On initial deployment ONLY, run ```console -./generate-secrets.sh <target-namespace> +./generate-secrets.sh [<target-namespace>] ``` This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster" @@ -55,7 +55,7 @@ See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh <target-namespace> +./publish-keys.sh [<target-namespace>] ``` where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace.
From 43a5dd7232c5bc149a232bfe9acb71f732b08f40 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 16 Aug 2023 10:18:07 +0100 Subject: [PATCH 100/103] Re-disable line wrapping --- generate-secrets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate-secrets.sh b/generate-secrets.sh index dab0688..a49ede2 100755 --- a/generate-secrets.sh +++ b/generate-secrets.sh @@ -12,7 +12,7 @@ kubectl -n $NAMESPACE apply -f - kubectl -n $NAMESPACE create secret generic munge-key-secret \ --dry-run=client \ ---from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64) \ +--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \ -o yaml | \ kubectl -n $NAMESPACE apply -f - From 7c5b6c4cbb2b4f3f7055fe7f0a325f1a2252ab1a Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 16 Aug 2023 12:12:30 +0100 Subject: [PATCH 101/103] Updated image --- slurm-cluster-chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index df19aa5..3d41248 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,4 +1,4 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7c0e2d9 #OUTDATED, DON'T USE! 
+slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 login: # Deployment resource name From d31306322c8ecd5d9e4326c11f6f2ccf0d930b9c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 08:22:42 +0000 Subject: [PATCH 102/103] only permit one slurmd pod per k8s node --- slurm-cluster-chart/templates/slurmd.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index 62646b7..b017093 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -20,14 +20,6 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd spec: - topologySpreadConstraints: - - maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd containers: - args: - slurmd @@ -37,6 +29,7 @@ spec: name: slurmd ports: - containerPort: 6818 + hostPort: 6818 resources: {} volumeMounts: - mountPath: /etc/slurm/ From 6530f783a681a4758b767b23bc5e79997d9f8011 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 17 Aug 2023 08:46:13 +0000 Subject: [PATCH 103/103] use host networking --- slurm-cluster-chart/templates/slurmd.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml index b017093..bec55ce 100644 --- a/slurm-cluster-chart/templates/slurmd.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -25,6 +25,13 @@ spec: - slurmd - -F - -vvv + - -N + - "$(POD_NAME)" + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name image: {{ .Values.slurmImage }} name: slurmd ports: @@ -41,6 +48,8 @@ spec: subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet dnsConfig: searches: - slurmd.{{ .Release.Namespace }}.svc.cluster.local