diff --git a/api/v1/slurmcluster_types.go b/api/v1/slurmcluster_types.go
index 3385bdb8..99dcd051 100644
--- a/api/v1/slurmcluster_types.go
+++ b/api/v1/slurmcluster_types.go
@@ -682,6 +682,11 @@ type SlurmNodeWorker struct {
 	//
 	// +kubebuilder:validation:Optional
 	SlurmNodeExtra string `json:"slurmNodeExtra,omitempty"`
+
+	// PriorityClass defines the priority class for the Slurm worker node
+	//
+	// +kubebuilder:validation:Optional
+	PriorityClass string `json:"priorityClass,omitempty"`
 }
 
 // SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
diff --git a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
index dfe168ed..e7d35b74 100644
--- a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
+++ b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
@@ -4025,6 +4025,10 @@ spec:
                         required:
                         - image
                         type: object
+                      priorityClass:
+                        description: PriorityClass defines the priority class for
+                          the Slurm worker node
+                        type: string
                       size:
                         description: Size defines the number of node instances
                         format: int32
diff --git a/helm/slurm-cluster/templates/priority-class.yaml b/helm/slurm-cluster/templates/priority-class.yaml
new file mode 100644
index 00000000..f4aec8d4
--- /dev/null
+++ b/helm/slurm-cluster/templates/priority-class.yaml
@@ -0,0 +1,6 @@
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: {{ include "slurm-cluster.name" . }}
+value: 1000000
+globalDefault: false
diff --git a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml
index 309a0049..733ddf92 100644
--- a/helm/slurm-cluster/templates/slurm-cluster-cr.yaml
+++ b/helm/slurm-cluster/templates/slurm-cluster-cr.yaml
@@ -159,6 +159,7 @@ spec:
     worker:
       cgroupVersion: {{ .Values.slurmNodes.worker.cgroupVersion | quote }}
       enableGDRCopy: {{ default false .Values.slurmNodes.worker.enableGDRCopy }}
+      priorityClass: {{ default (include "slurm-cluster.name" .) .Values.slurmNodes.worker.priorityClass | quote }}
       {{- if .Values.slurmNodes.worker.slurmNodeExtra }}
       slurmNodeExtra: {{ .Values.slurmNodes.worker.slurmNodeExtra | quote }}
       {{- end }}
diff --git a/helm/slurm-cluster/values.yaml b/helm/slurm-cluster/values.yaml
index 90f6ee2e..f2081f6e 100644
--- a/helm/slurm-cluster/values.yaml
+++ b/helm/slurm-cluster/values.yaml
@@ -284,6 +284,7 @@ slurmNodes:
     k8sNodeFilterName: "gpu"
     cgroupVersion: v2
     enableGDRCopy: false
+    priorityClass: ""
     slurmNodeExtra: ""
     supervisordConfigMapRefName: ""
     sshdConfigMapRefName: ""
diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml
index 270619ab..5e91f384 100644
--- a/helm/soperator-crds/templates/slurmcluster-crd.yaml
+++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml
@@ -4024,6 +4024,10 @@ spec:
                         required:
                         - image
                         type: object
+                      priorityClass:
+                        description: PriorityClass defines the priority class for
+                          the Slurm worker node
+                        type: string
                       size:
                         description: Size defines the number of node instances
                         format: int32
diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml
index 270619ab..5e91f384 100644
--- a/helm/soperator/crds/slurmcluster-crd.yaml
+++ b/helm/soperator/crds/slurmcluster-crd.yaml
@@ -4024,6 +4024,10 @@ spec:
                         required:
                         - image
                         type: object
+                      priorityClass:
+                        description: PriorityClass defines the priority class for
+                          the Slurm worker node
+                        type: string
                       size:
                         description: Size defines the number of node instances
                         format: int32
diff --git a/internal/render/worker/statefulset.go b/internal/render/worker/statefulset.go
index 5bbe5ade..6f63bca3 100644
--- a/internal/render/worker/statefulset.go
+++ b/internal/render/worker/statefulset.go
@@ -69,6 +69,25 @@ func RenderStatefulSet(
 		replicas = ptr.To(consts.ZeroReplicas)
 	}
 
+	// PriorityClassName is copied as-is: an empty value is the field's zero value, so no guard is needed.
+	spec := corev1.PodSpec{
+		PriorityClassName:  worker.PriorityClass,
+		ServiceAccountName: naming.BuildServiceAccountWorkerName(clusterName),
+		Affinity:           nodeFilter.Affinity,
+		NodeSelector:       nodeFilter.NodeSelector,
+		Tolerations:        nodeFilter.Tolerations,
+		InitContainers:     initContainers,
+		Containers: []corev1.Container{
+			slurmdContainer,
+		},
+		Volumes: volumes,
+		DNSConfig: &corev1.PodDNSConfig{
+			Searches: []string{
+				naming.BuildServiceFQDN(consts.ComponentTypeWorker, namespace, clusterName),
+			},
+		},
+	}
+
 	return appsv1.StatefulSet{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: worker.StatefulSet.Name,
@@ -99,22 +118,7 @@ func RenderStatefulSet(
 					Labels:      labels,
 					Annotations: renderAnnotations(worker, clusterName, namespace),
 				},
-				Spec: corev1.PodSpec{
-					ServiceAccountName: naming.BuildServiceAccountWorkerName(clusterName),
-					Affinity:           nodeFilter.Affinity,
-					NodeSelector:       nodeFilter.NodeSelector,
-					Tolerations:        nodeFilter.Tolerations,
-					InitContainers:     initContainers,
-					Containers: []corev1.Container{
-						slurmdContainer,
-					},
-					Volumes: volumes,
-					DNSConfig: &corev1.PodDNSConfig{
-						Searches: []string{
-							naming.BuildServiceFQDN(consts.ComponentTypeWorker, namespace, clusterName),
-						},
-					},
-				},
+				Spec: spec,
 			},
 		},
 	}, nil
diff --git a/internal/values/slurm_worker.go b/internal/values/slurm_worker.go
index dc3597fd..09400de0 100644
--- a/internal/values/slurm_worker.go
+++ b/internal/values/slurm_worker.go
@@ -27,6 +27,7 @@ type SlurmWorker struct {
 	CgroupVersion  string
 	EnableGDRCopy  bool
 	SlurmNodeExtra string
+	PriorityClass  string
 
 	Service     Service
 	StatefulSet StatefulSet
@@ -88,6 +89,7 @@ func buildSlurmWorkerFrom(
 		SharedMemorySize:          worker.Volumes.SharedMemorySize,
 		CgroupVersion:             worker.CgroupVersion,
 		EnableGDRCopy:             worker.EnableGDRCopy,
+		PriorityClass:             worker.PriorityClass,
 		UseDefaultAppArmorProfile: useDefaultAppArmorProfile,
 		SlurmNodeExtra:            worker.SlurmNodeExtra,
 		SSHDConfigMapName:         sshdConfigMapName,