Skip to content

Commit

Permalink
MSP-3944: add priority class
Browse files Browse the repository at this point in the history
  • Loading branch information
Uburro committed Jan 10, 2025
1 parent 419463a commit 8700921
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 16 deletions.
5 changes: 5 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,11 @@ type SlurmNodeWorker struct {
//
// +kubebuilder:validation:Optional
SlurmNodeExtra string `json:"slurmNodeExtra,omitempty"`

// PriorityClass defines the priority class for the Slurm worker node
//
// +kubebuilder:validation:Optional
PriorityClass string `json:"priorityClass,omitempty"`
}

// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4025,6 +4025,10 @@ spec:
required:
- image
type: object
priorityClass:
description: PriorityClass defines the priority class for
the Slurm worker node
type: string
size:
description: Size defines the number of node instances
format: int32
Expand Down
6 changes: 6 additions & 0 deletions helm/slurm-cluster/templates/priority-class.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# PriorityClass shipped with the slurm-cluster chart.
# The object name comes from the "slurm-cluster.name" template helper; the
# chart's SlurmCluster CR template falls back to the same helper output when
# slurmNodes.worker.priorityClass is left empty.
# NOTE(review): PriorityClass is a cluster-scoped Kubernetes resource, so two
# releases that resolve to the same name would presumably conflict — confirm.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: {{ include "slurm-cluster.name" . }}
# Priority value carried by this class.
value: 1000000
# This class is not the cluster-wide default; it applies only where referenced by name.
globalDefault: false
1 change: 1 addition & 0 deletions helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ spec:
worker:
cgroupVersion: {{ .Values.slurmNodes.worker.cgroupVersion | quote }}
enableGDRCopy: {{ default false .Values.slurmNodes.worker.enableGDRCopy }}
priorityClass: {{ default (include "slurm-cluster.name" .) .Values.slurmNodes.worker.priorityClass | quote }}
{{- if .Values.slurmNodes.worker.slurmNodeExtra }}
slurmNodeExtra: {{ .Values.slurmNodes.worker.slurmNodeExtra | quote }}
{{- end }}
Expand Down
1 change: 1 addition & 0 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ slurmNodes:
k8sNodeFilterName: "gpu"
cgroupVersion: v2
enableGDRCopy: false
priorityClass: ""
slurmNodeExtra: ""
supervisordConfigMapRefName: ""
sshdConfigMapRefName: ""
Expand Down
4 changes: 4 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4024,6 +4024,10 @@ spec:
required:
- image
type: object
priorityClass:
description: PriorityClass defines the priority class for
the Slurm worker node
type: string
size:
description: Size defines the number of node instances
format: int32
Expand Down
4 changes: 4 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4024,6 +4024,10 @@ spec:
required:
- image
type: object
priorityClass:
description: PriorityClass defines the priority class for
the Slurm worker node
type: string
size:
description: Size defines the number of node instances
format: int32
Expand Down
39 changes: 23 additions & 16 deletions internal/render/worker/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,28 @@ func RenderStatefulSet(
replicas = ptr.To(consts.ZeroReplicas)
}

spec := corev1.PodSpec{
PriorityClassName: worker.PriorityClass,
ServiceAccountName: naming.BuildServiceAccountWorkerName(clusterName),
Affinity: nodeFilter.Affinity,
NodeSelector: nodeFilter.NodeSelector,
Tolerations: nodeFilter.Tolerations,
InitContainers: initContainers,
Containers: []corev1.Container{
slurmdContainer,
},
Volumes: volumes,
DNSConfig: &corev1.PodDNSConfig{
Searches: []string{
naming.BuildServiceFQDN(consts.ComponentTypeWorker, namespace, clusterName),
},
},
}

if worker.PriorityClass != "" {
spec.PriorityClassName = worker.PriorityClass
}

return appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: worker.StatefulSet.Name,
Expand Down Expand Up @@ -99,22 +121,7 @@ func RenderStatefulSet(
Labels: labels,
Annotations: renderAnnotations(worker, clusterName, namespace),
},
Spec: corev1.PodSpec{
ServiceAccountName: naming.BuildServiceAccountWorkerName(clusterName),
Affinity: nodeFilter.Affinity,
NodeSelector: nodeFilter.NodeSelector,
Tolerations: nodeFilter.Tolerations,
InitContainers: initContainers,
Containers: []corev1.Container{
slurmdContainer,
},
Volumes: volumes,
DNSConfig: &corev1.PodDNSConfig{
Searches: []string{
naming.BuildServiceFQDN(consts.ComponentTypeWorker, namespace, clusterName),
},
},
},
Spec: spec,
},
},
}, nil
Expand Down
2 changes: 2 additions & 0 deletions internal/values/slurm_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ type SlurmWorker struct {
CgroupVersion string
EnableGDRCopy bool
SlurmNodeExtra string
PriorityClass string

Service Service
StatefulSet StatefulSet
Expand Down Expand Up @@ -88,6 +89,7 @@ func buildSlurmWorkerFrom(
SharedMemorySize: worker.Volumes.SharedMemorySize,
CgroupVersion: worker.CgroupVersion,
EnableGDRCopy: worker.EnableGDRCopy,
PriorityClass: worker.PriorityClass,
UseDefaultAppArmorProfile: useDefaultAppArmorProfile,
SlurmNodeExtra: worker.SlurmNodeExtra,
SSHDConfigMapName: sshdConfigMapName,
Expand Down

0 comments on commit 8700921

Please sign in to comment.