Skip to content

Commit

Permalink
Merge pull request #311 from nebius/downscale-and-overwrite
Browse files Browse the repository at this point in the history
add downscaleAndOverwritePopulateJail
  • Loading branch information
Uburro authored Jan 10, 2025
2 parents 4df3f43 + 46bae5c commit 318052e
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 8 deletions.
4 changes: 3 additions & 1 deletion api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ type SlurmClusterSpec struct {
// - none: No maintenance is performed. The cluster operates normally.
// - downscale: Scales down all components to 0.
// - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
// - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
// - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
//
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
// +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;downscaleAndOverwritePopulateJail;skipPopulateJail
// +kubebuilder:default="none"
Maintenance *consts.MaintenanceMode `json:"maintenance,omitempty"`

Expand Down Expand Up @@ -1019,6 +1020,7 @@ const (
ConditionClusterWorkersAvailable = "WorkersAvailable"
ConditionClusterLoginAvailable = "LoginAvailable"
ConditionClusterAccountingAvailable = "AccountingAvailable"
ConditionClusterPopulateJailMode = "PopulateJailMode"

PhaseClusterReconciling = "Reconciling"
PhaseClusterNotAvailable = "Not available"
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1073,11 +1073,13 @@ spec:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
- downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- downscaleAndOverwritePopulateJail
- skipPopulateJail
type: string
ncclSettings:
Expand Down
1 change: 1 addition & 0 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ useDefaultAppArmorProfile: false
# - none: No maintenance is performed. The cluster operates normally.
# - downscale: Scales down all components to 0.
# - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
# - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
# - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
maintenance: "none"
# Slurm cluster type. Can be now gpu or cpu
Expand Down
2 changes: 2 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1072,11 +1072,13 @@ spec:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
- downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- downscaleAndOverwritePopulateJail
- skipPopulateJail
type: string
ncclSettings:
Expand Down
2 changes: 2 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1072,11 +1072,13 @@ spec:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
- downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- downscaleAndOverwritePopulateJail
- skipPopulateJail
type: string
ncclSettings:
Expand Down
6 changes: 5 additions & 1 deletion internal/check/maintanence.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ func IsModeDownscaleAndDeletePopulate(maintenance *consts.MaintenanceMode) bool
return maintenance != nil && *maintenance == consts.ModeDownscaleAndDeletePopulate
}

// IsModeDownscaleAndOverwritePopulate reports whether maintenance points at
// consts.ModeDownscaleAndOverwritePopulate. A nil pointer yields false.
func IsModeDownscaleAndOverwritePopulate(maintenance *consts.MaintenanceMode) bool {
	if maintenance == nil {
		return false
	}
	return *maintenance == consts.ModeDownscaleAndOverwritePopulate
}

// IsModeSkipPopulateJail reports whether maintenance points at
// consts.ModeSkipPopulate (the "skipPopulateJail" mode). A nil pointer
// yields false.
func IsModeSkipPopulateJail(maintenance *consts.MaintenanceMode) bool {
	// Single comparison against the current constant name; a second,
	// unreachable return using the older spelling was dropped.
	return maintenance != nil && *maintenance == consts.ModeSkipPopulate
}
9 changes: 5 additions & 4 deletions internal/consts/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ package consts
// MaintenanceMode is the string-typed name of a cluster maintenance mode.
type MaintenanceMode string

// Known maintenance modes. Each identifier is declared exactly once;
// declaring the same constant twice in one block is a compile error.
const (
	// ModeNone: no maintenance is performed.
	ModeNone MaintenanceMode = "none"
	// ModeDownscale: scales down all components to 0.
	ModeDownscale MaintenanceMode = "downscale"
	// ModeDownscaleAndDeletePopulate: scales down all components to 0 and
	// deletes the populateJail Jobs.
	ModeDownscaleAndDeletePopulate MaintenanceMode = "downscaleAndDeletePopulateJail"
	// ModeDownscaleAndOverwritePopulate: scales down all components to 0 and
	// overwrites populateJail.
	ModeDownscaleAndOverwritePopulate MaintenanceMode = "downscaleAndOverwritePopulateJail"
	// ModeSkipPopulate: skips the execution of the populateJail job.
	ModeSkipPopulate MaintenanceMode = "skipPopulateJail"
	// ModeSkipPopulateJail carries the same value as ModeSkipPopulate; it is
	// kept so that code using this spelling keeps compiling.
	ModeSkipPopulateJail = ModeSkipPopulate
)

const (
Expand Down
24 changes: 23 additions & 1 deletion internal/controller/clustercontroller/populate_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ import (
"github.com/pkg/errors"
batchv1 "k8s.io/api/batch/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/logfield"
"nebius.ai/slurm-operator/internal/render/populate_jail"
"nebius.ai/slurm-operator/internal/utils"
Expand All @@ -37,7 +39,9 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail(
stepLogger := log.FromContext(stepCtx)
stepLogger.Info("Reconciling")

isMaintenanceStopMode := check.IsModeDownscaleAndDeletePopulate(clusterValues.PopulateJail.Maintenance)
isMaintenanceStopMode := check.IsModeDownscaleAndDeletePopulate(
clusterValues.PopulateJail.Maintenance)

desired := batchv1.Job{}
getErr := r.Get(stepCtx,
client.ObjectKey{
Expand All @@ -56,6 +60,15 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail(
}
stepLogger.Info("Deleted")
}
if check.IsModeDownscaleAndOverwritePopulate(clusterValues.PopulateJail.Maintenance) {
if isConditionNonOverwrite(cluster.Status.Conditions) {
if err := r.Delete(stepCtx, &desired); err != nil {
stepLogger.Error(err, "Failed to delete")
return errors.Wrap(err, "deleting Populate jail Job")
}
stepLogger.Info("Successfully deleted Populate Jail Job")
}
}
return nil
}

Expand Down Expand Up @@ -139,3 +152,12 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail(

return nil
}

// isConditionNonOverwrite looks for the first condition whose Type is
// slurmv1.ConditionClusterPopulateJailMode and reports whether its Reason
// differs from consts.ModeDownscaleAndOverwritePopulate. It returns false
// when no such condition is present.
func isConditionNonOverwrite(conditions []metav1.Condition) bool {
	overwriteReason := string(consts.ModeDownscaleAndOverwritePopulate)
	for i := range conditions {
		if conditions[i].Type != slurmv1.ConditionClusterPopulateJailMode {
			continue
		}
		// Only the first matching condition is consulted.
		return conditions[i].Reason != overwriteReason
	}
	return false
}
56 changes: 56 additions & 0 deletions internal/controller/clustercontroller/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package clustercontroller
import (
"context"
errorsStd "errors"
"fmt"
"os"
"sync"
"time"
Expand Down Expand Up @@ -33,6 +34,7 @@ import (

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/controller/reconciler"
"nebius.ai/slurm-operator/internal/controller/state"
"nebius.ai/slurm-operator/internal/logfield"
Expand Down Expand Up @@ -298,6 +300,60 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1
}
}

// Populate Jail
switch {
case check.IsModeSkipPopulateJail(clusterValues.PopulateJail.Maintenance):
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterPopulateJailMode,
Status: metav1.ConditionTrue, Reason: string(consts.ModeSkipPopulate),
Message: "Populate Jail is skipped",
})
}); err != nil {
return ctrl.Result{}, err
}
case check.IsModeDownscaleAndDeletePopulate(clusterValues.PopulateJail.Maintenance):
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterPopulateJailMode,
Status: metav1.ConditionTrue, Reason: string(consts.ModeDownscaleAndDeletePopulate),
Message: "Populate Jail is deleted",
})
}); err != nil {
return ctrl.Result{}, err
}
case check.IsModeDownscaleAndOverwritePopulate(clusterValues.PopulateJail.Maintenance):
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterPopulateJailMode,
Status: metav1.ConditionTrue, Reason: string(consts.ModeDownscaleAndOverwritePopulate),
Message: "Populate Jail is overwritten",
})
}); err != nil {
return ctrl.Result{}, err
}
case !check.IsMaintenanceActive(clusterValues.PopulateJail.Maintenance):
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterPopulateJailMode,
Status: metav1.ConditionTrue, Reason: string(consts.ModeNone),
Message: fmt.Sprintf("Populate Jail maintenanceMode is %s", consts.ModeNone),
})
}); err != nil {
return ctrl.Result{}, err
}
default:
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterPopulateJailMode,
Status: metav1.ConditionUnknown, Reason: "Unknown",
Message: "Unknown Populate Jail maintenanceMode",
})
}); err != nil {
return ctrl.Result{}, err
}
}

// Controllers
switch {
case check.IsMaintenanceActive(clusterValues.NodeController.Maintenance):
Expand Down
4 changes: 3 additions & 1 deletion internal/render/populate_jail/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package populate_jail
import (
corev1 "k8s.io/api/core/v1"

"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/render/common"
"nebius.ai/slurm-operator/internal/values"
Expand All @@ -16,9 +17,10 @@ func renderContainerPopulateJail(clusterType consts.ClusterType, populateJail *v
volumeMounts = append(volumeMounts, common.RenderVolumeMountJailSnapshot())
}
overwriteEnv := "0"
if populateJail.Overwrite {
if populateJail.Overwrite || check.IsModeDownscaleAndOverwritePopulate(populateJail.Maintenance) {
overwriteEnv = "1"
}

return corev1.Container{
Name: populateJail.ContainerPopulateJail.Name,
Image: populateJail.ContainerPopulateJail.Image,
Expand Down

0 comments on commit 318052e

Please sign in to comment.