From 46bae5cceb39aba95a00f49b77f558d0b67023ae Mon Sep 17 00:00:00 2001 From: Uburro Date: Fri, 10 Jan 2025 00:04:57 +0100 Subject: [PATCH] add downscaleAndOverwritePopulateJail --- api/v1/slurmcluster_types.go | 4 +- .../bases/slurm.nebius.ai_slurmclusters.yaml | 2 + helm/slurm-cluster/values.yaml | 1 + .../templates/slurmcluster-crd.yaml | 2 + helm/soperator/crds/slurmcluster-crd.yaml | 2 + internal/check/maintanence.go | 6 +- internal/consts/maintenance.go | 9 +-- .../clustercontroller/populate_job.go | 24 +++++++- .../controller/clustercontroller/reconcile.go | 56 +++++++++++++++++++ internal/render/populate_jail/container.go | 4 +- 10 files changed, 102 insertions(+), 8 deletions(-) diff --git a/api/v1/slurmcluster_types.go b/api/v1/slurmcluster_types.go index 87cfd470a..6c9addec5 100644 --- a/api/v1/slurmcluster_types.go +++ b/api/v1/slurmcluster_types.go @@ -28,10 +28,11 @@ type SlurmClusterSpec struct { // - none: No maintenance is performed. The cluster operates normally. // - downscale: Scales down all components to 0. // - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail. + // - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrite populateJail (same as overwrite=true). // - skipPopulateJail: Skips the execution of the populateJail job during maintenance. 
// // +kubebuilder:validation:Optional - // +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail + // +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;downscaleAndOverwritePopulateJail;skipPopulateJail // +kubebuilder:default="none" Maintenance *consts.MaintenanceMode `json:"maintenance,omitempty"` @@ -1000,6 +1001,7 @@ const ( ConditionClusterWorkersAvailable = "WorkersAvailable" ConditionClusterLoginAvailable = "LoginAvailable" ConditionClusterAccountingAvailable = "AccountingAvailable" + ConditionClusterPopulateJailMode = "PopulateJailMode" PhaseClusterReconciling = "Reconciling" PhaseClusterNotAvailable = "Not available" diff --git a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml index f4da97010..8d40a0eb5 100644 --- a/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml +++ b/config/crd/bases/slurm.nebius.ai_slurmclusters.yaml @@ -1073,11 +1073,13 @@ spec: - none: No maintenance is performed. The cluster operates normally. - downscale: Scales down all components to 0. - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail. + - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrite populateJail (same as overwrite=true). - skipPopulateJail: Skips the execution of the populateJail job during maintenance. enum: - none - downscale - downscaleAndDeletePopulateJail + - downscaleAndOverwritePopulateJail - skipPopulateJail type: string ncclSettings: diff --git a/helm/slurm-cluster/values.yaml b/helm/slurm-cluster/values.yaml index 298065fb2..1295ee7a2 100644 --- a/helm/slurm-cluster/values.yaml +++ b/helm/slurm-cluster/values.yaml @@ -8,6 +8,7 @@ useDefaultAppArmorProfile: true # - none: No maintenance is performed. The cluster operates normally. # - downscale: Scales down all components to 0. 
# - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail. +# - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrite populateJail (same as overwrite=true). # - skipPopulateJail: Skips the execution of the populateJail job during maintenance. maintenance: "none" # Slurm cluster type. Can be now gpu or cpu diff --git a/helm/soperator-crds/templates/slurmcluster-crd.yaml b/helm/soperator-crds/templates/slurmcluster-crd.yaml index 4ff50f1df..fd69623ae 100644 --- a/helm/soperator-crds/templates/slurmcluster-crd.yaml +++ b/helm/soperator-crds/templates/slurmcluster-crd.yaml @@ -1072,11 +1072,13 @@ spec: - none: No maintenance is performed. The cluster operates normally. - downscale: Scales down all components to 0. - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail. + - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrite populateJail (same as overwrite=true). - skipPopulateJail: Skips the execution of the populateJail job during maintenance. enum: - none - downscale - downscaleAndDeletePopulateJail + - downscaleAndOverwritePopulateJail - skipPopulateJail type: string ncclSettings: diff --git a/helm/soperator/crds/slurmcluster-crd.yaml b/helm/soperator/crds/slurmcluster-crd.yaml index 4ff50f1df..fd69623ae 100644 --- a/helm/soperator/crds/slurmcluster-crd.yaml +++ b/helm/soperator/crds/slurmcluster-crd.yaml @@ -1072,11 +1072,13 @@ spec: - none: No maintenance is performed. The cluster operates normally. - downscale: Scales down all components to 0. - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail. + - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrite populateJail (same as overwrite=true). - skipPopulateJail: Skips the execution of the populateJail job during maintenance. 
enum: - none - downscale - downscaleAndDeletePopulateJail + - downscaleAndOverwritePopulateJail - skipPopulateJail type: string ncclSettings: diff --git a/internal/check/maintanence.go b/internal/check/maintanence.go index dc3d4ace4..0cd419e69 100644 --- a/internal/check/maintanence.go +++ b/internal/check/maintanence.go @@ -10,6 +10,10 @@ func IsModeDownscaleAndDeletePopulate(maintenance *consts.MaintenanceMode) bool return maintenance != nil && *maintenance == consts.ModeDownscaleAndDeletePopulate } +func IsModeDownscaleAndOverwritePopulate(maintenance *consts.MaintenanceMode) bool { + return maintenance != nil && *maintenance == consts.ModeDownscaleAndOverwritePopulate +} + func IsModeSkipPopulateJail(maintenance *consts.MaintenanceMode) bool { - return maintenance != nil && *maintenance == consts.ModeSkipPopulateJail + return maintenance != nil && *maintenance == consts.ModeSkipPopulate } diff --git a/internal/consts/maintenance.go b/internal/consts/maintenance.go index 3d4d084a0..680620eaf 100644 --- a/internal/consts/maintenance.go +++ b/internal/consts/maintenance.go @@ -3,10 +3,11 @@ package consts type MaintenanceMode string const ( - ModeNone MaintenanceMode = "none" - ModeDownscale MaintenanceMode = "downscale" - ModeDownscaleAndDeletePopulate MaintenanceMode = "downscaleAndDeletePopulateJail" - ModeSkipPopulateJail MaintenanceMode = "skipPopulateJail" + ModeNone MaintenanceMode = "none" + ModeDownscale MaintenanceMode = "downscale" + ModeDownscaleAndDeletePopulate MaintenanceMode = "downscaleAndDeletePopulateJail" + ModeDownscaleAndOverwritePopulate MaintenanceMode = "downscaleAndOverwritePopulateJail" + ModeSkipPopulate MaintenanceMode = "skipPopulateJail" ) const ( diff --git a/internal/controller/clustercontroller/populate_job.go b/internal/controller/clustercontroller/populate_job.go index cd6e15f4e..9ddec21fc 100644 --- a/internal/controller/clustercontroller/populate_job.go +++ b/internal/controller/clustercontroller/populate_job.go @@ -7,12 +7,14 
@@ import ( "github.com/pkg/errors" batchv1 "k8s.io/api/batch/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/check" + "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/logfield" "nebius.ai/slurm-operator/internal/render/populate_jail" "nebius.ai/slurm-operator/internal/utils" @@ -37,7 +39,9 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail( stepLogger := log.FromContext(stepCtx) stepLogger.Info("Reconciling") - isMaintenanceStopMode := check.IsModeDownscaleAndDeletePopulate(clusterValues.PopulateJail.Maintenance) + isMaintenanceStopMode := check.IsModeDownscaleAndDeletePopulate( + clusterValues.PopulateJail.Maintenance) + desired := batchv1.Job{} getErr := r.Get(stepCtx, client.ObjectKey{ @@ -56,6 +60,15 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail( } stepLogger.Info("Deleted") } + if check.IsModeDownscaleAndOverwritePopulate(clusterValues.PopulateJail.Maintenance) { + if isConditionNonOverwrite(cluster.Status.Conditions) { + if err := r.Delete(stepCtx, &desired); err != nil { + stepLogger.Error(err, "Failed to delete") + return errors.Wrap(err, "deleting Populate jail Job") + } + stepLogger.Info("Successfully deleted Populate Jail Job") + } + } return nil } @@ -139,3 +152,12 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail( return nil } + +func isConditionNonOverwrite(conditions []metav1.Condition) bool { + for _, condition := range conditions { + if condition.Type == slurmv1.ConditionClusterPopulateJailMode { + return condition.Reason != string(consts.ModeDownscaleAndOverwritePopulate) + } + } + return false +} diff --git a/internal/controller/clustercontroller/reconcile.go b/internal/controller/clustercontroller/reconcile.go index ae019fa57..d1790232c 100644 --- 
a/internal/controller/clustercontroller/reconcile.go +++ b/internal/controller/clustercontroller/reconcile.go @@ -3,6 +3,7 @@ package clustercontroller import ( "context" errorsStd "errors" + "fmt" "os" "sync" "time" @@ -33,6 +34,7 @@ import ( slurmv1 "nebius.ai/slurm-operator/api/v1" "nebius.ai/slurm-operator/internal/check" + "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/controller/reconciler" "nebius.ai/slurm-operator/internal/controller/state" "nebius.ai/slurm-operator/internal/logfield" @@ -283,6 +285,60 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1 return ctrl.Result{}, err } + // Populate Jail + switch { + case check.IsModeSkipPopulateJail(clusterValues.PopulateJail.Maintenance): + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterPopulateJailMode, + Status: metav1.ConditionTrue, Reason: string(consts.ModeSkipPopulate), + Message: "Populate Jail is skipped", + }) + }); err != nil { + return ctrl.Result{}, err + } + case check.IsModeDownscaleAndDeletePopulate(clusterValues.PopulateJail.Maintenance): + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterPopulateJailMode, + Status: metav1.ConditionTrue, Reason: string(consts.ModeDownscaleAndDeletePopulate), + Message: "Populate Jail is deleted", + }) + }); err != nil { + return ctrl.Result{}, err + } + case check.IsModeDownscaleAndOverwritePopulate(clusterValues.PopulateJail.Maintenance): + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterPopulateJailMode, + Status: metav1.ConditionTrue, Reason: string(consts.ModeDownscaleAndOverwritePopulate), + Message: "Populate Jail is overwritten", + }) + }); err != nil { + return ctrl.Result{}, err + } + 
case !check.IsMaintenanceActive(clusterValues.PopulateJail.Maintenance): + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterPopulateJailMode, + Status: metav1.ConditionTrue, Reason: string(consts.ModeNone), + Message: fmt.Sprintf("Populate Jail maintenanceMode is %s", consts.ModeNone), + }) + }); err != nil { + return ctrl.Result{}, err + } + default: + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterPopulateJailMode, + Status: metav1.ConditionUnknown, Reason: "Unknown", + Message: "Unknown Populate Jail maintenanceMode", + }) + }); err != nil { + return ctrl.Result{}, err + } + } + // Controllers if res, err := r.ValidateControllers(ctx, cluster, clusterValues); err != nil { logger.Error(err, "Failed to validate Slurm controllers") diff --git a/internal/render/populate_jail/container.go b/internal/render/populate_jail/container.go index 468bd2be4..e77a290ad 100644 --- a/internal/render/populate_jail/container.go +++ b/internal/render/populate_jail/container.go @@ -3,6 +3,7 @@ package populate_jail import ( corev1 "k8s.io/api/core/v1" + "nebius.ai/slurm-operator/internal/check" "nebius.ai/slurm-operator/internal/consts" "nebius.ai/slurm-operator/internal/render/common" "nebius.ai/slurm-operator/internal/values" @@ -16,9 +17,10 @@ func renderContainerPopulateJail(clusterType consts.ClusterType, populateJail *v volumeMounts = append(volumeMounts, common.RenderVolumeMountJailSnapshot()) } overwriteEnv := "0" - if populateJail.Overwrite { + if populateJail.Overwrite || check.IsModeDownscaleAndOverwritePopulate(populateJail.Maintenance) { overwriteEnv = "1" } + return corev1.Container{ Name: populateJail.ContainerPopulateJail.Name, Image: populateJail.ContainerPopulateJail.Image,