diff --git a/internal/controller/clustercontroller/accounting.go b/internal/controller/clustercontroller/accounting.go index 20fd7f52..0b0fdd15 100644 --- a/internal/controller/clustercontroller/accounting.go +++ b/internal/controller/clustercontroller/accounting.go @@ -224,50 +224,47 @@ func (r SlurmClusterReconciler) ReconcileAccounting( }, }, utils.MultiStepExecutionStep{ - Name: "Slurm Deployment", + Name: "Slurm MariaDB Grant", Func: func(stepCtx context.Context) error { stepLogger := log.FromContext(stepCtx) stepLogger.Info("Reconciling") + if !check.IsMariaDbOperatorCRDInstalled { + stepLogger.Info("MariaDB Operator CRD is not installed. Skipping MariaDB reconciliation") + return nil + } + var err error - var desired *appsv1.Deployment - var deploymentNamePtr *string = nil + var desired *mariadbv1alpha1.Grant + var mariaDbGrantNamePtr *string = nil - if !isAccountingEnabled { + if !isMariaDBEnabled || !isAccountingEnabled { stepLogger.Info("Removing") - deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting) - deploymentNamePtr = &deploymentName - if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr); err != nil { + mariaDbGrantName := naming.BuildMariaDbName(clusterValues.Name) + mariaDbGrantNamePtr = &mariaDbGrantName + if err = r.MariaDbGrant.Reconcile(stepCtx, cluster, desired, mariaDbGrantNamePtr); err != nil { stepLogger.Error(err, "Failed to reconcile") - return errors.Wrap(err, "reconciling accounting Deployment") + return errors.Wrap(err, "reconciling accounting mariadb grant") } stepLogger.Info("Reconciled") return nil } - desired, err = accounting.RenderDeployment( + + desired, err = accounting.RenderMariaDbGrant( clusterValues.Namespace, clusterValues.Name, &clusterValues.NodeAccounting, - clusterValues.NodeFilters, - clusterValues.VolumeSources, ) if err != nil { stepLogger.Error(err, "Failed to render") - return errors.Wrap(err, "rendering accounting Deployment") + return errors.Wrap(err, "rendering 
accounting mariadb grant") } stepLogger = stepLogger.WithValues(logfield.ResourceKV(desired)...) stepLogger.Info("Rendered") - deps, err := r.getAccountingDeploymentDependencies(ctx, clusterValues) - if err != nil { - stepLogger.Error(err, "Failed to retrieve dependencies") - return errors.Wrap(err, "retrieving dependencies for accounting Deployment") - } - stepLogger.Info("Retrieved dependencies") - - if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr, deps...); err != nil { + if err = r.MariaDbGrant.Reconcile(stepCtx, cluster, desired, mariaDbGrantNamePtr); err != nil { stepLogger.Error(err, "Failed to reconcile") - return errors.Wrap(err, "reconciling accounting Deployment") + return errors.Wrap(err, "reconciling accounting mariadb grant") } stepLogger.Info("Reconciled") return nil @@ -322,47 +319,50 @@ func (r SlurmClusterReconciler) ReconcileAccounting( }, }, utils.MultiStepExecutionStep{ - Name: "Slurm MariaDB Grant", + Name: "Slurm Deployment", Func: func(stepCtx context.Context) error { stepLogger := log.FromContext(stepCtx) stepLogger.Info("Reconciling") - if !check.IsMariaDbOperatorCRDInstalled { - stepLogger.Info("MariaDB Operator CRD is not installed. 
Skipping MariaDB reconciliation") - return nil - } - var err error - var desired *mariadbv1alpha1.Grant - var mariaDbGrantNamePtr *string = nil + var desired *appsv1.Deployment + var deploymentNamePtr *string = nil - if !isMariaDBEnabled || !isAccountingEnabled { + if !isAccountingEnabled { stepLogger.Info("Removing") - mariaDbGrantName := naming.BuildMariaDbName(clusterValues.Name) - mariaDbGrantNamePtr = &mariaDbGrantName - if err = r.MariaDbGrant.Reconcile(stepCtx, cluster, desired, mariaDbGrantNamePtr); err != nil { + deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting) + deploymentNamePtr = &deploymentName + if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr); err != nil { stepLogger.Error(err, "Failed to reconcile") - return errors.Wrap(err, "reconciling accounting mariadb grant") + return errors.Wrap(err, "reconciling accounting Deployment") } stepLogger.Info("Reconciled") return nil } - - desired, err = accounting.RenderMariaDbGrant( + desired, err = accounting.RenderDeployment( clusterValues.Namespace, clusterValues.Name, &clusterValues.NodeAccounting, + clusterValues.NodeFilters, + clusterValues.VolumeSources, ) if err != nil { stepLogger.Error(err, "Failed to render") - return errors.Wrap(err, "rendering accounting mariadb grant") + return errors.Wrap(err, "rendering accounting Deployment") } stepLogger = stepLogger.WithValues(logfield.ResourceKV(desired)...) 
stepLogger.Info("Rendered") - if err = r.MariaDbGrant.Reconcile(ctx, cluster, desired, mariaDbGrantNamePtr); err != nil { + deps, err := r.getAccountingDeploymentDependencies(ctx, clusterValues) + if err != nil { + stepLogger.Error(err, "Failed to retrieve dependencies") + return errors.Wrap(err, "retrieving dependencies for accounting Deployment") + } + stepLogger.Info("Retrieved dependencies") + + if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr, deps...); err != nil { stepLogger.Error(err, "Failed to reconcile") - return errors.Wrap(err, "reconciling accounting mariadb grant") + return errors.Wrap(err, "reconciling accounting Deployment") } stepLogger.Info("Reconciled") return nil @@ -437,55 +437,103 @@ func (r SlurmClusterReconciler) ValidateAccounting( cluster *slurmv1.SlurmCluster, clusterValues *values.SlurmCluster, ) (ctrl.Result, error) { - logger := log.FromContext(ctx) + existingDeployment := &appsv1.Deployment{} + existingMariaDb := &mariadbv1alpha1.MariaDB{} + existingMariaDbGrant := &mariadbv1alpha1.Grant{} - existing := &appsv1.Deployment{} - err := r.Get( - ctx, - types.NamespacedName{ - Namespace: clusterValues.Namespace, - Name: clusterValues.NodeAccounting.Deployment.Name, - }, - existing, - ) + existingDeployment, err := getTypedResource(ctx, r.Client, clusterValues.Namespace, clusterValues.NodeAccounting.Deployment.Name, existingDeployment) if err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil - } - logger.Error(err, "Failed to get accounting Deployment") - return ctrl.Result{}, errors.Wrap(err, "getting accounting Deployment") + message := "Failed to get Deployment" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second) } targetReplicas := clusterValues.NodeAccounting.Deployment.Replicas - if existing.Spec.Replicas != nil { - targetReplicas = 
*existing.Spec.Replicas + if existingDeployment == nil { + message := "Deployment not found" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second) + } + if existingDeployment.Spec.Replicas != nil { + targetReplicas = *existingDeployment.Spec.Replicas } - if existing.Status.AvailableReplicas != targetReplicas { - if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { - status.SetCondition(metav1.Condition{ - Type: slurmv1.ConditionClusterAccountingAvailable, - Status: metav1.ConditionFalse, Reason: "NotAvailable", - Message: "Slurm accounting is not available yet", - }) - }); err != nil { - return ctrl.Result{}, err + + if clusterValues.NodeAccounting.MariaDb.Enabled { + existingMariaDb, err = getTypedResource(ctx, r.Client, clusterValues.Namespace, naming.BuildMariaDbName(clusterValues.Name), existingMariaDb) + if err != nil || existingMariaDb == nil { + message := "Failed to get MariaDB" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second) } - return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil - } else { - if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { - status.SetCondition(metav1.Condition{ - Type: slurmv1.ConditionClusterAccountingAvailable, - Status: metav1.ConditionTrue, Reason: "Available", - Message: "Slurm accounting is available", - }) - }); err != nil { - return ctrl.Result{}, err + existingMariaDbGrant, err = getTypedResource(ctx, r.Client, clusterValues.Namespace, naming.BuildMariaDbName(clusterValues.Name), existingMariaDbGrant) + if err != nil || existingMariaDbGrant == nil { + message := "Failed to get MariaDB Grant" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second) } } + switch { + case clusterValues.NodeAccounting.MariaDb.Enabled && !isConditionReady(existingMariaDb.Status.Conditions, mariadbv1alpha1.ConditionTypeReady): + message := "MariaDB is not ready" + return r.updateStatus(ctx, cluster, 
slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second) + case clusterValues.NodeAccounting.MariaDb.Enabled && !isConditionReady(existingMariaDbGrant.Status.Conditions, mariadbv1alpha1.ConditionTypeReady): + message := "MariaDB Grant is not ready" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second) + case existingDeployment.Status.AvailableReplicas == 0: + message := "Slurm accounting is not available yet" + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second) + case existingDeployment.Status.AvailableReplicas != targetReplicas: + message := fmt.Sprintf("Slurm accounting available replicas: %d, but target replicas: %d", existingDeployment.Status.AvailableReplicas, targetReplicas) + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second) + } + + return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionTrue, "Available", "Slurm accounting is available", 0) +} + +func getTypedResource[T client.Object]( + ctx context.Context, + getter client.Reader, + namespace, name string, + obj T, +) (T, error) { + err := getter.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, obj) + if err != nil { + if apierrors.IsNotFound(err) { + var zeroValue T // This creates the zero value for type T + return zeroValue, nil + } + return obj, errors.Wrap(err, "failed to get resource") + } + return obj, nil + } + +func (r SlurmClusterReconciler) updateStatus( + ctx context.Context, + cluster *slurmv1.SlurmCluster, + conditionType string, + conditionStatus metav1.ConditionStatus, + reason, message string, + requeueAfter time.Duration, +) (ctrl.Result, error) { + if err := r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + 
status.SetCondition(metav1.Condition{ + Type: conditionType, + Status: conditionStatus, + Reason: reason, + Message: message, + }) + }); err != nil { + return ctrl.Result{}, err + } + if conditionStatus != metav1.ConditionTrue { + return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil + } + return ctrl.Result{}, nil } +func isConditionReady(conditions []metav1.Condition, conditionType string) bool { + for _, condition := range conditions { + if condition.Type == conditionType && condition.Status == metav1.ConditionTrue { + return true + } + } + return false +} + func (r SlurmClusterReconciler) getAccountingDeploymentDependencies( ctx context.Context, clusterValues *values.SlurmCluster, diff --git a/internal/controller/clustercontroller/reconcile.go b/internal/controller/clustercontroller/reconcile.go index ae019fa5..af4ddb5d 100644 --- a/internal/controller/clustercontroller/reconcile.go +++ b/internal/controller/clustercontroller/reconcile.go @@ -273,41 +273,111 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1 ptr.To(slurmv1.PhaseClusterNotAvailable), func() (ctrl.Result, error) { // Common - if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { - status.SetCondition(metav1.Condition{ - Type: slurmv1.ConditionClusterCommonAvailable, - Status: metav1.ConditionTrue, Reason: "Available", - Message: "Slurm common components are available", + switch { + case check.IsMaintenanceActive(clusterValues.NodeRest.Maintenance): + err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterCommonAvailable, + Status: metav1.ConditionFalse, + Reason: "Maintenance", + Message: "Slurm common components are in maintenance", + }) }) - }); err != nil { - return ctrl.Result{}, err + if err != nil { + return ctrl.Result{}, err + } + default: + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + 
status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterCommonAvailable, + Status: metav1.ConditionTrue, Reason: "Available", + Message: "Slurm common components are available", + }) + }); err != nil { + return ctrl.Result{}, err + } } // Controllers - if res, err := r.ValidateControllers(ctx, cluster, clusterValues); err != nil { - logger.Error(err, "Failed to validate Slurm controllers") - return ctrl.Result{}, errors.Wrap(err, "validating Slurm controllers") - } else if res.Requeue { - return res, nil + switch { + case check.IsMaintenanceActive(clusterValues.NodeController.Maintenance): + err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterControllersAvailable, + Status: metav1.ConditionFalse, + Reason: "Maintenance", + Message: "Slurm controllers are in maintenance", + }) + }) + if err != nil { + return ctrl.Result{}, err + } + default: + if res, err := r.ValidateControllers(ctx, cluster, clusterValues); err != nil { + logger.Error(err, "Failed to validate Slurm controllers") + return ctrl.Result{}, errors.Wrap(err, "validating Slurm controllers") + } else if res.Requeue { + return res, nil + } } // Workers - if res, err := r.ValidateWorkers(ctx, cluster, clusterValues); err != nil { - logger.Error(err, "Failed to validate Slurm workers") - return ctrl.Result{}, errors.Wrap(err, "validating Slurm workers") - } else if res.Requeue { - return res, nil + switch { + case check.IsMaintenanceActive(clusterValues.NodeWorker.Maintenance): + err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterWorkersAvailable, + Status: metav1.ConditionFalse, + Reason: "Maintenance", + Message: "Slurm workers are in maintenance", + }) + }) + if err != nil { + return ctrl.Result{}, err + } + + case clusterValues.NodeWorker.Size > 0: + if res, err := r.ValidateWorkers(ctx, cluster, 
clusterValues); err != nil { + logger.Error(err, "Failed to validate Slurm workers") + return ctrl.Result{}, errors.Wrap(err, "validating Slurm workers") + } else if res.Requeue { + return res, nil + } + default: + if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterWorkersAvailable, + Status: metav1.ConditionFalse, Reason: "NotAvailable", + Message: "Slurm workers are disabled", + }) + }); err != nil { + return ctrl.Result{}, err + } } // Login - if clusterValues.NodeLogin.Size > 0 { + switch { + case check.IsMaintenanceActive(clusterValues.NodeLogin.Maintenance): + err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterLoginAvailable, + Status: metav1.ConditionFalse, + Reason: "Maintenance", + Message: "Slurm login is in maintenance", + }) + }) + if err != nil { + return ctrl.Result{}, err + } + + case clusterValues.NodeLogin.Size > 0: if res, err := r.ValidateLogin(ctx, cluster, clusterValues); err != nil { logger.Error(err, "Failed to validate Slurm login") return ctrl.Result{}, errors.Wrap(err, "validating Slurm login") } else if res.Requeue { return res, nil } - } else { + default: if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { status.SetCondition(metav1.Condition{ Type: slurmv1.ConditionClusterLoginAvailable, @@ -320,23 +390,42 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1 } // Accounting - if clusterValues.NodeAccounting.Enabled { - if res, err := r.ValidateAccounting(ctx, cluster, clusterValues); err != nil { - logger.Error(err, "Failed to validate Slurm accounting") - return ctrl.Result{}, errors.Wrap(err, "validating Slurm accounting") - } else if res.Requeue { - return res, nil + switch { + case check.IsMaintenanceActive(clusterValues.NodeAccounting.Maintenance): + err = r.patchStatus(ctx, 
cluster, func(status *slurmv1.SlurmClusterStatus) { + status.SetCondition(metav1.Condition{ + Type: slurmv1.ConditionClusterAccountingAvailable, + Status: metav1.ConditionFalse, + Reason: "Maintenance", + Message: "Slurm accounting is in maintenance", + }) + }) + if err != nil { + return ctrl.Result{}, err } - } else { - if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { + + case !clusterValues.NodeAccounting.Enabled: + err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) { status.SetCondition(metav1.Condition{ - Type: slurmv1.ConditionClusterAccountingAvailable, - Status: metav1.ConditionFalse, Reason: "NotAvailable", + Type: slurmv1.ConditionClusterAccountingAvailable, + Status: metav1.ConditionFalse, + Reason: "NotAvailable", Message: "Slurm accounting is disabled", }) - }); err != nil { + }) + if err != nil { return ctrl.Result{}, err } + + default: + res, err := r.ValidateAccounting(ctx, cluster, clusterValues) + if err != nil { + logger.Error(err, "Failed to validate Slurm accounting") + return ctrl.Result{}, errors.Wrap(err, "validating Slurm accounting") + } + if res.Requeue { + return res, nil + } } return ctrl.Result{}, nil diff --git a/internal/controller/reconciler/deployment.go b/internal/controller/reconciler/deployment.go index 95becc62..b3f0ebef 100644 --- a/internal/controller/reconciler/deployment.go +++ b/internal/controller/reconciler/deployment.go @@ -47,7 +47,7 @@ func (r *DeploymentReconciler) Reconcile( log.FromContext(ctx).Info(fmt.Sprintf( "Deleting Deployment %s-collector, because of Deployment is not needed", cluster.Name, )) - return r.deleteDeploymentIfOwnedByController(ctx, cluster, cluster.Namespace, *name) + return r.deleteIfOwnedByController(ctx, cluster, cluster.Namespace, *name) } if err := r.reconcile(ctx, cluster, desired, r.patch, deps...); err != nil { log.FromContext(ctx). 
@@ -58,13 +58,18 @@ func (r *DeploymentReconciler) Reconcile( return nil } -func (r *DeploymentReconciler) deleteDeploymentIfOwnedByController( +func (r *DeploymentReconciler) deleteIfOwnedByController( ctx context.Context, cluster *slurmv1.SlurmCluster, namespace, name string, ) error { deployment, err := r.getDeployment(ctx, namespace, name) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("Deployment not found, skipping deletion") + return nil + } + if err != nil { return errors.Wrap(err, "getting Deployment") } diff --git a/internal/controller/reconciler/grant.go b/internal/controller/reconciler/grant.go index eddb4a2c..09bb77d5 100644 --- a/internal/controller/reconciler/grant.go +++ b/internal/controller/reconciler/grant.go @@ -7,6 +7,7 @@ import ( mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" slurmv1 "nebius.ai/slurm-operator/api/v1" @@ -60,6 +61,11 @@ func (r *MariaDbGrantReconciler) deleteIfOwnedByController( name string, ) error { grant, err := r.getMariaDbGrant(ctx, namespace, name) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("MariaDbGrant is not found, skipping deletion") + return nil + } + if err != nil { return errors.Wrap(err, "getting MariaDbGrant") } diff --git a/internal/controller/reconciler/k8s_role.go b/internal/controller/reconciler/k8s_role.go index 15e00ae1..49b5034d 100644 --- a/internal/controller/reconciler/k8s_role.go +++ b/internal/controller/reconciler/k8s_role.go @@ -39,7 +39,7 @@ func (r *RoleReconciler) Reconcile( if desired == nil { // If desired is nil, delete the Role log.FromContext(ctx).Info(fmt.Sprintf("Deleting Role %s, because of Role is not needed", naming.BuildRoleWorkerName(cluster.Name))) - return r.deleteRoleIfOwnedByController(ctx, cluster) + return r.deleteIfOwnedByController(ctx, cluster) } if err := r.reconcile(ctx, cluster, desired, r.patch, deps...); err != nil { 
log.FromContext(ctx). @@ -50,11 +50,16 @@ func (r *RoleReconciler) Reconcile( return nil } -func (r *RoleReconciler) deleteRoleIfOwnedByController( +func (r *RoleReconciler) deleteIfOwnedByController( ctx context.Context, cluster *slurmv1.SlurmCluster, ) error { role, err := r.getRole(ctx, cluster) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("Role not found, skipping deletion") + return nil + } + if err != nil { return errors.Wrap(err, "getting Worker Role") } diff --git a/internal/controller/reconciler/k8s_rolebinging.go b/internal/controller/reconciler/k8s_rolebinging.go index e94ae4ca..477ff42e 100644 --- a/internal/controller/reconciler/k8s_rolebinging.go +++ b/internal/controller/reconciler/k8s_rolebinging.go @@ -39,7 +39,7 @@ func (r *RoleBindingReconciler) Reconcile( if desired == nil { // If desired is nil, delete the Role Binding log.FromContext(ctx).Info(fmt.Sprintf("Deleting RoleBinding %s, because of RoleBinding is not needed", naming.BuildRoleBindingWorkerName(cluster.Name))) - return r.deleteRoleBindingIfOwnedByController(ctx, cluster) + return r.deleteIfOwnedByController(ctx, cluster) } if err := r.reconcile(ctx, cluster, desired, r.patch, deps...); err != nil { log.FromContext(ctx). 
@@ -50,11 +50,15 @@ func (r *RoleBindingReconciler) Reconcile( return nil } -func (r *RoleBindingReconciler) deleteRoleBindingIfOwnedByController( +func (r *RoleBindingReconciler) deleteIfOwnedByController( ctx context.Context, cluster *slurmv1.SlurmCluster, ) error { roleBinding, err := r.getRoleBinding(ctx, cluster) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("RoleBinding is not found, skipping deletion") + return nil + } if err != nil { return errors.Wrap(err, "getting Worker RoleBinding") } diff --git a/internal/controller/reconciler/k8s_service.go b/internal/controller/reconciler/k8s_service.go index c388f512..ec938286 100644 --- a/internal/controller/reconciler/k8s_service.go +++ b/internal/controller/reconciler/k8s_service.go @@ -6,6 +6,7 @@ import ( "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -60,6 +61,10 @@ func (r *ServiceReconciler) deleteIfOwnedByController( name string, ) error { service, err := r.getService(ctx, namespace, name) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("Service not found, skipping deletion") + return nil + } if err != nil { return errors.Wrap(err, "getting Service") } diff --git a/internal/controller/reconciler/mariadb.go b/internal/controller/reconciler/mariadb.go index c518dc05..f9ec35f8 100644 --- a/internal/controller/reconciler/mariadb.go +++ b/internal/controller/reconciler/mariadb.go @@ -7,6 +7,7 @@ import ( mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" slurmv1 "nebius.ai/slurm-operator/api/v1" @@ -61,6 +62,10 @@ func (r *MariaDbReconciler) deleteIfOwnedByController( name string, ) error { mariaDb, err := r.getMariaDb(ctx, namespace, name) + if apierrors.IsNotFound(err) { + 
log.FromContext(ctx).Info("MariaDb is not found, skipping deletion") + return nil + } if err != nil { return errors.Wrap(err, "getting MariaDb") } diff --git a/internal/controller/reconciler/otel.go b/internal/controller/reconciler/otel.go index 8eddb4f4..9d0dbccb 100644 --- a/internal/controller/reconciler/otel.go +++ b/internal/controller/reconciler/otel.go @@ -45,7 +45,7 @@ func (r *OtelReconciler) Reconcile( // We should use conditions instead. if condition is met and resource exists, delete it // MSP-2715 - task to improve resource deletion log.FromContext(ctx).Info(fmt.Sprintf("Deleting OpenTelemetryCollector %s-collector, because of OpenTelemetryCollector is not needed", cluster.Name)) - return r.deleteOtelIfOwnedByController(ctx, cluster) + return r.deleteIfOwnedByController(ctx, cluster) } if err := r.reconcile(ctx, cluster, desired, r.patch, deps...); err != nil { log.FromContext(ctx). @@ -56,18 +56,23 @@ func (r *OtelReconciler) Reconcile( return nil } -func (r *OtelReconciler) deleteOtelIfOwnedByController( +func (r *OtelReconciler) deleteIfOwnedByController( ctx context.Context, cluster *slurmv1.SlurmCluster, ) error { otel, err := r.getOtel(ctx, cluster) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("OpenTelemetryCollector not found, skipping deletion") + return nil + } + if err != nil { return errors.Wrap(err, "getting OpenTelemetryCollector") } - // Check if the controller is the owner of the OpenTelemetryCollector + isOwner := isControllerOwnerOtel(otel, cluster) if !isOwner { - // The controller is not the owner of the OpenTelemetryCollector, nothing to do + log.FromContext(ctx).Info("OpenTelemetryCollector is not owned by the controller, skipping deletion") return nil } // The controller is the owner of the OpenTelemetryCollector, delete it diff --git a/internal/controller/reconciler/pod_monitor.go b/internal/controller/reconciler/pod_monitor.go index 18cf4ec8..1deec9e4 100644 --- a/internal/controller/reconciler/pod_monitor.go +++ 
b/internal/controller/reconciler/pod_monitor.go @@ -46,7 +46,7 @@ func (r *PodMonitorReconciler) Reconcile( log.FromContext(ctx).Info(fmt.Sprintf( "Deleting PodMonitor %s-collector, because of PodMonitor is not needed", cluster.Name, )) - return r.deletePodMonitorIfOwnedByController(ctx, cluster) + return r.deleteIfOwnedByController(ctx, cluster) } if err := r.reconcile(ctx, cluster, desired, r.patch, deps...); err != nil { log.FromContext(ctx). @@ -57,22 +57,28 @@ func (r *PodMonitorReconciler) Reconcile( return nil } -func (r *PodMonitorReconciler) deletePodMonitorIfOwnedByController( +func (r *PodMonitorReconciler) deleteIfOwnedByController( ctx context.Context, cluster *slurmv1.SlurmCluster, ) error { podMonitor, err := r.getPodMonitor(ctx, cluster) + if apierrors.IsNotFound(err) { + log.FromContext(ctx).Info("PodMonitor not found, skipping deletion") + return nil + } if err != nil { return errors.Wrap(err, "getting PodMonitor") } - // Check if the controller is the owner of the PodMonitor - isOwner := isControllerOwnerPodMonitor(podMonitor, cluster) - if !isOwner { - // The controller is not the owner of the PodMonitor, nothing to do + + if !metav1.IsControlledBy(podMonitor, cluster) { + log.FromContext(ctx).Info("PodMonitor is not owned by controller, skipping deletion") return nil } - // The controller is the owner of the PodMonitor, delete it - return r.deletePodMonitorOwnedByController(ctx, cluster, podMonitor) + + if err := r.Delete(ctx, podMonitor); err != nil { + return errors.Wrap(err, "deleting PodMonitor") + } + return nil } func (r *PodMonitorReconciler) getPodMonitor(ctx context.Context, cluster *slurmv1.SlurmCluster) (*prometheusv1.PodMonitor, error) { @@ -96,36 +102,6 @@ func (r *PodMonitorReconciler) getPodMonitor(ctx context.Context, cluster *slurm return podMonitor, nil } -// Function to check if the controller is the owner -func isControllerOwnerPodMonitor(podMonitor *prometheusv1.PodMonitor, cluster *slurmv1.SlurmCluster) bool { - // 
Check if the controller is the owner of the Role - isOwner := false - for _, ownerRef := range podMonitor.GetOwnerReferences() { - if ownerRef.Kind == slurmv1.SlurmClusterKind && ownerRef.Name == cluster.Name { - isOwner = true - break - } - } - - return isOwner -} - -func (r *PodMonitorReconciler) deletePodMonitorOwnedByController( - ctx context.Context, - cluster *slurmv1.SlurmCluster, - podMonitor *prometheusv1.PodMonitor, -) error { - // Delete the Role - err := r.Client.Delete(ctx, podMonitor) - if err != nil { - log.FromContext(ctx). - WithValues("cluster", cluster.Name). - Error(err, "Failed to delete PodMonitor") - return errors.Wrap(err, "deleting PodMonitor") - } - return nil -} - func (r *PodMonitorReconciler) patch(existing, desired client.Object) (client.Patch, error) { patchImpl := func(dst, src *prometheusv1.PodMonitor) client.Patch { res := client.MergeFrom(dst.DeepCopy()) diff --git a/internal/controller/reconciler/pod_monitor_test.go b/internal/controller/reconciler/pod_monitor_test.go index 2df5e2e5..980e091f 100644 --- a/internal/controller/reconciler/pod_monitor_test.go +++ b/internal/controller/reconciler/pod_monitor_test.go @@ -5,10 +5,8 @@ import ( "testing" "github.com/stretchr/testify/assert" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" slurmv1 "nebius.ai/slurm-operator/api/v1" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -16,50 +14,6 @@ import ( prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" ) -func Test_IsControllerOwnerPodMonitor(t *testing.T) { - defaultNameCluster := "test-cluster" - - cluster := &slurmv1.SlurmCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultNameCluster, - }, - } - - t.Run("controller is owner", func(t *testing.T) { - podMonitor := &prometheusv1.PodMonitor{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{ - { - Kind: 
slurmv1.SlurmClusterKind, - Name: defaultNameCluster, - }, - }, - }, - } - - isOwner := isControllerOwnerPodMonitor(podMonitor, cluster) - - assert.True(t, isOwner) - }) - - t.Run("controller is not owner", func(t *testing.T) { - podMonitor := &prometheusv1.PodMonitor{ - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: []metav1.OwnerReference{ - { - Kind: "OtherKind", - Name: "other-name", - }, - }, - }, - } - - isOwner := isControllerOwnerPodMonitor(podMonitor, cluster) - - assert.False(t, isOwner) - }) -} - func Test_GetPodMonitor(t *testing.T) { defaultNamespace := "test-namespace" defaultNameCluster := "test-cluster" @@ -167,89 +121,3 @@ func Test_GetPodMonitor(t *testing.T) { }) } } - -func Test_DeletePodMonitorOwnedByController(t *testing.T) { - defaultNamespace := "test-namespace" - defaultNameCluster := "test-cluster" - - scheme := runtime.NewScheme() - _ = slurmv1.AddToScheme(scheme) - _ = prometheusv1.AddToScheme(scheme) - - tests := []struct { - name string - cluster *slurmv1.SlurmCluster - podMonitor *prometheusv1.PodMonitor - expectErr bool - }{ - { - name: "PodMonitor deleted successfully", - cluster: &slurmv1.SlurmCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultNameCluster, - Namespace: defaultNamespace, - }, - }, - podMonitor: &prometheusv1.PodMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultNameCluster, - Namespace: defaultNamespace, - }, - }, - expectErr: false, - }, - { - name: "Error deleting PodMonitor", - cluster: &slurmv1.SlurmCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultNameCluster, - Namespace: defaultNamespace, - }, - }, - podMonitor: &prometheusv1.PodMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: defaultNameCluster, - Namespace: defaultNamespace, - }, - }, - expectErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Set up the fake client - objs := []runtime.Object{tt.podMonitor} - fakeClient := 
fake.NewClientBuilder().WithScheme(scheme).WithRuntimeObjects(objs...).Build() - - r := &PodMonitorReconciler{ - Reconciler: &Reconciler{ - Client: fakeClient, - Scheme: scheme, - }, - } - - if tt.expectErr { - // Override the client with our fake client to simulate the error on delete - r.Client = &fakeErrorClient{Client: fakeClient} - } - - // Run the test - ctx := context.TODO() - err := r.deletePodMonitorOwnedByController(ctx, tt.cluster, tt.podMonitor) - - if tt.expectErr { - assert.Error(t, err) - } else { - assert.NoError(t, err) - - // Verify the pod monitor was deleted - err = fakeClient.Get(ctx, types.NamespacedName{ - Namespace: tt.podMonitor.Namespace, - Name: tt.podMonitor.Name, - }, &prometheusv1.PodMonitor{}) - assert.True(t, apierrors.IsNotFound(err)) - } - }) - } -}