Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MSP-3918: Fix bug reconciliation logic for scenarios with maintenance=true and accounting=false #309

Merged
merged 1 commit into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 123 additions & 75 deletions internal/controller/clustercontroller/accounting.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,50 +224,47 @@ func (r SlurmClusterReconciler) ReconcileAccounting(
},
},
utils.MultiStepExecutionStep{
Name: "Slurm Deployment",
Name: "Slurm MariaDB Grant",
Func: func(stepCtx context.Context) error {
stepLogger := log.FromContext(stepCtx)
stepLogger.Info("Reconciling")

if !check.IsMariaDbOperatorCRDInstalled {
stepLogger.Info("MariaDB Operator CRD is not installed. Skipping MariaDB reconciliation")
return nil
}

var err error
var desired *appsv1.Deployment
var deploymentNamePtr *string = nil
var desired *mariadbv1alpha1.Grant
var mariaDbGrantNamePtr *string = nil

if !isAccountingEnabled {
if !isMariaDBEnabled || !isAccountingEnabled {
stepLogger.Info("Removing")
deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting)
deploymentNamePtr = &deploymentName
if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr); err != nil {
mariaDbGrantName := naming.BuildMariaDbName(clusterValues.Name)
mariaDbGrantNamePtr = &mariaDbGrantName
if err = r.MariaDbGrant.Reconcile(stepCtx, cluster, desired, mariaDbGrantNamePtr); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling accounting Deployment")
return errors.Wrap(err, "reconciling accounting mariadb grant")
}
stepLogger.Info("Reconciled")
return nil
}
desired, err = accounting.RenderDeployment(

desired, err = accounting.RenderMariaDbGrant(
clusterValues.Namespace,
clusterValues.Name,
&clusterValues.NodeAccounting,
clusterValues.NodeFilters,
clusterValues.VolumeSources,
)
if err != nil {
stepLogger.Error(err, "Failed to render")
return errors.Wrap(err, "rendering accounting Deployment")
return errors.Wrap(err, "rendering accounting mariadb grant")
}
stepLogger = stepLogger.WithValues(logfield.ResourceKV(desired)...)
stepLogger.Info("Rendered")

deps, err := r.getAccountingDeploymentDependencies(ctx, clusterValues)
if err != nil {
stepLogger.Error(err, "Failed to retrieve dependencies")
return errors.Wrap(err, "retrieving dependencies for accounting Deployment")
}
stepLogger.Info("Retrieved dependencies")

if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr, deps...); err != nil {
if err = r.MariaDbGrant.Reconcile(ctx, cluster, desired, mariaDbGrantNamePtr); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling accounting Deployment")
return errors.Wrap(err, "reconciling accounting mariadb grant")
}
stepLogger.Info("Reconciled")
return nil
Expand Down Expand Up @@ -322,47 +319,50 @@ func (r SlurmClusterReconciler) ReconcileAccounting(
},
},
utils.MultiStepExecutionStep{
Name: "Slurm MariaDB Grant",
Name: "Slurm Deployment",
Func: func(stepCtx context.Context) error {
stepLogger := log.FromContext(stepCtx)
stepLogger.Info("Reconciling")

if !check.IsMariaDbOperatorCRDInstalled {
stepLogger.Info("MariaDB Operator CRD is not installed. Skipping MariaDB reconciliation")
return nil
}

var err error
var desired *mariadbv1alpha1.Grant
var mariaDbGrantNamePtr *string = nil
var desired *appsv1.Deployment
var deploymentNamePtr *string = nil

if !isMariaDBEnabled || !isAccountingEnabled {
if !isAccountingEnabled {
stepLogger.Info("Removing")
mariaDbGrantName := naming.BuildMariaDbName(clusterValues.Name)
mariaDbGrantNamePtr = &mariaDbGrantName
if err = r.MariaDbGrant.Reconcile(stepCtx, cluster, desired, mariaDbGrantNamePtr); err != nil {
deploymentName := naming.BuildDeploymentName(consts.ComponentTypeAccounting)
deploymentNamePtr = &deploymentName
if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling accounting mariadb grant")
return errors.Wrap(err, "reconciling accounting Deployment")
}
stepLogger.Info("Reconciled")
return nil
}

desired, err = accounting.RenderMariaDbGrant(
desired, err = accounting.RenderDeployment(
clusterValues.Namespace,
clusterValues.Name,
&clusterValues.NodeAccounting,
clusterValues.NodeFilters,
clusterValues.VolumeSources,
)
if err != nil {
stepLogger.Error(err, "Failed to render")
return errors.Wrap(err, "rendering accounting mariadb grant")
return errors.Wrap(err, "rendering accounting Deployment")
}
stepLogger = stepLogger.WithValues(logfield.ResourceKV(desired)...)
stepLogger.Info("Rendered")

if err = r.MariaDbGrant.Reconcile(ctx, cluster, desired, mariaDbGrantNamePtr); err != nil {
deps, err := r.getAccountingDeploymentDependencies(ctx, clusterValues)
if err != nil {
stepLogger.Error(err, "Failed to retrieve dependencies")
return errors.Wrap(err, "retrieving dependencies for accounting Deployment")
}
stepLogger.Info("Retrieved dependencies")

if err = r.Deployment.Reconcile(stepCtx, cluster, desired, deploymentNamePtr, deps...); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling accounting mariadb grant")
return errors.Wrap(err, "reconciling accounting Deployment")
}
stepLogger.Info("Reconciled")
return nil
Expand Down Expand Up @@ -437,55 +437,103 @@ func (r SlurmClusterReconciler) ValidateAccounting(
cluster *slurmv1.SlurmCluster,
clusterValues *values.SlurmCluster,
) (ctrl.Result, error) {
logger := log.FromContext(ctx)
existingDeployment := &appsv1.Deployment{}
existingMariaDb := &mariadbv1alpha1.MariaDB{}
existingMariaDbGrant := &mariadbv1alpha1.Grant{}

existing := &appsv1.Deployment{}
err := r.Get(
ctx,
types.NamespacedName{
Namespace: clusterValues.Namespace,
Name: clusterValues.NodeAccounting.Deployment.Name,
},
existing,
)
existingDeployment, err := getTypedResource(ctx, r.Client, clusterValues.Namespace, clusterValues.NodeAccounting.Deployment.Name, existingDeployment)
if err != nil {
if apierrors.IsNotFound(err) {
return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil
}
logger.Error(err, "Failed to get accounting Deployment")
return ctrl.Result{}, errors.Wrap(err, "getting accounting Deployment")
message := "Failed to get Deployment"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second)
}

targetReplicas := clusterValues.NodeAccounting.Deployment.Replicas
if existing.Spec.Replicas != nil {
targetReplicas = *existing.Spec.Replicas
if existingDeployment.Spec.Replicas != nil {
targetReplicas = *existingDeployment.Spec.Replicas
}
if existing.Status.AvailableReplicas != targetReplicas {
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterAccountingAvailable,
Status: metav1.ConditionFalse, Reason: "NotAvailable",
Message: "Slurm accounting is not available yet",
})
}); err != nil {
return ctrl.Result{}, err

if clusterValues.NodeAccounting.MariaDb.Enabled {
existingMariaDb, mariadbErr := getTypedResource(ctx, r.Client, clusterValues.Namespace, naming.BuildMariaDbName(clusterValues.Name), existingMariaDb)
if mariadbErr != nil || existingMariaDb == nil {
message := "Failed to get MariaDB"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second)
}
return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 10}, nil
} else {
if err = r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: slurmv1.ConditionClusterAccountingAvailable,
Status: metav1.ConditionTrue, Reason: "Available",
Message: "Slurm accounting is available",
})
}); err != nil {
return ctrl.Result{}, err
existingMariaDbGrant, mariadbGrantErr := getTypedResource(ctx, r.Client, clusterValues.Namespace, naming.BuildMariaDbName(clusterValues.Name), existingMariaDbGrant)
if mariadbGrantErr != nil || existingMariaDbGrant == nil {
message := "Failed to get MariaDB Grant"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionUnknown, "Unknown", message, 10*time.Second)
}
}

switch {
case !isConditionReady(existingMariaDb.Status.Conditions, mariadbv1alpha1.ConditionTypeReady):
message := "MariaDB is not ready"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second)
case !isConditionReady(existingMariaDbGrant.Status.Conditions, mariadbv1alpha1.ConditionTypeReady):
message := "MariaDB Grant is not ready"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second)
case existingDeployment.Status.AvailableReplicas == 0:
message := "Slurm accounting is not available yet"
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second)
case existingDeployment.Status.AvailableReplicas != targetReplicas:
message := fmt.Sprintf("Slurm accounting available replicas: %d, but target replicas: %d", existingDeployment.Status.AvailableReplicas, targetReplicas)
return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionFalse, "NotAvailable", message, 10*time.Second)
}

return r.updateStatus(ctx, cluster, slurmv1.ConditionClusterAccountingAvailable, metav1.ConditionTrue, "Available", "Slurm accounting is available", 0)
}

func getTypedResource[T client.Object](
ctx context.Context,
getter client.Reader,
namespace, name string,
obj T,
) (T, error) {
err := getter.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, obj)
if err != nil {
if apierrors.IsNotFound(err) {
var zeroValue T // This creates the zero value for type T
return zeroValue, nil
}
return obj, errors.Wrap(err, "failed to get resource")
}
return obj, nil
}

func (r SlurmClusterReconciler) updateStatus(
ctx context.Context,
cluster *slurmv1.SlurmCluster,
conditionType string,
conditionStatus metav1.ConditionStatus,
reason, message string,
requeueAfter time.Duration,
) (ctrl.Result, error) {
if err := r.patchStatus(ctx, cluster, func(status *slurmv1.SlurmClusterStatus) {
status.SetCondition(metav1.Condition{
Type: conditionType,
Status: conditionStatus,
Reason: reason,
Message: message,
})
}); err != nil {
return ctrl.Result{}, err
}
if conditionStatus != metav1.ConditionTrue {
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
}

return ctrl.Result{}, nil
}

func isConditionReady(conditions []metav1.Condition, conditionType string) bool {
for _, condition := range conditions {
if condition.Type == conditionType && condition.Status == metav1.ConditionTrue {
return true
}
}
return false
}

func (r SlurmClusterReconciler) getAccountingDeploymentDependencies(
ctx context.Context,
clusterValues *values.SlurmCluster,
Expand Down
Loading
Loading