From 69a82d19999debed155e892b01fb68594f372e5c Mon Sep 17 00:00:00 2001 From: Ihor Hrytskiv Date: Mon, 24 Nov 2025 13:56:01 +0200 Subject: [PATCH 1/2] fix: rolling update process Signed-off-by: Ihor Hrytskiv --- internal/controller/dragonfly_controller.go | 9 +++++++-- internal/controller/dragonfly_instance.go | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/internal/controller/dragonfly_controller.go b/internal/controller/dragonfly_controller.go index b6e66f32..626454b1 100644 --- a/internal/controller/dragonfly_controller.go +++ b/internal/controller/dragonfly_controller.go @@ -86,7 +86,7 @@ func (r *DragonflyReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, fmt.Errorf("failed to get statefulset: %w", err) } - if result, err := dfi.allPodsHealthy(ctx, statefulSet.Status.UpdateRevision); !result.IsZero() || err != nil { + if result, err := dfi.allPodsHealthyAndHaveRole(ctx, statefulSet.Status.UpdateRevision); !result.IsZero() || err != nil { return result, err } @@ -95,6 +95,11 @@ func (r *DragonflyReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, fmt.Errorf("failed to get replicas: %w", err) } + if len(replicas.Items) != int(dfi.df.Spec.Replicas)-1 { + dfi.log.Info("waiting for all replicas to be configured", "expected", *statefulSet.Spec.Replicas-1, "current", len(replicas.Items)) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + // We want to update the replicas first then the master // We want to have at most one updated replica in full sync phase at a time // if not, requeue @@ -104,7 +109,7 @@ func (r *DragonflyReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // if we are here it means that all latest replicas are in stable sync // delete older version replicas - if result, err := dfi.updatedReplicas(ctx, replicas, statefulSet.Status.UpdateRevision); !result.IsZero() || err != nil { + if result, err := dfi.updateReplicas(ctx, 
replicas, statefulSet.Status.UpdateRevision); !result.IsZero() || err != nil { return result, err } diff --git a/internal/controller/dragonfly_instance.go b/internal/controller/dragonfly_instance.go index 21d0547c..de90d2ee 100644 --- a/internal/controller/dragonfly_instance.go +++ b/internal/controller/dragonfly_instance.go @@ -710,8 +710,8 @@ func (dfi *DragonflyInstance) deleteRoleLabel(ctx context.Context, pod *corev1.P return nil } -// allPodsHealthy checks whether all pods are healthy, and deletes pods that are outdated and failed to start -func (dfi *DragonflyInstance) allPodsHealthy(ctx context.Context, updateRevision string) (ctrl.Result, error) { +// allPodsHealthyAndHaveRole checks whether all pods are healthy and have been assigned a role, and deletes pods that are outdated and failed to start +func (dfi *DragonflyInstance) allPodsHealthyAndHaveRole(ctx context.Context, updateRevision string) (ctrl.Result, error) { pods, err := dfi.getPods(ctx) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to get dragonfly pods: %w", err) @@ -735,6 +735,11 @@ func (dfi *DragonflyInstance) allPodsHealthy(ctx context.Context, updateRevision dfi.log.Info("waiting for pod to finish startup", "pod", pod.Name) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } + + if !roleExists(&pod) { + dfi.log.Info("waiting for pod to be assigned a role", "pod", pod.Name) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } } return ctrl.Result{}, nil @@ -763,10 +768,14 @@ func (dfi *DragonflyInstance) verifyUpdatedReplicas(ctx context.Context, replica return ctrl.Result{}, nil } -// updatedReplicas updates the replicas to the latest version -func (dfi *DragonflyInstance) updatedReplicas(ctx context.Context, replicas *corev1.PodList, updateRevision string) (ctrl.Result, error) { +// updateReplicas updates the replicas to the latest version +func (dfi *DragonflyInstance) updateReplicas(ctx context.Context, replicas *corev1.PodList, updateRevision string) (ctrl.Result, error) { for _, replica := 
range replicas.Items { if !isPodOnLatestVersion(&replica, updateRevision) { + _, err := dfi.getMaster(ctx) + if err != nil { + return ctrl.Result{}, fmt.Errorf("skipping deleting replica: failed to get master: %w", err) + } dfi.log.Info("deleting replica", "pod", replica.Name) dfi.eventRecorder.Event(dfi.df, corev1.EventTypeNormal, "Rollout", "Deleting replica") if err := dfi.client.Delete(ctx, &replica); err != nil { From 962ec373c9221f1d429906ce57d8944eebc35955 Mon Sep 17 00:00:00 2001 From: Ihor Hrytskiv Date: Wed, 17 Dec 2025 16:23:34 +0200 Subject: [PATCH 2/2] fix: issues --- internal/controller/dragonfly_controller.go | 2 +- internal/controller/dragonfly_instance.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/controller/dragonfly_controller.go b/internal/controller/dragonfly_controller.go index 626454b1..e6cc26fe 100644 --- a/internal/controller/dragonfly_controller.go +++ b/internal/controller/dragonfly_controller.go @@ -96,7 +96,7 @@ func (r *DragonflyReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } if len(replicas.Items) != int(dfi.df.Spec.Replicas)-1 { - dfi.log.Info("waiting for all replicas to be configured", "expected", *statefulSet.Spec.Replicas-1, "current", len(replicas.Items)) + dfi.log.Info("waiting for all replicas to be configured", "expected", int(dfi.df.Spec.Replicas)-1, "current", len(replicas.Items)) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } diff --git a/internal/controller/dragonfly_instance.go b/internal/controller/dragonfly_instance.go index de90d2ee..fb6a256f 100644 --- a/internal/controller/dragonfly_instance.go +++ b/internal/controller/dragonfly_instance.go @@ -770,12 +770,12 @@ func (dfi *DragonflyInstance) verifyUpdatedReplicas(ctx context.Context, replica // updateReplicas updates the replicas to the latest version func (dfi *DragonflyInstance) updateReplicas(ctx context.Context, replicas *corev1.PodList, updateRevision string) (ctrl.Result, error) { + _, err := 
dfi.getMaster(ctx) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get master before deleting replica: %w", err) + } for _, replica := range replicas.Items { if !isPodOnLatestVersion(&replica, updateRevision) { - _, err := dfi.getMaster(ctx) - if err != nil { - return ctrl.Result{}, fmt.Errorf("skipping deleting replica: failed to get master: %w", err) - } dfi.log.Info("deleting replica", "pod", replica.Name) dfi.eventRecorder.Event(dfi.df, corev1.EventTypeNormal, "Rollout", "Deleting replica") if err := dfi.client.Delete(ctx, &replica); err != nil {