Skip to content

Commit

Permalink
fix: failover if master pod is not ready
Browse files Browse the repository at this point in the history
Signed-off-by: Abhradeep Chakraborty <abhradeep@dragonflydb.io>
  • Loading branch information
Abhra303 committed Aug 22, 2024
1 parent 00c8ba3 commit e202bfe
Showing 1 changed file with 7 additions and 25 deletions.
32 changes: 7 additions & 25 deletions internal/controller/dragonfly_pod_lifecycle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,23 +75,16 @@ func (r *DfPodLifeCycleReconciler) Reconcile(ctx context.Context, req ctrl.Reque
// Get the role of the pod
role, roleExists := pod.Labels[resources.Role]
if !isPodReady {
restartCount := getRestartCount(pod)
if roleExists && role == "master" {
// If the master Pod is not ready and has restarted at least once, initiate failover
if restartCount > 0 {
log.Info("Master pod is not starting after multiple attempts, initiating failover", "pod", req.NamespacedName, "restarts", restartCount)
err := dfi.configureReplication(ctx)
if err != nil {
log.Error(err, "Failed to initiate failover")
return ctrl.Result{RequeueAfter: 5 * time.Second}, err
}
return ctrl.Result{}, nil
} else {
log.Info("Master pod is not ready yet, will requeue", "pod", req.NamespacedName, "restarts", restartCount)
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
log.Info("Master pod is not ready, initiating failover", "pod", req.NamespacedName)
err := dfi.configureReplication(ctx)
if err != nil {
log.Error(err, "Failed to initiate failover")
return ctrl.Result{RequeueAfter: 5 * time.Second}, err
}
return ctrl.Result{}, nil
} else {
log.Info("Pod is not ready yet", "pod", req.NamespacedName, "restarts", restartCount)
log.Info("Pod is not ready yet", "pod", req.NamespacedName)
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
}
Expand Down Expand Up @@ -196,17 +189,6 @@ func (r *DfPodLifeCycleReconciler) Reconcile(ctx context.Context, req ctrl.Reque
return ctrl.Result{}, nil
}

// getRestartCount reports how many times the pod's "dragonfly" container has
// restarted, summing RestartCount over every container status entry in
// pod.Status.ContainerStatuses whose name is "dragonfly". It returns 0 when
// no such entry exists.
func getRestartCount(pod corev1.Pod) int32 {
	var total int32
	for i := range pod.Status.ContainerStatuses {
		status := &pod.Status.ContainerStatuses[i]
		// Only the dragonfly container contributes to the count.
		if status.Name != "dragonfly" {
			continue
		}
		total += status.RestartCount
	}
	return total
}

// SetupWithManager sets up the controller with the Manager.
func (r *DfPodLifeCycleReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
Expand Down

0 comments on commit e202bfe

Please sign in to comment.