From a684fe8f2dfb47cc2d2cfc11b49cc714ca73083f Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Sun, 1 Dec 2024 20:07:25 +0200 Subject: [PATCH] Delete secondary VRG before primary Since we added a VRG on the secondary cluster we have a random failure when deleting the DRPC after relocate. When this happens, we find the PVC in terminating state on the secondary cluster, and the VR and VRG are never deleted. This change attempts to avoid this issue by deleting the secondary VRG first, and deleting the primary VRG only after the secondary VRG was deleted. During DRPC delete we expect to see these errors one or more times: Secondary VRG manifestwork deletion in progress ... Primary VRG manifestwork deletion in progress ... Fixes: #1659 Signed-off-by: Nir Soffer --- .../drplacementcontrol_controller.go | 46 ++++++++++++++++--- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/internal/controller/drplacementcontrol_controller.go b/internal/controller/drplacementcontrol_controller.go index 099a52b83e..753bfd8a87 100644 --- a/internal/controller/drplacementcontrol_controller.go +++ b/internal/controller/drplacementcontrol_controller.go @@ -724,15 +724,15 @@ func (r *DRPlacementControlReconciler) cleanupVRGs( return fmt.Errorf("failed to retrieve VRGs. We'll retry later. Error (%w)", err) } - if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) { - return fmt.Errorf("VRG adoption in progress") + // We have to ensure the secondary VRG is deleted before deleting the primary VRG. This will fail until there + // is no secondary VRG in the vrgs list. 
+ if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Secondary); err != nil { + return err } - // delete VRG manifestwork - for _, drClusterName := range rmnutil.DRPolicyClusterNames(drPolicy) { - if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), drClusterName); err != nil { - return fmt.Errorf("%w", err) - } + // This will fail until there is no primary VRG in the vrgs list. + if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Primary); err != nil { + return err } if len(vrgs) != 0 { @@ -747,6 +747,38 @@ func (r *DRPlacementControlReconciler) cleanupVRGs( return nil } +// ensureVRGsDeleted ensures that secondary or primary VRGs are deleted. Return an error if a vrg could not be deleted, +// or deletion is in progress. Return nil if vrg of specified type was not found. +func (r *DRPlacementControlReconciler) ensureVRGsDeleted( + mwu rmnutil.MWUtil, + vrgs map[string]*rmn.VolumeReplicationGroup, + drpc *rmn.DRPlacementControl, + vrgNamespace string, + replicationState rmn.ReplicationState, +) error { + var deleteInProgress bool + + for cluster, vrg := range vrgs { + if vrg.Spec.ReplicationState == replicationState { + if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) { + return fmt.Errorf("%s VRG adoption in progress", replicationState) + } + + if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), cluster); err != nil { + return fmt.Errorf("failed to delete %s VRG manifestwork for cluster %q: %w", replicationState, cluster, err) + } + + deleteInProgress = true + } + } + + if deleteInProgress { + return fmt.Errorf("%s VRG manifestwork deletion in progress", replicationState) + } + + return nil +} + func (r *DRPlacementControlReconciler) deleteAllManagedClusterViews( drpc *rmn.DRPlacementControl, clusterNames []string, ) error {