Skip to content

Commit

Permalink
Delete secondary VRG before primary
Browse files Browse the repository at this point in the history
Since we added a VRG on the secondary cluster we have a random failure
when deleting the DRPC after relocate. When this happens, we find the
PVC in terminating state on the secondary cluster, and the VR and VRG
are never deleted.

This change attempt to avoid this issue by deleting the secondary VRG
first, and deleting the primary VRG only after the secondary VRG was
deleted.

During DRPC delete we expect to see these errors one or more times:

    Secondary VRG manifestwork deletion in progress
    ...
    Primary VRG manifestwork deletion in progress
    ...

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
  • Loading branch information
nirs committed Dec 1, 2024
1 parent 6c96132 commit efa29e0
Showing 1 changed file with 39 additions and 7 deletions.
46 changes: 39 additions & 7 deletions internal/controller/drplacementcontrol_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -724,15 +724,15 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
return fmt.Errorf("failed to retrieve VRGs. We'll retry later. Error (%w)", err)
}

if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
return fmt.Errorf("VRG adoption in progress")
// We have to ensure the seconrary VRG is deleted befroe deleting the primary VRG. This will fail until there
// is no secondary VRG in the vrgs list.
if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Secondary); err != nil {
return err
}

// delete VRG manifestwork
for _, drClusterName := range rmnutil.DRPolicyClusterNames(drPolicy) {
if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), drClusterName); err != nil {
return fmt.Errorf("%w", err)
}
// This will fail until there is no primary VRG in the vrgs list.
if err := r.ensureVRGsDeleted(mwu, vrgs, drpc, vrgNamespace, rmn.Primary); err != nil {
return err
}

if len(vrgs) != 0 {
Expand All @@ -747,6 +747,38 @@ func (r *DRPlacementControlReconciler) cleanupVRGs(
return nil
}

// ensureVRGsDeleted ensure that seconrary or primary VRGs are deleted. Return an error if a vrg could not be deleted,
// or deletion is in progress. Retrun nil if vrg of specified type was not found.
func (r *DRPlacementControlReconciler) ensureVRGsDeleted(
mwu rmnutil.MWUtil,
vrgs map[string]*rmn.VolumeReplicationGroup,
drpc *rmn.DRPlacementControl,
vrgNamespace string,
replicationState rmn.ReplicationState,
) error {
var deleteInProgress bool

for cluster, vrg := range vrgs {
if vrg.Spec.ReplicationState == replicationState {
if !ensureVRGsManagedByDRPC(r.Log, mwu, vrgs, drpc, vrgNamespace) {
return fmt.Errorf("%s VRG adoption in progress", replicationState)
}

if err := mwu.DeleteManifestWork(mwu.BuildManifestWorkName(rmnutil.MWTypeVRG), cluster); err != nil {
return fmt.Errorf("failed to delete %s VRG manifestwork for cluster %q: %w", replicationState, cluster, err)
}

deleteInProgress = true
}
}

if deleteInProgress {
return fmt.Errorf("%s VRG manifestwork deletion in progress", replicationState)
}

return nil
}

func (r *DRPlacementControlReconciler) deleteAllManagedClusterViews(
drpc *rmn.DRPlacementControl, clusterNames []string,
) error {
Expand Down

0 comments on commit efa29e0

Please sign in to comment.