mon: remove extra mon from quorum before taking down pod
When removing a mon from quorum, there is a race condition that
can result in mon quorum being lost, at least temporarily.
The mon pod was being deleted first, and only then was the mon
removed from quorum. If any other mon went down between the time
the bad mon's pod was deleted and the time the mon was removed
from quorum, there might not be sufficient quorum to complete
the removal, and the operator would be stuck.

For example, there could temporarily be 4 mons due to the timing
of K8s node upgrades, where mons may be taken down for several
minutes. Say a new mon is started while the down mon also comes
back up. The operator now sees it can remove the 4th mon from
quorum, so it starts to remove it. Then another mon goes down on
another node that is being updated or otherwise drained. Since
the 4th mon pod was already deleted and another mon is down, only
two mons remain in quorum, but 3 of the 4 mons are required for
quorum. Quorum is therefore lost until the third mon comes back
up.
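
To make the arithmetic concrete: Ceph monitors need a strict majority of the monitors in the monmap to be up in order to form quorum. A minimal, illustrative Go sketch of that majority rule (not code from this change):

```go
package main

import "fmt"

// quorumSize returns the minimum number of monitors that must be up to
// maintain quorum: a strict majority of the monitors in the monmap.
func quorumSize(totalMons int) int {
	return totalMons/2 + 1
}

func main() {
	fmt.Println(quorumSize(3)) // 2 of 3 must be up
	fmt.Println(quorumSize(4)) // 3 of 4 must be up; losing 2 mons loses quorum
	fmt.Println(quorumSize(5)) // 3 of 5 must be up
}
```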

The solution is to first remove the extra mon from quorum
before taking down the mon pod.
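
In outline, the reordered failover path in health.go removes the mon from quorum and persists the updated mon config first, and only then deletes the pod-level resources. A simplified sketch of that ordering based on the diff below (removeMonFailoverSketch is a placeholder name, and the bootstrap-peer-token update is omitted):

```go
// Sketch only: assumes the Cluster type and helpers from
// pkg/operator/ceph/cluster/mon shown in the diff below.
func (c *Cluster) removeMonFailoverSketch(daemonName string, shouldRemoveFromQuorum bool) error {
	// 1. Remove the bad monitor from quorum while quorum is still intact.
	if shouldRemoveFromQuorum {
		if err := c.removeMonitorFromQuorum(daemonName); err != nil {
			logger.Errorf("failed to remove mon %q from quorum. %v", daemonName, err)
		}
	}

	// 2. Drop the mon from the operator's view of the cluster and persist it.
	delete(c.ClusterInfo.Monitors, daemonName)
	delete(c.mapping.Schedule, daemonName)
	if err := c.saveMonConfig(); err != nil {
		return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
	}

	// 3. Only now delete the mon's deployment, service, and PVC. If the
	//    operator restarts before this completes, the next reconcile finds
	//    the extra deployment and finishes the cleanup.
	c.removeMonResources(daemonName)
	return nil
}
```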

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
(cherry picked from commit 8987d26)
(cherry picked from commit 09c9065)
travisn committed Sep 5, 2024
1 parent acb3979 commit 6df7135
Showing 3 changed files with 108 additions and 26 deletions.
56 changes: 31 additions & 25 deletions pkg/operator/ceph/cluster/mon/health.go
@@ -730,31 +730,40 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro
}
logger.Infof("ensuring removal of unhealthy monitor %s", daemonName)

resourceName := resourceName(daemonName)

// Remove the mon pod if it is still there
var gracePeriod int64
propagation := metav1.DeletePropagationForeground
options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation}
if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon %s was already gone", resourceName)
} else {
logger.Errorf("failed to remove dead mon deployment %q. %v", resourceName, err)
}
}

// Remove the bad monitor from quorum
if shouldRemoveFromQuorum {
if err := c.removeMonitorFromQuorum(daemonName); err != nil {
logger.Errorf("failed to remove mon %q from quorum. %v", daemonName, err)
}
}
delete(c.ClusterInfo.Monitors, daemonName)

delete(c.mapping.Schedule, daemonName)

if err := c.saveMonConfig(); err != nil {
return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
}

// Update cluster-wide RBD bootstrap peer token since Monitors have changed
_, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo)
if err != nil {
return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token")
}

// When the mon is removed from quorum, it is possible that the operator will be restarted
// before the mon pod is deleted. In this case, the operator will need to delete the mon
// during the next reconcile. If the reconcile finds an extra mon pod, it will be removed
// at that later reconcile. Thus, we delete the mon pod last during the failover
// and in case the failover is interrupted, the operator can detect the resources to finish the cleanup.
c.removeMonResources(daemonName)
return nil
}

func (c *Cluster) removeMonResources(daemonName string) {
// Remove the service endpoint
resourceName := resourceName(daemonName)
var gracePeriod int64
propagation := metav1.DeletePropagationForeground
options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation}
if err := c.context.Clientset.CoreV1().Services(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon service %s was already gone", resourceName)
@@ -772,17 +781,14 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro
}
}

if err := c.saveMonConfig(); err != nil {
return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
}

// Update cluster-wide RBD bootstrap peer token since Monitors have changed
_, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo)
if err != nil {
return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token")
// Remove the mon pod if it is still there
if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon %s was already gone", resourceName)
} else {
logger.Errorf("failed to remove dead mon deployment %q. %v", resourceName, err)
}
}

return nil
}

func (c *Cluster) removeMonitorFromQuorum(name string) error {
34 changes: 33 additions & 1 deletion pkg/operator/ceph/cluster/mon/mon.go
@@ -1060,7 +1060,39 @@ func (c *Cluster) startDeployments(mons []*monConfig, requireAllInQuorum bool) e
requireAllInQuorum = true
}
}
return c.waitForMonsToJoin(mons, requireAllInQuorum)
err = c.waitForMonsToJoin(mons, requireAllInQuorum)

// Check for the rare case of an extra mon deployment that needs to be cleaned up
c.checkForExtraMonResources(mons, deployments.Items)
return err
}

func (c *Cluster) checkForExtraMonResources(mons []*monConfig, deployments []apps.Deployment) string {
if len(deployments) <= c.spec.Mon.Count || len(deployments) <= len(mons) {
return ""
}

// If there are more deployments than expected mons from the ceph quorum,
// find the extra mon deployment and clean it up.
logger.Infof("there is an extra mon deployment that is not needed and not in quorum")
for _, deploy := range deployments {
monName := deploy.Labels[controller.DaemonIDLabel]
found := false
// Search for the mon in the list of mons expected in quorum
for _, monDaemon := range mons {
if monName == monDaemon.DaemonName {
found = true
break
}
}
if !found {
logger.Infof("deleting extra mon deployment %q", deploy.Name)
c.removeMonResources(monName)
return monName
}
}

return ""
}

func (c *Cluster) waitForMonsToJoin(mons []*monConfig, requireAllInQuorum bool) error {
44 changes: 44 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon_test.go
@@ -676,6 +676,50 @@ func TestMonVolumeClaimTemplate(t *testing.T) {
})
}
}

func TestRemoveExtraMonDeployments(t *testing.T) {
namespace := "ns"
context, err := newTestStartCluster(t, namespace)
assert.NoError(t, err)
c := newCluster(context, namespace, true, v1.ResourceRequirements{})
c.ClusterInfo = clienttest.CreateTestClusterInfo(1)

// Nothing to remove when the mons match the deployment
mons := []*monConfig{
{ResourceName: "rook-ceph-mon-a", DaemonName: "a"},
}
deployments := []apps.Deployment{
{
ObjectMeta: metav1.ObjectMeta{
Name: "rook-ceph-mon-a",
Labels: map[string]string{"ceph_daemon_id": "a"},
},
},
}
c.spec.Mon.Count = 1
removed := c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "", removed)

// Remove an extra mon deployment
deployments = append(deployments, apps.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "rook-ceph-mon-b",
Labels: map[string]string{"ceph_daemon_id": "b"},
}})
removed = c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "b", removed)

// Nothing to remove when there are not enough deployments for the expected mons
mons = []*monConfig{
{ResourceName: "rook-ceph-mon-a", DaemonName: "a"},
{ResourceName: "rook-ceph-mon-b", DaemonName: "b"},
{ResourceName: "rook-ceph-mon-c", DaemonName: "c"},
}
c.spec.Mon.Count = 3
removed = c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "", removed)
}

func TestStretchMonVolumeClaimTemplate(t *testing.T) {
generalSC := "generalSC"
zoneSC := "zoneSC"
