mon: remove extra mon from quorum before taking down pod
When removing a mon from quorum, there is a race condition that
can result in mon quorum being lost, at least temporarily.
The mon pod was being deleted first, and only then was the mon
removed from quorum. If any other mon went down between the time
the bad mon's pod was deleted and the time the mon was removed
from quorum, there might not be sufficient quorum to complete
the removal, and the operator would be stuck.

For example, there could temporarily be 4 mons due to the timing
of K8s node upgrades, where mons may be taken down for several
minutes. Say a new mon is started while the down mon also comes
back up. The operator now sees it can remove the 4th mon from
quorum, so it starts to remove it. Then another mon goes down on
another node that is being updated or otherwise drained. Since
the 4th mon pod was already deleted and another mon is down, only
two mons remain in quorum, but 3 of the 4 mons are required for
quorum. Quorum is therefore lost until the third mon comes back
up.
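
To make the arithmetic concrete: Ceph monitors need a strict majority of the monitors in the monmap to be up in order to form quorum. A minimal, illustrative Go sketch of that majority rule (not code from this change):

```go
package main

import "fmt"

// quorumSize returns the minimum number of monitors that must be up to
// maintain quorum: a strict majority of the monitors in the monmap.
func quorumSize(totalMons int) int {
	return totalMons/2 + 1
}

func main() {
	fmt.Println(quorumSize(3)) // 2 of 3 must be up
	fmt.Println(quorumSize(4)) // 3 of 4 must be up; losing 2 mons loses quorum
	fmt.Println(quorumSize(5)) // 3 of 5 must be up
}
```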

The solution is to first remove the extra mon from quorum
before taking down the mon pod.
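
In outline, the reordered failover path in health.go removes the mon from quorum and persists the updated mon config first, and only then deletes the pod-level resources. A simplified sketch of that ordering based on the diff below (removeMonFailoverSketch is a placeholder name, and the bootstrap-peer-token update is omitted):

```go
// Sketch only: assumes the Cluster type and helpers from
// pkg/operator/ceph/cluster/mon shown in the diff below.
func (c *Cluster) removeMonFailoverSketch(daemonName string, shouldRemoveFromQuorum bool) error {
	// 1. Remove the bad monitor from quorum while quorum is still intact.
	if shouldRemoveFromQuorum {
		if err := c.removeMonitorFromQuorum(daemonName); err != nil {
			logger.Errorf("failed to remove mon %q from quorum. %v", daemonName, err)
		}
	}

	// 2. Drop the mon from the operator's view of the cluster and persist it.
	delete(c.ClusterInfo.Monitors, daemonName)
	delete(c.mapping.Schedule, daemonName)
	if err := c.saveMonConfig(); err != nil {
		return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
	}

	// 3. Only now delete the mon's deployment, service, and PVC. If the
	//    operator restarts before this completes, the next reconcile finds
	//    the extra deployment and finishes the cleanup.
	c.removeMonResources(daemonName)
	return nil
}
```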

Signed-off-by: Travis Nielsen <tnielsen@redhat.com>
(cherry picked from commit 8987d26)
(cherry picked from commit 09c9065)
travisn committed Sep 5, 2024
1 parent acb3979 commit 6df7135
Showing 3 changed files with 108 additions and 26 deletions.
56 changes: 31 additions & 25 deletions pkg/operator/ceph/cluster/mon/health.go
@@ -730,31 +730,40 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro
}
logger.Infof("ensuring removal of unhealthy monitor %s", daemonName)

resourceName := resourceName(daemonName)

// Remove the mon pod if it is still there
var gracePeriod int64
propagation := metav1.DeletePropagationForeground
options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation}
if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon %s was already gone", resourceName)
} else {
logger.Errorf("failed to remove dead mon deployment %q. %v", resourceName, err)
}
}

// Remove the bad monitor from quorum
if shouldRemoveFromQuorum {
if err := c.removeMonitorFromQuorum(daemonName); err != nil {
logger.Errorf("failed to remove mon %q from quorum. %v", daemonName, err)
}
}
delete(c.ClusterInfo.Monitors, daemonName)

delete(c.mapping.Schedule, daemonName)

if err := c.saveMonConfig(); err != nil {
return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
}

// Update cluster-wide RBD bootstrap peer token since Monitors have changed
_, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo)
if err != nil {
return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token")
}

// When the mon is removed from quorum, it is possible that the operator will be restarted
// before the mon pod is deleted. In this case, the operator will need to delete the mon
// during the next reconcile. If the reconcile finds an extra mon pod, it will be removed
// at that later reconcile. Thus, we delete the mon pod last during the failover
// and in case the failover is interrupted, the operator can detect the resources to finish the cleanup.
c.removeMonResources(daemonName)
return nil
}

func (c *Cluster) removeMonResources(daemonName string) {
// Remove the service endpoint
resourceName := resourceName(daemonName)
var gracePeriod int64
propagation := metav1.DeletePropagationForeground
options := &metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod, PropagationPolicy: &propagation}
if err := c.context.Clientset.CoreV1().Services(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon service %s was already gone", resourceName)
@@ -772,17 +781,14 @@ func (c *Cluster) removeMonWithOptionalQuorum(daemonName string, shouldRemoveFro
}
}

if err := c.saveMonConfig(); err != nil {
return errors.Wrapf(err, "failed to save mon config after failing over mon %s", daemonName)
}

// Update cluster-wide RBD bootstrap peer token since Monitors have changed
_, err := controller.CreateBootstrapPeerSecret(c.context, c.ClusterInfo, &cephv1.CephCluster{ObjectMeta: metav1.ObjectMeta{Name: c.ClusterInfo.NamespacedName().Name, Namespace: c.Namespace}}, c.ownerInfo)
if err != nil {
return errors.Wrap(err, "failed to update cluster rbd bootstrap peer token")
// Remove the mon pod if it is still there
if err := c.context.Clientset.AppsV1().Deployments(c.Namespace).Delete(c.ClusterInfo.Context, resourceName, *options); err != nil {
if kerrors.IsNotFound(err) {
logger.Infof("dead mon %s was already gone", resourceName)
} else {
logger.Errorf("failed to remove dead mon deployment %q. %v", resourceName, err)
}
}

return nil
}

func (c *Cluster) removeMonitorFromQuorum(name string) error {
34 changes: 33 additions & 1 deletion pkg/operator/ceph/cluster/mon/mon.go
@@ -1060,7 +1060,39 @@ func (c *Cluster) startDeployments(mons []*monConfig, requireAllInQuorum bool) e
requireAllInQuorum = true
}
}
return c.waitForMonsToJoin(mons, requireAllInQuorum)
err = c.waitForMonsToJoin(mons, requireAllInQuorum)

// Check for the rare case of an extra mon deployment that needs to be cleaned up
c.checkForExtraMonResources(mons, deployments.Items)
return err
}

func (c *Cluster) checkForExtraMonResources(mons []*monConfig, deployments []apps.Deployment) string {
if len(deployments) <= c.spec.Mon.Count || len(deployments) <= len(mons) {
return ""
}

// If there are more deployments than expected mons from the ceph quorum,
// find the extra mon deployment and clean it up.
logger.Infof("there is an extra mon deployment that is not needed and not in quorum")
for _, deploy := range deployments {
monName := deploy.Labels[controller.DaemonIDLabel]
found := false
// Search for the mon in the list of mons expected in quorum
for _, monDaemon := range mons {
if monName == monDaemon.DaemonName {
found = true
break
}
}
if !found {
logger.Infof("deleting extra mon deployment %q", deploy.Name)
c.removeMonResources(monName)
return monName
}
}

return ""
}

func (c *Cluster) waitForMonsToJoin(mons []*monConfig, requireAllInQuorum bool) error {
44 changes: 44 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon_test.go
@@ -676,6 +676,50 @@ func TestMonVolumeClaimTemplate(t *testing.T) {
})
}
}

func TestRemoveExtraMonDeployments(t *testing.T) {
namespace := "ns"
context, err := newTestStartCluster(t, namespace)
assert.NoError(t, err)
c := newCluster(context, namespace, true, v1.ResourceRequirements{})
c.ClusterInfo = clienttest.CreateTestClusterInfo(1)

// Nothing to remove when the mons match the deployment
mons := []*monConfig{
{ResourceName: "rook-ceph-mon-a", DaemonName: "a"},
}
deployments := []apps.Deployment{
{
ObjectMeta: metav1.ObjectMeta{
Name: "rook-ceph-mon-a",
Labels: map[string]string{"ceph_daemon_id": "a"},
},
},
}
c.spec.Mon.Count = 1
removed := c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "", removed)

// Remove an extra mon deployment
deployments = append(deployments, apps.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "rook-ceph-mon-b",
Labels: map[string]string{"ceph_daemon_id": "b"},
}})
removed = c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "b", removed)

// Nothing to remove when there are not enough deployments for the expected mons
mons = []*monConfig{
{ResourceName: "rook-ceph-mon-a", DaemonName: "a"},
{ResourceName: "rook-ceph-mon-b", DaemonName: "b"},
{ResourceName: "rook-ceph-mon-c", DaemonName: "c"},
}
c.spec.Mon.Count = 3
removed = c.checkForExtraMonResources(mons, deployments)
assert.Equal(t, "", removed)
}

func TestStretchMonVolumeClaimTemplate(t *testing.T) {
generalSC := "generalSC"
zoneSC := "zoneSC"
