Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check if the database is available before doing any exclusion checks #1758

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,16 @@ func (r *FoundationDBClusterReconciler) newFdbPodClient(cluster *fdbv1beta2.Foun
return internal.NewFdbPodClient(cluster, pod, globalControllerLogger.WithValues("namespace", cluster.Namespace, "cluster", cluster.Name, "pod", pod.Name), r.GetTimeout, r.PostTimeout)
}

func (r *FoundationDBClusterReconciler) getCoordinatorSet(cluster *fdbv1beta2.FoundationDBCluster) (map[string]fdbv1beta2.None, error) {
johscheuer marked this conversation as resolved.
Show resolved Hide resolved
adminClient, err := r.DatabaseClientProvider.GetAdminClient(cluster, r)
if err != nil {
return map[string]fdbv1beta2.None{}, err
}
defer adminClient.Close()

return adminClient.GetCoordinatorSet()
}

// updateOrApply updates the status either with server-side apply or if disabled with the normal update call.
func (r *FoundationDBClusterReconciler) updateOrApply(ctx context.Context, cluster *fdbv1beta2.FoundationDBCluster) error {
if r.ServerSideApply {
Expand Down
20 changes: 15 additions & 5 deletions controllers/remove_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust
return &requeue{curError: err}
}

coordinators := fdbstatus.GetCoordinatorsFromStatus(status)
allExcluded, newExclusions, processGroupsToRemove := r.getProcessGroupsToRemove(logger, cluster, remainingMap, coordinators)
allExcluded, newExclusions, processGroupsToRemove := r.getProcessGroupsToRemove(logger, cluster, remainingMap)
// If no process groups are marked to remove we have to check if all process groups are excluded.
if len(processGroupsToRemove) == 0 {
if !allExcluded {
Expand Down Expand Up @@ -298,7 +297,8 @@ func getProcessesToInclude(cluster *fdbv1beta2.FoundationDBCluster, removedProce
return fdbProcessesToInclude
}

func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, remainingMap map[string]bool, cordSet map[string]fdbv1beta2.None) (bool, bool, []*fdbv1beta2.ProcessGroupStatus) {
func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, remainingMap map[string]bool) (bool, bool, []*fdbv1beta2.ProcessGroupStatus) {
var cordSet map[string]fdbv1beta2.None
allExcluded := true
newExclusions := false
processGroupsToRemove := make([]*fdbv1beta2.ProcessGroupStatus, 0, len(cluster.Status.ProcessGroups))
Expand All @@ -308,14 +308,24 @@ func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Log
continue
}

// Only query FDB if we have a pending removal otherwise don't query FDB
if len(cordSet) == 0 {
var err error
cordSet, err = r.getCoordinatorSet(cluster)

if err != nil {
logger.Error(err, "Fetching coordinator set for removal")
return false, false, nil
}
}

if _, ok := cordSet[string(processGroup.ProcessGroupID)]; ok {
logger.Info("Block removal of Coordinator", "processGroupID", processGroup.ProcessGroupID)
allExcluded = false
continue
}

// ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further
// checks.
// ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further checks.
if processGroup.IsExcluded() {
processGroupsToRemove = append(processGroupsToRemove, processGroup)
continue
Expand Down
10 changes: 3 additions & 7 deletions controllers/remove_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,7 @@ var _ = Describe("remove_process_groups", func() {
coordinatorIP: false,
}

coordSet := map[string]fdbv1beta2.None{
coordinatorIP: {},
}

allExcluded, newExclusions, processes := clusterReconciler.getProcessGroupsToRemove(globalControllerLogger, cluster, remaining, coordSet)
allExcluded, newExclusions, processes := clusterReconciler.getProcessGroupsToRemove(globalControllerLogger, cluster, remaining)
Expect(allExcluded).To(BeFalse())
Expect(processes).To(BeEmpty())
Expect(newExclusions).To(BeFalse())
Expand Down Expand Up @@ -167,9 +163,9 @@ var _ = Describe("remove_process_groups", func() {
}
})

It("should not remove that process group", func() {
It("should not remove the process group and should not exclude processes", func() {
Expect(result).NotTo(BeNil())
Expect(result.message).To(Equal("Removals cannot proceed because cluster has degraded fault tolerance"))
Expect(result.message).To(Equal("Reconciliation needs to exclude more processes"))
// Ensure resources are not deleted
removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup.ProcessGroupID)
Expect(err).To(BeNil())
Expand Down
4 changes: 3 additions & 1 deletion controllers/update_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl
originalConnectionString := cluster.Status.ConnectionString
defer func() { cluster.Status.ConnectionString = originalConnectionString }()

var err error
for _, connectionString := range connectionStrings {
logger.Info("Attempting to get connection string from cluster", "connectionString", connectionString)
cluster.Status.ConnectionString = connectionString
Expand All @@ -309,7 +310,8 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl
return originalConnectionString, clientErr
}

activeConnectionString, err := adminClient.GetConnectionString()
var activeConnectionString string
activeConnectionString, err = adminClient.GetConnectionString()

closeErr := adminClient.Close()
if closeErr != nil {
Expand Down
5 changes: 5 additions & 0 deletions fdbclient/admin_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,11 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())

status := &fdbv1beta2.FoundationDBStatus{
Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{
DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{
Available: true,
},
},
Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{
Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{
"1": { // This process is fully excluded
Expand Down
56 changes: 56 additions & 0 deletions pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,23 @@ import (
"math"
)

// forbiddenStatusMessages represents messages that could be part of the machine-readable status. Those messages can represent
// different error cases that have occurred when fetching the machine-readable status. A list of possible messages can be
// found here: https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/mr-status.rst?plain=1#L68-L97
// and here: https://apple.github.io/foundationdb/mr-status.html#message-components.
// We don't want to block the exclusion check for all messages, as some messages also indicate client issues or issues
// with a specific transaction priority.
var forbiddenStatusMessages = map[string]fdbv1beta2.None{
"storage_servers_error": {},
"status_incomplete": {},
"unreachable_master_worker": {},
"unreachable_dataDistributor_worker": {},
"unreachable_ratekeeper_worker": {},
"unreadable_configuration": {},
"unreachable_processes": {},
"log_servers_error": {},
}
johscheuer marked this conversation as resolved.
Show resolved Hide resolved

// StatusContextKey will be used as a key in a context to pass down the cached status.
type StatusContextKey struct{}

Expand Down Expand Up @@ -67,6 +84,30 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
}
}

// If the database is unavailable the status might contain the processes but with missing role information. In this
// case we should not perform any validation on the machine-readable status as this information might not be correct.
// We don't want to run the exclude command to check for the status of the exclusions as this might result in a recovery
// of the cluster or brings the cluster into a worse state.
if !status.Client.DatabaseStatus.Available {
johscheuer marked this conversation as resolved.
Show resolved Hide resolved
logger.Info("Skipping exclusion check as the database is unavailable")
return exclusionStatus{
inProgress: nil,
fullyExcluded: nil,
notExcluded: addresses,
missingInStatus: nil,
}
}

// ...
if !clusterStatusHasValidRoleInformation(logger, status) {
return exclusionStatus{
inProgress: nil,
fullyExcluded: nil,
notExcluded: addresses,
missingInStatus: nil,
}
}

addressesToVerify := map[string]fdbv1beta2.None{}
for _, addr := range addresses {
addressesToVerify[addr.MachineAddress()] = fdbv1beta2.None{}
Expand Down Expand Up @@ -133,6 +174,21 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
return exclusions
}

// clusterStatusHasValidRoleInformation will check if the cluster part of the machine-readable status contains messages
// that indicate that not all role information could be fetched.
func clusterStatusHasValidRoleInformation(logger logr.Logger, status *fdbv1beta2.FoundationDBStatus) bool {
for _, message := range status.Cluster.Messages {
if _, ok := forbiddenStatusMessages[message.Name]; ok {
logger.Info("Skipping exclusion check as the machine-readable status includes a message that indicates an potential incomplete status",
"messages", status.Cluster.Messages,
"forbiddenStatusMessages", forbiddenStatusMessages)
return false
}
}

return true
}

// CanSafelyRemoveFromStatus checks whether it is safe to remove processes from the cluster, based on the provided status.
//
// The list returned by this method will be the addresses that are *not* safe to remove.
Expand Down
Loading
Loading