diff --git a/controllers/remove_process_groups.go b/controllers/remove_process_groups.go index a45d77980..853dc5d7a 100644 --- a/controllers/remove_process_groups.go +++ b/controllers/remove_process_groups.go @@ -314,8 +314,7 @@ func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Log continue } - // ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further - // checks. + // ProcessGroup is already marked as excluded we can add it to the processGroupsToRemove and skip further checks. if processGroup.IsExcluded() { processGroupsToRemove = append(processGroupsToRemove, processGroup) continue diff --git a/controllers/remove_process_groups_test.go b/controllers/remove_process_groups_test.go index d2fe78dc4..3c9501819 100644 --- a/controllers/remove_process_groups_test.go +++ b/controllers/remove_process_groups_test.go @@ -167,9 +167,9 @@ var _ = Describe("remove_process_groups", func() { } }) - It("should not remove that process group", func() { + It("should not remove the process group and should not exclude processes", func() { Expect(result).NotTo(BeNil()) - Expect(result.message).To(Equal("Removals cannot proceed because cluster has degraded fault tolerance")) + Expect(result.message).To(Equal("Reconciliation needs to exclude more processes")) // Ensure resources are not deleted removed, include, err := confirmRemoval(context.Background(), globalControllerLogger, clusterReconciler, cluster, removedProcessGroup.ProcessGroupID) Expect(err).To(BeNil()) diff --git a/controllers/update_status.go b/controllers/update_status.go index 1a0f1938d..6a2846a47 100644 --- a/controllers/update_status.go +++ b/controllers/update_status.go @@ -301,6 +301,7 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl originalConnectionString := cluster.Status.ConnectionString defer func() { cluster.Status.ConnectionString = originalConnectionString }() + var err error for _, connectionString := range connectionStrings { logger.Info("Attempting to get connection string from cluster", "connectionString", connectionString) cluster.Status.ConnectionString = connectionString @@ -309,7 +310,8 @@ func tryConnectionOptions(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCl return originalConnectionString, clientErr } - activeConnectionString, err := adminClient.GetConnectionString() + var activeConnectionString string + activeConnectionString, err = adminClient.GetConnectionString() closeErr := adminClient.Close() if closeErr != nil { diff --git a/fdbclient/admin_client_test.go b/fdbclient/admin_client_test.go index ded591686..5aa4ceb7a 100644 --- a/fdbclient/admin_client_test.go +++ b/fdbclient/admin_client_test.go @@ -846,6 +846,11 @@ protocol fdb00b071010000`, Expect(err).NotTo(HaveOccurred()) status := &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ "1": { // This process is fully excluded diff --git a/pkg/fdbstatus/status_checks.go b/pkg/fdbstatus/status_checks.go index 4891424fc..156ca177f 100644 --- a/pkg/fdbstatus/status_checks.go +++ b/pkg/fdbstatus/status_checks.go @@ -29,6 +29,19 @@ import ( "math" ) +// forbiddenStatusMessages represents messages that could be part of the machine-readable status. Those messages can represent +// different error cases that have occurred when fetching the machine-readable status. A list of possible messages can be +// found here: https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/mr-status.rst?plain=1#L68-L97 +// and here: https://apple.github.io/foundationdb/mr-status.html#message-components. +// We don't want to block the exclusion check for all messages, as some messages also indicate client issues or issues +// with a specific transaction priority. +var forbiddenStatusMessages = map[string]fdbv1beta2.None{ + "unreadable_configuration": {}, + "full_replication_timeout": {}, + "storage_servers_error": {}, + "log_servers_error": {}, +} + // StatusContextKey will be used as a key in a context to pass down the cached status. type StatusContextKey struct{} @@ -67,6 +80,32 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo } } + // If the database is unavailable the status might contain the processes but with missing role information. In this + // case we should not perform any validation on the machine-readable status as this information might not be correct. + // We don't want to run the exclude command to check for the status of the exclusions as this might result in a recovery + // of the cluster or brings the cluster into a worse state. + if !status.Client.DatabaseStatus.Available { + logger.Info("Skipping exclusion check as the database is unavailable") + return exclusionStatus{ + inProgress: nil, + fullyExcluded: nil, + notExcluded: addresses, + missingInStatus: nil, + } + } + + // We have to make sure that the provided machine-readable status contains the required information, if any of the + // forbiddenStatusMessages is present, the operator is not able to make a decision if a set of processes is fully excluded + // or not. + if !clusterStatusHasValidRoleInformation(logger, status) { + return exclusionStatus{ + inProgress: nil, + fullyExcluded: nil, + notExcluded: addresses, + missingInStatus: nil, + } + } + addressesToVerify := map[string]fdbv1beta2.None{} for _, addr := range addresses { addressesToVerify[addr.MachineAddress()] = fdbv1beta2.None{} @@ -133,6 +172,21 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo return exclusions } +// clusterStatusHasValidRoleInformation will check if the cluster part of the machine-readable status contains messages +// that indicate that not all role information could be fetched. +func clusterStatusHasValidRoleInformation(logger logr.Logger, status *fdbv1beta2.FoundationDBStatus) bool { + for _, message := range status.Cluster.Messages { + if _, ok := forbiddenStatusMessages[message.Name]; ok { + logger.Info("Skipping exclusion check as the machine-readable status includes a message that indicates an potential incomplete status", + "messages", status.Cluster.Messages, + "forbiddenStatusMessages", forbiddenStatusMessages) + return false + } + } + + return true +} + // CanSafelyRemoveFromStatus checks whether it is safe to remove processes from the cluster, based on the provided status. // // The list returned by this method will be the addresses that are *not* safe to remove. diff --git a/pkg/fdbstatus/status_checks_test.go b/pkg/fdbstatus/status_checks_test.go index 27dc93e84..053cab18f 100644 --- a/pkg/fdbstatus/status_checks_test.go +++ b/pkg/fdbstatus/status_checks_test.go @@ -38,6 +38,11 @@ var _ = Describe("status_checks", func() { addr4 := fdbv1beta2.NewProcessAddress(net.ParseIP("127.0.0.4"), "", 0, nil) addr5 := fdbv1beta2.NewProcessAddress(net.ParseIP("127.0.0.5"), "", 0, nil) status := &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ "1": { @@ -119,6 +124,11 @@ var _ = Describe("status_checks", func() { ), Entry("when the process is excluded but the cluster status has multiple generations", &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ RecoveryState: fdbv1beta2.RecoveryState{ ActiveGenerations: 2, @@ -154,6 +164,11 @@ var _ = Describe("status_checks", func() { ), Entry("when the process group has multiple processes and only one is fully excluded", &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ "1": { @@ -196,6 +211,11 @@ var _ = Describe("status_checks", func() { ), Entry("when the process group has multiple processes and both are fully excluded", &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ "1": { @@ -227,6 +247,11 @@ var _ = Describe("status_checks", func() { ), Entry("when the process group has multiple processes and only one is excluded", &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ "1": { @@ -261,6 +286,276 @@ var _ = Describe("status_checks", func() { nil, nil, ), + Entry("when the process is excluded but the cluster is unavailable", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: false, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + ), + Entry("when the machine-readable status contains the \"storage_servers_error\" message", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + Messages: []fdbv1beta2.FoundationDBStatusMessage{ + { + Name: "storage_servers_error", + Description: "...", + }, + }, + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + ), + Entry("when the machine-readable status contains the \"log_servers_error\" message", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + Messages: []fdbv1beta2.FoundationDBStatusMessage{ + { + Name: "log_servers_error", + Description: "...", + }, + }, + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + ), + Entry("when the machine-readable status contains the \"unreadable_configuration\" message", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + Messages: []fdbv1beta2.FoundationDBStatusMessage{ + { + Name: "unreadable_configuration", + Description: "...", + }, + }, + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + ), + Entry("when the machine-readable status contains the \"full_replication_timeout\" message", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + Messages: []fdbv1beta2.FoundationDBStatusMessage{ + { + Name: "full_replication_timeout", + Description: "...", + }, + }, + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + ), + Entry("when the machine-readable status contains the \"client_issues\" message", + &fdbv1beta2.FoundationDBStatus{ + Client: fdbv1beta2.FoundationDBStatusLocalClientInfo{ + DatabaseStatus: fdbv1beta2.FoundationDBStatusClientDBStatus{ + Available: true, + }, + }, + Cluster: fdbv1beta2.FoundationDBStatusClusterInfo{ + Messages: []fdbv1beta2.FoundationDBStatusMessage{ + { + Name: "client_issues", + Description: "...", + }, + }, + RecoveryState: fdbv1beta2.RecoveryState{ + ActiveGenerations: 1, + }, + Processes: map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{ + "1": { + Address: addr1, + Excluded: true, + }, + "2": { + Address: addr2, + }, + "3": { + Address: addr3, + }, + "4": { + Address: addr4, + Excluded: true, + Roles: []fdbv1beta2.FoundationDBStatusProcessRoleInfo{ + { + Role: "tester", + }, + }, + }, + }, + }, + }, + []fdbv1beta2.ProcessAddress{addr4}, + []fdbv1beta2.ProcessAddress{addr4}, + nil, + nil, + nil, + ), ) })