From 7f6cec95eb0fbd386ed79b55fa465bccfc72bc75 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Thu, 17 Oct 2024 16:55:30 +0200 Subject: [PATCH 1/4] Fix the case where the process group gets removed without the addresses being included --- controllers/remove_process_groups.go | 47 ++++++++++++-------- controllers/remove_process_groups_test.go | 52 +++++++++++++---------- setup/setup.go | 4 -- 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/controllers/remove_process_groups.go b/controllers/remove_process_groups.go index 5b80a914..a9479458 100644 --- a/controllers/remove_process_groups.go +++ b/controllers/remove_process_groups.go @@ -132,7 +132,9 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone]) err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status, adminClient) if err != nil { - return &requeue{curError: err, delayedRequeue: true} + delay := time.Duration(int(r.MinimumRecoveryTimeForInclusion-status.Cluster.RecoveryState.SecondsSinceLastRecovered)) * time.Second + _ = delay + return &requeue{curError: err, delayedRequeue: true, delay: 0} } return nil @@ -214,7 +216,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus return false, false, nil } - // Pod is in terminating state so we don't want to block but we also don't want to include it + // Pod is in terminating state so we don't want to block, but we also don't want to include it canBeIncluded = false } @@ -231,7 +233,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus return false, false, nil } - // PVC is in terminating state so we don't want to block but we also don't want to include it + // PVC is in terminating state so we don't want to block, but we also don't want to include it canBeIncluded = false } else if len(pvcs.Items) > 1 { return false, false, fmt.Errorf("multiple PVCs found for cluster %s, processGroupID %s", cluster.Name, processGroup.ProcessGroupID) @@ -251,7 +253,7 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus return false, false, nil } - // Service is in terminating state so we don't want to block but we also don't want to include it + // Service is in terminating state so we don't want to block, but we also don't want to include it canBeIncluded = false } @@ -259,12 +261,19 @@ func confirmRemoval(ctx context.Context, logger logr.Logger, r *FoundationDBClus } func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationDBClusterReconciler, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus, adminClient fdbadminclient.AdminClient) error { - fdbProcessesToInclude, err := getProcessesToInclude(logger, cluster, removedProcessGroups, status) + fdbProcessesToInclude, newProcessGroups, err := getProcessesToInclude(logger, cluster, removedProcessGroups, status) if err != nil { return err } if len(fdbProcessesToInclude) == 0 { + // In case that the operator was removing a process group without exclusion. + // We can update the process groups at this stage, as no other processes must be included. + if len(cluster.Status.ProcessGroups) != len(newProcessGroups) { + cluster.Status.ProcessGroups = newProcessGroups + return r.updateOrApply(ctx, cluster) + } + return nil } @@ -293,30 +302,33 @@ func includeProcessGroup(ctx context.Context, logger logr.Logger, r *FoundationD return err } - // Reset the SecondsSinceLastRecovered sine the operator just included some processes, which will cause a recovery. + // Reset the SecondsSinceLastRecovered since the operator just included some processes, which will cause a recovery. status.Cluster.RecoveryState.SecondsSinceLastRecovered = 0.0 + // Update the process group list and remove all removed and included process groups. + cluster.Status.ProcessGroups = newProcessGroups return r.updateOrApply(ctx, cluster) } -func getProcessesToInclude(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, error) { +func getProcessesToInclude(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, removedProcessGroups map[fdbv1beta2.ProcessGroupID]bool, status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, []*fdbv1beta2.ProcessGroupStatus, error) { fdbProcessesToInclude := make([]fdbv1beta2.ProcessAddress, 0) if len(removedProcessGroups) == 0 { - return fdbProcessesToInclude, nil + return fdbProcessesToInclude, nil, nil } excludedServers, err := fdbstatus.GetExclusions(status) if err != nil { - return fdbProcessesToInclude, fmt.Errorf("unable to get excluded servers from status, %w", err) + return fdbProcessesToInclude, nil, fmt.Errorf("unable to get excluded servers from status, %w", err) } excludedServersMap := make(map[string]fdbv1beta2.None, len(excludedServers)) for _, excludedServer := range excludedServers { excludedServersMap[excludedServer.String()] = fdbv1beta2.None{} } + processGroups := cluster.Status.DeepCopy().ProcessGroups idx := 0 - for _, processGroup := range cluster.Status.ProcessGroups { + for _, processGroup := range processGroups { if processGroup.IsMarkedForRemoval() && removedProcessGroups[processGroup.ProcessGroupID] { foundInExcludedServerList := false exclusionString := processGroup.GetExclusionString() @@ -324,28 +336,29 @@ func getProcessesToInclude(logger logr.Logger, cluster *fdbv1beta2.FoundationDBC fdbProcessesToInclude = append(fdbProcessesToInclude, fdbv1beta2.ProcessAddress{StringAddress: exclusionString}) foundInExcludedServerList = true } + for _, pAddr := range processGroup.Addresses { if _, ok := excludedServersMap[pAddr]; ok { fdbProcessesToInclude = append(fdbProcessesToInclude, fdbv1beta2.ProcessAddress{IPAddress: net.ParseIP(pAddr)}) foundInExcludedServerList = true } } - if !foundInExcludedServerList { + + if !foundInExcludedServerList && !processGroup.ExclusionSkipped { // This means that the process is marked for exclusion and is also removed in the previous step but is missing // its entry in the excluded servers in the status. This should not throw an error as this will block the // inclusion for other processes, but we should have a record of this event happening in the logs. - logger.Info("processGroup is included but is missing from excluded server list", "processGroup", processGroup) + logger.Info("processGroup should be included but is missing from excluded server list", "processGroup", processGroup) } + continue } - cluster.Status.ProcessGroups[idx] = processGroup + + processGroups[idx] = processGroup idx++ } - // Remove the trailing duplicates. - cluster.Status.ProcessGroups = cluster.Status.ProcessGroups[:idx] - - return fdbProcessesToInclude, nil + return fdbProcessesToInclude, processGroups[:idx], nil } func (r *FoundationDBClusterReconciler) getProcessGroupsToRemove(logger logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, remainingMap map[string]bool, cordSet map[string]fdbv1beta2.None) (bool, bool, []*fdbv1beta2.ProcessGroupStatus) { diff --git a/controllers/remove_process_groups_test.go b/controllers/remove_process_groups_test.go index 5c85bc15..8d8b7cb5 100644 --- a/controllers/remove_process_groups_test.go +++ b/controllers/remove_process_groups_test.go @@ -560,11 +560,12 @@ var _ = Describe("remove_process_groups", func() { }) When("including no process", func() { - It("should not include any process", func() { - processesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + FIt("should not include any process", func() { + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(processesToInclude)).To(Equal(0)) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(16)) + Expect(processesToInclude).To(BeEmpty()) + Expect(newProcessGroups).To(BeEmpty()) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) @@ -580,11 +581,12 @@ var _ = Describe("remove_process_groups", func() { }) It("should include one process", func() { - fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(fdbProcessesToInclude)).To(Equal(1)) - Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal("1.1.1.1")) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(15)) + Expect(processesToInclude).To(HaveLen(1)) + Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal("1.1.1.1")) + Expect(newProcessGroups).To(HaveLen(15)) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) }) @@ -596,10 +598,11 @@ var _ = Describe("remove_process_groups", func() { When("including no process", func() { It("should not include any process", func() { - fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(fdbProcessesToInclude)).To(Equal(0)) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(16)) + Expect(processesToInclude).To(BeEmpty()) + Expect(newProcessGroups).To(BeEmpty()) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) @@ -615,11 +618,12 @@ var _ = Describe("remove_process_groups", func() { }) It("should include one process", func() { - fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(fdbProcessesToInclude)).To(Equal(1)) - Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(removedProcessGroup.GetExclusionString())) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(15)) + Expect(processesToInclude).To(HaveLen(1)) + Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(removedProcessGroup.GetExclusionString())) + Expect(newProcessGroups).To(HaveLen(15)) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) @@ -637,11 +641,12 @@ var _ = Describe("remove_process_groups", func() { }) It("should include one process", func() { - fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(fdbProcessesToInclude)).To(Equal(2)) - Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(fmt.Sprintf("%s %s", removedProcessGroup.GetExclusionString(), removedProcessGroup.Addresses[0]))) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(15)) + Expect(processesToInclude).To(HaveLen(2)) + Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(fmt.Sprintf("%s %s", removedProcessGroup.GetExclusionString(), removedProcessGroup.Addresses[0]))) + Expect(newProcessGroups).To(HaveLen(15)) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) @@ -663,11 +668,12 @@ var _ = Describe("remove_process_groups", func() { }) It("should include one process", func() { - fdbProcessesToInclude, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) + processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) - Expect(len(fdbProcessesToInclude)).To(Equal(1)) - Expect(fdbv1beta2.ProcessAddressesString(fdbProcessesToInclude, " ")).To(Equal(removedProcessGroup2.GetExclusionString())) - Expect(len(cluster.Status.ProcessGroups)).To(Equal(14)) + Expect(processesToInclude).To(HaveLen(1)) + Expect(fdbv1beta2.ProcessAddressesString(processesToInclude, " ")).To(Equal(removedProcessGroup2.GetExclusionString())) + Expect(newProcessGroups).To(HaveLen(14)) + Expect(cluster.Status.ProcessGroups).To(HaveLen(16)) }) }) }) diff --git a/setup/setup.go b/setup/setup.go index d3c9aac9..06e1e8f4 100644 --- a/setup/setup.go +++ b/setup/setup.go @@ -268,10 +268,6 @@ func StartManager( clusterReconciler.MaintenanceListWaitDuration = operatorOpts.MaintenanceListWaitDuration clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion - clusterReconciler.MaintenanceListStaleDuration = operatorOpts.MaintenanceListStaleDuration - clusterReconciler.MaintenanceListWaitDuration = operatorOpts.MaintenanceListWaitDuration - clusterReconciler.MinimumRecoveryTimeForInclusion = operatorOpts.MinimumRecoveryTimeForInclusion - clusterReconciler.MinimumRecoveryTimeForExclusion = operatorOpts.MinimumRecoveryTimeForExclusion clusterReconciler.ClusterLabelKeyForNodeTrigger = strings.Trim(operatorOpts.ClusterLabelKeyForNodeTrigger, "\"") clusterReconciler.Namespace = operatorOpts.WatchNamespace From a2fc1a79bcdb66d150f4411b7c492abe75721f82 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Thu, 17 Oct 2024 16:58:46 +0200 Subject: [PATCH 2/4] Fix the case where the process group gets removed without the addresses being included --- controllers/remove_process_groups.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/controllers/remove_process_groups.go b/controllers/remove_process_groups.go index a9479458..c3ae8160 100644 --- a/controllers/remove_process_groups.go +++ b/controllers/remove_process_groups.go @@ -132,9 +132,8 @@ func (u removeProcessGroups) reconcile(ctx context.Context, r *FoundationDBClust removedProcessGroups := r.removeProcessGroups(ctx, logger, cluster, zoneRemovals, zonedRemovals[removals.TerminatingZone]) err = includeProcessGroup(ctx, logger, r, cluster, removedProcessGroups, status, adminClient) if err != nil { - delay := time.Duration(int(r.MinimumRecoveryTimeForInclusion-status.Cluster.RecoveryState.SecondsSinceLastRecovered)) * time.Second - _ = delay - return &requeue{curError: err, delayedRequeue: true, delay: 0} + // If the inclusion is blocked or another issues happened we will retry in 60 seconds. + return &requeue{curError: err, delayedRequeue: true, delay: 60 * time.Second} } return nil From 2e6a9e6adbff363e840697eab4740247cc31cedc Mon Sep 17 00:00:00 2001 From: Johannes Scheuermann Date: Fri, 18 Oct 2024 14:32:38 +0200 Subject: [PATCH 3/4] Update controllers/remove_process_groups_test.go --- controllers/remove_process_groups_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/remove_process_groups_test.go b/controllers/remove_process_groups_test.go index 8d8b7cb5..a08648e5 100644 --- a/controllers/remove_process_groups_test.go +++ b/controllers/remove_process_groups_test.go @@ -560,7 +560,7 @@ var _ = Describe("remove_process_groups", func() { }) When("including no process", func() { - FIt("should not include any process", func() { + It("should not include any process", func() { processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) Expect(processesToInclude).To(BeEmpty()) From 5e0c8fd08af687ddf919dd37ee739d9a8407ebe2 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Fri, 18 Oct 2024 15:02:33 +0200 Subject: [PATCH 4/4] Correct test name --- controllers/remove_process_groups_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/remove_process_groups_test.go b/controllers/remove_process_groups_test.go index a08648e5..3f230299 100644 --- a/controllers/remove_process_groups_test.go +++ b/controllers/remove_process_groups_test.go @@ -640,7 +640,7 @@ var _ = Describe("remove_process_groups", func() { removedProcessGroups[removedProcessGroup.ProcessGroupID] = true }) - It("should include one process", func() { + It("should include two process addresses", func() { processesToInclude, newProcessGroups, err := getProcessesToInclude(logr.Logger{}, cluster, removedProcessGroups, status) Expect(err).NotTo(HaveOccurred()) Expect(processesToInclude).To(HaveLen(2))