From 025f9745d6e81167fce73ff126ff4718d124bd73 Mon Sep 17 00:00:00 2001 From: Johannes Scheuermann Date: Thu, 24 Oct 2024 16:03:56 +0100 Subject: [PATCH] Fix the unified image setup during bootstrap when a custom environment variable is used for the zone (#2156) --- controllers/generate_initial_cluster_file.go | 5 +- e2e/fixtures/cluster_config.go | 3 + e2e/fixtures/fdb_cluster_specs.go | 12 +- .../operator_upgrades_variations_test.go | 26 ++++- internal/coordinator/coordinator.go | 3 +- internal/locality/locality.go | 26 ++++- internal/locality/locality_test.go | 107 +++++++++++++++++- internal/pod_client.go | 3 +- 8 files changed, 164 insertions(+), 21 deletions(-) diff --git a/controllers/generate_initial_cluster_file.go b/controllers/generate_initial_cluster_file.go index 1196ecaf..7aa0929b 100644 --- a/controllers/generate_initial_cluster_file.go +++ b/controllers/generate_initial_cluster_file.go @@ -47,7 +47,7 @@ func (g generateInitialClusterFile) reconcile(ctx context.Context, r *Foundation } logger.Info("Generating initial cluster file") - r.Recorder.Event(cluster, corev1.EventTypeNormal, "ChangingCoordinators", "Choosing initial coordinators") + r.Recorder.Event(cluster, corev1.EventTypeNormal, "GenerateInitialCoordinators", "Choosing initial coordinators") processCounts, err := cluster.GetProcessCountsWithDefaults() if err != nil { @@ -130,8 +130,7 @@ func (g generateInitialClusterFile) reconcile(ctx context.Context, r *Foundation } coordinators, err := locality.ChooseDistributedProcesses(cluster, processLocality, count, locality.ProcessSelectionConstraint{ - HardLimits: locality.GetHardLimits(cluster), - SelectingCoordinators: true, + HardLimits: locality.GetHardLimits(cluster), }) if err != nil { return &requeue{curError: err} diff --git a/e2e/fixtures/cluster_config.go b/e2e/fixtures/cluster_config.go index ae318b39..326bad02 100644 --- a/e2e/fixtures/cluster_config.go +++ b/e2e/fixtures/cluster_config.go @@ -78,6 +78,9 @@ type ClusterConfig struct { UseDNS bool // If enabled the cluster will be setup with the unified image. UseUnifiedImage *bool + // SimulateCustomFaultDomainEnv will simulate the use case that a user has set a custom environment variable to + // be used as zone ID. + SimulateCustomFaultDomainEnv bool // CreationTracker if specified will be used to log the time between the creations steps. CreationTracker CreationTrackerLogger // Number of machines, this is used for calculating the number of Pods and is not correlated to the actual number diff --git a/e2e/fixtures/fdb_cluster_specs.go b/e2e/fixtures/fdb_cluster_specs.go index c657283f..b3089941 100644 --- a/e2e/fixtures/fdb_cluster_specs.go +++ b/e2e/fixtures/fdb_cluster_specs.go @@ -48,6 +48,14 @@ func (factory *Factory) createFDBClusterSpec( imageType = fdbv1beta2.ImageTypeUnified } + faultDomain := fdbv1beta2.FoundationDBClusterFaultDomain{ + Key: "foundationdb.org/none", + } + if config.SimulateCustomFaultDomainEnv { + faultDomain.ValueFrom = "$" + fdbv1beta2.EnvNameInstanceID + faultDomain.Key = corev1.LabelHostname + } + return &fdbv1beta2.FoundationDBCluster{ ObjectMeta: metav1.ObjectMeta{ Name: config.Name, @@ -64,9 +72,7 @@ func (factory *Factory) createFDBClusterSpec( MainContainer: factory.GetMainContainerOverrides(config.DebugSymbols, useUnifiedImage), ImageType: &imageType, SidecarContainer: factory.GetSidecarContainerOverrides(config.DebugSymbols), - FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{ - Key: "foundationdb.org/none", - }, + FaultDomain: faultDomain, AutomationOptions: fdbv1beta2.FoundationDBClusterAutomationOptions{ // We have to wait long enough to ensure the operator is not recreating too many Pods at the same time. WaitBetweenRemovalsSeconds: pointer.Int(0), diff --git a/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go b/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go index 59e8a4a0..3e06e956 100644 --- a/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go +++ b/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go @@ -32,6 +32,8 @@ import ( "log" "time" + corev1 "k8s.io/api/core/v1" + fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2" "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" . "github.com/onsi/ginkgo/v2" @@ -119,7 +121,7 @@ func performUpgrade(config testConfig, preUpgradeFunction func(cluster *fixtures for _, processGroup := range cluster.Status.ProcessGroups { missingTime := processGroup.GetConditionTime(fdbv1beta2.MissingProcesses) // If the Pod is missing check if the fdbserver processes are running and check the logs of the fdb-kubernetes-monitor. - if missingTime != nil && time.Since(time.Unix(*missingTime, 0)) > 60*time.Second { + if missingTime != nil && time.Since(time.Unix(*missingTime, 0)) > 120*time.Second && !processGroup.IsMarkedForRemoval() && !processGroup.IsExcluded() { log.Println("Missing process for:", processGroup.ProcessGroupID) stdout, stderr, err := factory.ExecuteCmd(context.Background(), cluster.Namespace, processGroup.GetPodName(cluster), fdbv1beta2.MainContainerName, "ps aufx", true) log.Println("stdout:", stdout, "stderr", stderr, "err", err) @@ -336,4 +338,26 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() { EntryDescription("Upgrade from %[1]s to %[2]s"), fixtures.GenerateUpgradeTableEntries(testOptions), ) + + DescribeTable( + "upgrading a cluster with zone ID from a custom environment variable", + func(beforeVersion string, targetVersion string) { + performUpgrade(testConfig{ + beforeVersion: beforeVersion, + targetVersion: targetVersion, + clusterConfig: &fixtures.ClusterConfig{ + DebugSymbols: false, + SimulateCustomFaultDomainEnv: true, + }, + loadData: false, + }, func(cluster *fixtures.FdbCluster) { + spec := cluster.GetCluster().Spec.DeepCopy() + + Expect(spec.FaultDomain.Key).To(Equal(corev1.LabelHostname)) + Expect(spec.FaultDomain.ValueFrom).To(HaveSuffix(fdbv1beta2.EnvNameInstanceID)) + }) + }, + EntryDescription("Upgrade from %[1]s to %[2]s"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) }) diff --git a/internal/coordinator/coordinator.go b/internal/coordinator/coordinator.go index 597ab9f8..82184582 100644 --- a/internal/coordinator/coordinator.go +++ b/internal/coordinator/coordinator.go @@ -115,8 +115,7 @@ func selectCoordinatorsLocalities(logger logr.Logger, cluster *fdbv1beta2.Founda } coordinators, err := locality.ChooseDistributedProcesses(cluster, candidates, coordinatorCount, locality.ProcessSelectionConstraint{ - HardLimits: locality.GetHardLimits(cluster), - SelectingCoordinators: true, + HardLimits: locality.GetHardLimits(cluster), }) logger.Info("Current coordinators", "coordinators", coordinators, "error", err) diff --git a/internal/locality/locality.go b/internal/locality/locality.go index b0260e26..5725da56 100644 --- a/internal/locality/locality.go +++ b/internal/locality/locality.go @@ -29,6 +29,7 @@ import ( "github.com/go-logr/logr" "math" "slices" + "strings" ) // Info captures information about a process for the purposes of @@ -114,6 +115,7 @@ func InfoForProcess(process fdbv1beta2.FoundationDBStatusProcessInfo, mainContai // InfoFromSidecar converts the process information from the sidecar's // context into locality info for selecting processes. +// This method is only used during the initial bootstrapping of the cluster when no fdbserver processes are running. func InfoFromSidecar(cluster *fdbv1beta2.FoundationDBCluster, client podclient.FdbPodClient) (Info, error) { substitutions, err := client.GetVariableSubstitutions() if err != nil { @@ -124,6 +126,25 @@ func InfoFromSidecar(cluster *fdbv1beta2.FoundationDBCluster, client podclient.F return Info{}, nil } + // Take the zone ID from the FDB_ZONE_ID if present. + zoneID, present := substitutions[fdbv1beta2.EnvNameZoneID] + if !present { + // If the FDB_ZONE_ID is not present, the user specified another environment variable that represents the + // zone ID. + var zoneVariable string + if strings.HasPrefix(cluster.Spec.FaultDomain.ValueFrom, "$") { + zoneVariable = cluster.Spec.FaultDomain.ValueFrom[1:] + } else { + zoneVariable = fdbv1beta2.EnvNameZoneID + } + + zoneID = substitutions[zoneVariable] + } + + if zoneID == "" { + return Info{}, errors.New("no zone ID found in Sidecar information") + } + // This locality information is only used during the initial cluster file generation. // So it should be good to only use the first process address here. // This has the implication that in the initial cluster file only the first processes will be used. @@ -131,7 +152,7 @@ func InfoFromSidecar(cluster *fdbv1beta2.FoundationDBCluster, client podclient.F ID: substitutions[fdbv1beta2.EnvNameInstanceID], Address: cluster.GetFullAddress(substitutions[fdbv1beta2.EnvNamePublicIP], 1), LocalityData: map[string]string{ - fdbv1beta2.FDBLocalityZoneIDKey: substitutions[fdbv1beta2.EnvNameZoneID], + fdbv1beta2.FDBLocalityZoneIDKey: zoneID, fdbv1beta2.FDBLocalityDNSNameKey: substitutions[fdbv1beta2.EnvNameDNSName], }, }, nil @@ -164,9 +185,6 @@ type ProcessSelectionConstraint struct { // HardLimits defines a maximum number of processes to recruit on any single // value for a given locality field. HardLimits map[string]int - - // SelectingCoordinators must be true when the ChooseDistributedProcesses is used to select coordinators. - SelectingCoordinators bool } // ChooseDistributedProcesses recruits a maximally well-distributed set of processes from a set of potential candidates. diff --git a/internal/locality/locality_test.go b/internal/locality/locality_test.go index 9d51bcb1..d96c9982 100644 --- a/internal/locality/locality_test.go +++ b/internal/locality/locality_test.go @@ -603,8 +603,7 @@ var _ = Describe("Localities", func() { BeforeEach(func() { candidates = generateCandidates(dcIDs, 5, 5) result, err = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{ - HardLimits: GetHardLimits(cluster), - SelectingCoordinators: true, + HardLimits: GetHardLimits(cluster), }) Expect(err).NotTo(HaveOccurred()) }) @@ -645,8 +644,7 @@ var _ = Describe("Localities", func() { // Only measure the actual execution of ChooseDistributedProcesses. experiment.MeasureDuration("ChooseDistributedProcesses", func() { _, _ = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{ - HardLimits: GetHardLimits(cluster), - SelectingCoordinators: true, + HardLimits: GetHardLimits(cluster), }) }) // We'll sample the function up to 50 times or up to a minute, whichever comes first. @@ -754,8 +752,7 @@ var _ = Describe("Localities", func() { } result, err = ChooseDistributedProcesses(cluster, candidates, cluster.DesiredCoordinatorCount(), ProcessSelectionConstraint{ - HardLimits: GetHardLimits(cluster), - SelectingCoordinators: true, + HardLimits: GetHardLimits(cluster), }) Expect(err).NotTo(HaveOccurred()) }) @@ -1055,6 +1052,104 @@ var _ = Describe("Localities", func() { Info{}, true, ), + Entry("locality information is read from a different environment variables", + &fdbv1beta2.FoundationDBCluster{ + Spec: fdbv1beta2.FoundationDBClusterSpec{ + FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{ + Key: corev1.LabelHostname, + ValueFrom: "$CUSTOM_ENV", + }, + }, + Status: fdbv1beta2.FoundationDBClusterStatus{ + RequiredAddresses: fdbv1beta2.RequiredAddressSet{ + NonTLS: true, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Labels: map[string]string{ + fdbv1beta2.FDBProcessGroupIDLabel: "test", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "foundationdb-kubernetes-sidecar", + Args: []string{ + "--public-ip-family", + "4", + }, + Env: []corev1.EnvVar{ + { + Name: "CUSTOM_ENV", + Value: "custom-zone-id", + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "1.1.1.1"}, + }, + }, + }, + Info{ + ID: "test", + Address: fdbv1beta2.ProcessAddress{ + IPAddress: net.ParseIP("1.1.1.1"), + Port: 4501, + }, + LocalityData: map[string]string{ + fdbv1beta2.FDBLocalityZoneIDKey: "custom-zone-id", + fdbv1beta2.FDBLocalityDNSNameKey: "", + }, + }, + false, + ), + Entry("locality information is read from a different environment variable which is missing", + &fdbv1beta2.FoundationDBCluster{ + Spec: fdbv1beta2.FoundationDBClusterSpec{ + FaultDomain: fdbv1beta2.FoundationDBClusterFaultDomain{ + Key: corev1.LabelHostname, + ValueFrom: "$CUSTOM_ENV", + }, + }, + Status: fdbv1beta2.FoundationDBClusterStatus{ + RequiredAddresses: fdbv1beta2.RequiredAddressSet{ + NonTLS: true, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Labels: map[string]string{ + fdbv1beta2.FDBProcessGroupIDLabel: "test", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "foundationdb-kubernetes-sidecar", + Args: []string{ + "--public-ip-family", + "4", + }, + }, + }, + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "1.1.1.1"}, + }, + }, + }, + Info{}, + true, + ), ) Describe("checkCoordinatorValidity", func() { diff --git a/internal/pod_client.go b/internal/pod_client.go index c43ab5df..39d9fec3 100644 --- a/internal/pod_client.go +++ b/internal/pod_client.go @@ -434,8 +434,6 @@ func GetSubstitutionsFromClusterAndPod(logger logr.Logger, cluster *fdbv1beta2.F if faultDomainSource == "spec.nodeName" { substitutions[fdbv1beta2.EnvNameZoneID] = pod.Spec.NodeName - } else { - return nil, fmt.Errorf("unsupported fault domain source %s", faultDomainSource) } } @@ -450,6 +448,7 @@ func GetSubstitutionsFromClusterAndPod(logger logr.Logger, cluster *fdbv1beta2.F copyableSubstitutions := map[string]fdbv1beta2.None{ fdbv1beta2.EnvNameDNSName: {}, fdbv1beta2.EnvNameInstanceID: {}, + "CUSTOM_ENV": {}, } for _, container := range pod.Spec.Containers { for _, envVar := range container.Env {