Skip to content

Commit

Permalink
Prevent OOMKill while listing CRDs (#4573)
Browse files Browse the repository at this point in the history
- Also set GOMEMLIMIT (which helps avoid OOMKill)
  • Loading branch information
matthchr authored Feb 12, 2025
1 parent b097e79 commit 5b5d690
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ spec:
- --webhook-cert-dir={{ .Values.webhook.certDir }}
{{- end }}
env:
- name: GOMEMLIMIT
value: {{ .Values.go.memLimit }}
- name: AZURE_CLIENT_ID
valueFrom:
secretKeyRef:
Expand Down
5 changes: 4 additions & 1 deletion v2/charts/azure-service-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ priorityClassName: ""
# Number of pod replicas to create for the deployment
replicaCount: 2

# Recomended initial values for resources
# Recommended initial values for resources
# adjust them as necessary
resources:
limits:
Expand All @@ -196,6 +196,9 @@ resources:
cpu: 200m
memory: 256Mi

go:
memLimit: 400MiB # This should be set to ~80-90% of the hard memory limit set above in resources

# Number of old history to retain to allow rollback
# Default Kubernetes value is set to 10
revisionHistoryLimit: 10
Expand Down
7 changes: 4 additions & 3 deletions v2/cmd/controller/app/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
setupLog.Error(err, "failed to initialize CRD client")
os.Exit(1)
}
existingCRDs, err := crdManager.ListCRDs(ctx)
existingCRDs := apiextensions.CustomResourceDefinitionList{}
err = crdManager.ListCRDs(ctx, &existingCRDs)
if err != nil {
setupLog.Error(err, "failed to list current CRDs")
os.Exit(1)
Expand All @@ -155,7 +156,7 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
// Note that this step will restart the pod when it succeeds
err = crdManager.Install(ctx, crdmanagement.Options{
CRDPatterns: flgs.CRDPatterns,
ExistingCRDs: existingCRDs,
ExistingCRDs: &existingCRDs,
Path: crdmanagement.CRDLocation,
Namespace: cfg.PodNamespace,
})
Expand Down Expand Up @@ -184,7 +185,7 @@ func SetupControllerManager(ctx context.Context, setupLog logr.Logger, flgs *Fla
// TODO: the nontrivial startup cost of reading the local copy of CRDs into memory. Since "none" is
// TODO: us approximating the standard operator experience we don't perform this assertion currently as most
// TODO: operators don't.
readyResources := crdmanagement.MakeCRDMap(existingCRDs)
readyResources := crdmanagement.MakeCRDMap(existingCRDs.Items)

if cfg.OperatorMode.IncludesWatchers() {
//nolint:contextcheck
Expand Down
2 changes: 1 addition & 1 deletion v2/config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ spec:
resources:
limits:
cpu: 500m
memory: 512Mi
memory: 512Mi # Make sure to change the GOMEMLIMIT env variable if you change this too
requests:
cpu: 200m
memory: 256Mi
Expand Down
2 changes: 2 additions & 0 deletions v2/config/manager/manager_image_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ spec:
- image: localhost:5000/azureserviceoperator:latest
name: manager
env:
- name: GOMEMLIMIT
value: 400MiB # This should be set to ~80-90% of the hard memory limit on the pod
- name: AZURE_CLIENT_ID
valueFrom:
secretKeyRef:
Expand Down
26 changes: 16 additions & 10 deletions v2/internal/crdmanagement/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,30 +143,36 @@ func NewManager(logger logr.Logger, kubeClient kubeclient.Client, leaderElection
}
}

func (m *Manager) ListCRDs(ctx context.Context) ([]apiextensions.CustomResourceDefinition, error) {
list := apiextensions.CustomResourceDefinitionList{}
// ListCRDs lists ASO CRDs.
// This accepts a list rather than returning one to allow re-using the same list object (they're large and having multiple)
// copies of the collection results in huge memory usage.
func (m *Manager) ListCRDs(ctx context.Context, list *apiextensions.CustomResourceDefinitionList) error {
// Clear the existing list, if there is one.
list.Items = nil
list.Continue = ""
list.ResourceVersion = ""

selector := labels.NewSelector()
requirement, err := labels.NewRequirement(ServiceOperatorAppLabel, selection.Equals, []string{ServiceOperatorAppValue})
if err != nil {
return nil, err
return err
}
selector = selector.Add(*requirement)

match := client.MatchingLabelsSelector{
Selector: selector,
}

err = m.kubeClient.List(ctx, &list, match)
err = m.kubeClient.List(ctx, list, match)
if err != nil {
return nil, eris.Wrapf(err, "failed to list CRDs")
return eris.Wrapf(err, "failed to list CRDs")
}

for _, crd := range list.Items {
m.logger.V(Verbose).Info("Found an existing CRD", "CRD", crd.Name)
}

return list.Items, nil
return nil
}

func (m *Manager) LoadOperatorCRDs(path string, podNamespace string) ([]apiextensions.CustomResourceDefinition, error) {
Expand Down Expand Up @@ -345,11 +351,11 @@ func (m *Manager) applyCRDs(
// Double-checked locking, we need to make sure once we have the lock there's still work to do, as it may
// already have been done while we were waiting for the lock.
m.logger.V(Status).Info("Double-checked locking - ensure there's still CRDs to apply...")
existingCRDs, err := m.ListCRDs(ctx)
err := m.ListCRDs(ctx, options.ExistingCRDs)
if err != nil {
return eris.Wrap(err, "failed to list current CRDs")
}
instructions, err = m.DetermineCRDsToInstallOrUpgrade(goalCRDs, existingCRDs, options.CRDPatterns)
instructions, err = m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs.Items, options.CRDPatterns)
if err != nil {
return eris.Wrap(err, "failed to determine CRDs to apply")
}
Expand Down Expand Up @@ -411,7 +417,7 @@ type Options struct {
Path string
Namespace string
CRDPatterns string
ExistingCRDs []apiextensions.CustomResourceDefinition
ExistingCRDs *apiextensions.CustomResourceDefinitionList
}

func (m *Manager) Install(ctx context.Context, options Options) error {
Expand All @@ -420,7 +426,7 @@ func (m *Manager) Install(ctx context.Context, options Options) error {
return eris.Wrap(err, "failed to load CRDs from disk")
}

installationInstructions, err := m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs, options.CRDPatterns)
installationInstructions, err := m.DetermineCRDsToInstallOrUpgrade(goalCRDs, options.ExistingCRDs.Items, options.CRDPatterns)
if err != nil {
return eris.Wrap(err, "failed to determine CRDs to apply")
}
Expand Down
5 changes: 3 additions & 2 deletions v2/internal/crdmanagement/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,9 +415,10 @@ func Test_ListCRDs_ListsOnlyCRDsMatchingLabel(t *testing.T) {
logger := testcommon.NewTestLogger(t)
crdManager := crdmanagement.NewManager(logger, kubeClient, nil)

crds, err := crdManager.ListCRDs(ctx)
crds := &apiextensions.CustomResourceDefinitionList{}
err := crdManager.ListCRDs(ctx, crds)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(crds).To(HaveLen(1))
g.Expect(crds.Items).To(HaveLen(1))
}

// This test requires that the task target `bundle-crds` has been run
Expand Down

0 comments on commit 5b5d690

Please sign in to comment.