Skip to content

Commit

Permalink
fix: dedupe expiration reconciliations (#1794)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmdeal authored Oct 31, 2024
1 parent f7abd62 commit e438062
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 3 deletions.
13 changes: 10 additions & 3 deletions pkg/controllers/nodeclaim/expiration/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package expiration

import (
"context"
"time"

"github.com/prometheus/client_golang/prometheus"
"k8s.io/utils/clock"
Expand Down Expand Up @@ -46,6 +47,9 @@ func NewController(clk clock.Clock, kubeClient client.Client) *Controller {
}

func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
if !nodeClaim.DeletionTimestamp.IsZero() {
return reconcile.Result{}, nil
}
// From here there are three scenarios to handle:
// 1. If ExpireAfter is not configured, exit expiration loop
if nodeClaim.Spec.ExpireAfter.Duration == nil {
Expand All @@ -59,7 +63,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
}
// 3. Otherwise, if the NodeClaim is expired we can forcefully expire the nodeclaim (by deleting it)
if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil {
return reconcile.Result{}, err
return reconcile.Result{}, client.IgnoreNotFound(err)
}
// 4. The deletion timestamp has successfully been set for the NodeClaim, update relevant metrics.
log.FromContext(ctx).V(1).Info("deleting expired nodeclaim")
Expand All @@ -68,12 +72,15 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey],
metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey],
}).Inc()
// We sleep here after the delete operation since we want to ensure that we are able to read our own writes so that
// we avoid duplicating metrics and log lines due to quick re-queues.
// USE CAUTION when determining whether to increase this timeout or remove this line
time.Sleep(time.Second)
return reconcile.Result{}, nil
}

func (c *Controller) Register(_ context.Context, m manager.Manager) error {
builder := controllerruntime.NewControllerManagedBy(m)
return builder.
return controllerruntime.NewControllerManagedBy(m).
Named("nodeclaim.expiration").
For(&v1.NodeClaim{}).
Complete(reconcile.AsReconciler(m.GetClient(), c))
Expand Down
18 changes: 18 additions & 0 deletions pkg/controllers/nodeclaim/expiration/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,4 +164,22 @@ var _ = Describe("Expiration", func() {
result := ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
Expect(result.RequeueAfter).To(BeNumerically("~", time.Second*100, time.Second))
})
It("shouldn't expire the same NodeClaim multiple times", func() {
nodeClaim.ObjectMeta.Finalizers = append(nodeClaim.ObjectMeta.Finalizers, "test-finalizer")
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)

// step forward to make the node expired
fakeClock.Step(60 * time.Second)
ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
ExpectExists(ctx, env.Client, nodeClaim)
ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{
metrics.ReasonLabel: metrics.ExpiredReason,
"nodepool": nodePool.Name,
})
ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{
metrics.ReasonLabel: metrics.ExpiredReason,
"nodepool": nodePool.Name,
})
})
})

0 comments on commit e438062

Please sign in to comment.