diff --git a/pkg/controllers/nodeclaim/expiration/controller.go b/pkg/controllers/nodeclaim/expiration/controller.go
index fb39d39705..5681475253 100644
--- a/pkg/controllers/nodeclaim/expiration/controller.go
+++ b/pkg/controllers/nodeclaim/expiration/controller.go
@@ -18,6 +18,7 @@ package expiration
 
 import (
 	"context"
+	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	"k8s.io/utils/clock"
@@ -46,6 +47,9 @@ func NewController(clk clock.Clock, kubeClient client.Client) *Controller {
 }
 
 func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
+	if !nodeClaim.DeletionTimestamp.IsZero() {
+		return reconcile.Result{}, nil
+	}
 	// From here there are three scenarios to handle:
 	// 1. If ExpireAfter is not configured, exit expiration loop
 	if nodeClaim.Spec.ExpireAfter.Duration == nil {
@@ -59,7 +63,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
 	}
 	// 3. Otherwise, if the NodeClaim is expired we can forcefully expire the nodeclaim (by deleting it)
 	if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil {
-		return reconcile.Result{}, err
+		return reconcile.Result{}, client.IgnoreNotFound(err)
 	}
 	// 4. The deletion timestamp has successfully been set for the NodeClaim, update relevant metrics.
 	log.FromContext(ctx).V(1).Info("deleting expired nodeclaim")
@@ -68,12 +72,15 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re
 		metrics.NodePoolLabel:     nodeClaim.Labels[v1.NodePoolLabelKey],
 		metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey],
 	}).Inc()
+	// We sleep here after the delete operation since we want to ensure that we are able to read our own writes so that
+	// we avoid duplicating metrics and log lines due to quick re-queues.
+	// USE CAUTION when determining whether to increase this timeout or remove this line
+	time.Sleep(time.Second)
 	return reconcile.Result{}, nil
 }
 
 func (c *Controller) Register(_ context.Context, m manager.Manager) error {
-	builder := controllerruntime.NewControllerManagedBy(m)
-	return builder.
+	return controllerruntime.NewControllerManagedBy(m).
 		Named("nodeclaim.expiration").
 		For(&v1.NodeClaim{}).
 		Complete(reconcile.AsReconciler(m.GetClient(), c))
diff --git a/pkg/controllers/nodeclaim/expiration/suite_test.go b/pkg/controllers/nodeclaim/expiration/suite_test.go
index 1f7851c916..fc152533ca 100644
--- a/pkg/controllers/nodeclaim/expiration/suite_test.go
+++ b/pkg/controllers/nodeclaim/expiration/suite_test.go
@@ -164,4 +164,22 @@ var _ = Describe("Expiration", func() {
 		result := ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
 		Expect(result.RequeueAfter).To(BeNumerically("~", time.Second*100, time.Second))
 	})
+	It("shouldn't expire the same NodeClaim multiple times", func() {
+		nodeClaim.ObjectMeta.Finalizers = append(nodeClaim.ObjectMeta.Finalizers, "test-finalizer")
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
+
+		// step forward to make the node expired
+		fakeClock.Step(60 * time.Second)
+		ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
+		ExpectExists(ctx, env.Client, nodeClaim)
+		ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{
+			metrics.ReasonLabel: metrics.ExpiredReason,
+			"nodepool":          nodePool.Name,
+		})
+		ExpectObjectReconciled(ctx, env.Client, expirationController, nodeClaim)
+		ExpectMetricCounterValue(metrics.NodeClaimsDisruptedTotal, 1, map[string]string{
+			metrics.ReasonLabel: metrics.ExpiredReason,
+			"nodepool":          nodePool.Name,
+		})
+	})
 })