From 2924aadce328edb1ce9565f7a1bdf984fd900ebf Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 12 Sep 2023 09:59:57 -0500 Subject: [PATCH] added check for invalid argument code on node event recording failure (#613) Signed-off-by: Daniel Rammer --- pkg/controller/nodes/executor.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/controller/nodes/executor.go b/pkg/controller/nodes/executor.go index 011acd0bb..fa864f300 100644 --- a/pkg/controller/nodes/executor.go +++ b/pkg/controller/nodes/executor.go @@ -1230,10 +1230,13 @@ func (c *nodeExecutor) handleQueuedOrRunningNode(ctx context.Context, nCtx inter err = nCtx.EventsRecorder().RecordNodeEvent(ctx, nev, c.eventConfig) if err != nil { - if eventsErr.IsTooLarge(err) { - // With large enough dynamic task fanouts the reported node event, which contains the compiled - // workflow closure, can exceed the gRPC message size limit. In this case we immediately - // transition the node to failing to abort the workflow. + if eventsErr.IsTooLarge(err) || eventsErr.IsInvalidArguments(err) { + // we immediately transition to failing if one of two scenarios occurs during node event recording: + // (1) the event is too large to be sent over gRPC. this can occur if, for example, a dynamic task + // has a very large fanout and the compiled workflow closure causes the event to exceed the gRPC + // message size limit. + // (2) the event is invalid. this can occur if, for example, a dynamic task compiles a workflow + // which is invalid per admin limits (e.g. maximum resources exceeded). np = v1alpha1.NodePhaseFailing p = handler.PhaseInfoFailure(core.ExecutionError_USER, "NodeFailed", err.Error(), p.GetInfo())