From fd67c34d2bdfbe81b9e531a5732c0c1f674cf9c7 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 27 Sep 2024 15:19:00 -0400 Subject: [PATCH] docker: fix bug in waiting for container to exit In #23966 when we switched to using the official Docker SDK client, we had more contexts to add because most of the library methods take one. But for some APIs like waiting for a container to exit after we've started it, we never want to close this context, because the operation can outlive the Nomad agent itself. --- drivers/docker/handle.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 34d8e305d5c..50042d0bb8f 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -292,12 +292,12 @@ func (h *taskHandle) run() { h.startCpusetFixer() - ctx, cancel := context.WithTimeout(context.Background(), dockerTimeout) - defer cancel() - var werr error var exitCode containerapi.WaitResponse - exitCodeC, errC := h.infinityClient.ContainerWait(ctx, h.containerID, containerapi.WaitConditionNotRunning) + // this needs to use the background context because the container can + // outlive Nomad itself + exitCodeC, errC := h.infinityClient.ContainerWait( + context.Background(), h.containerID, containerapi.WaitConditionNotRunning) select { case exitCode = <-exitCodeC: @@ -308,6 +308,9 @@ func (h *taskHandle) run() { h.logger.Error("failed to wait for container; already terminated") } + ctx, inspectCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer inspectCancel() + container, ierr := h.dockerClient.ContainerInspect(ctx, h.containerID) oom := false if ierr != nil { @@ -331,7 +334,10 @@ func (h *taskHandle) run() { close(h.doneCh) // Stop the container just incase the docker daemon's wait returned - // incorrectly. + // incorrectly. Container should have exited by now so kill_timeout can be + // ignored. 
+ ctx, stopCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer stopCancel() if err := h.dockerClient.ContainerStop(ctx, h.containerID, containerapi.StopOptions{ Timeout: pointer.Of(0), }); err != nil {