diff --git a/pkg/pillar/cmd/domainmgr/domainmgr.go b/pkg/pillar/cmd/domainmgr/domainmgr.go index 8695a93972..508381b28e 100644 --- a/pkg/pillar/cmd/domainmgr/domainmgr.go +++ b/pkg/pillar/cmd/domainmgr/domainmgr.go @@ -60,9 +60,10 @@ const ( ciDirname = runDirname + "/cloudinit" // For cloud-init images // Time limits for event loop handlers - errorTime = 3 * time.Minute - warningTime = 40 * time.Second - casClientType = "containerd" + errorTime = 3 * time.Minute + warningTime = 40 * time.Second + casClientType = "containerd" + unknownStateRetries = 10 ) // Really a constant @@ -1799,7 +1800,23 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus, status.VifList = checkIfEmu(status.VifList) status.State = types.RUNNING - domainID, state, err := hyper.Task(status).Info(status.DomainName) + var state types.SwState + // While the state is UNKNOWN and Info reports no error, query Info again for up to unknownStateRetries attempts, sleeping 2 seconds after each UNKNOWN result (including the last one); if the state is still UNKNOWN afterwards, an error is set below. + for retry := 0; retry < unknownStateRetries; retry++ { + ctx.ps.StillRunning(agentName, warningTime, errorTime) + domainID, state, err = hyper.Task(status).Info(status.DomainName) + if err != nil || state != types.UNKNOWN { + break + } + log.Warnf("doActivateTail(%v) for %s: state is UNKNOWN, retry %d", status.UUIDandVersion, status.DisplayName, retry) + time.Sleep(2 * time.Second) + } + + // If the state is still UNKNOWN after the retries, we set an error because we + // cannot guarantee that the domain is running. 
+ if state == types.UNKNOWN && err == nil { + err = fmt.Errorf("The domain state is still unknown after %d retries", unknownStateRetries) + } if err != nil { // Immediate failure treat as above @@ -1827,6 +1844,7 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus, status.DomainId, status.BootTime.Format(time.RFC3339Nano), status.Key()) } + status.Activated = true log.Functionf("doActivateTail(%v) done for %s", status.UUIDandVersion, status.DisplayName) diff --git a/pkg/pillar/containerd/containerd.go b/pkg/pillar/containerd/containerd.go index b8ebfbfa19..a7cca0a0e9 100644 --- a/pkg/pillar/containerd/containerd.go +++ b/pkg/pillar/containerd/containerd.go @@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err) } + if stat.Status == "unknown" { + logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t) + } return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil } diff --git a/pkg/pillar/hypervisor/containerd.go b/pkg/pillar/hypervisor/containerd.go index a1ed69eca3..f46c87c8fb 100644 --- a/pkg/pillar/hypervisor/containerd.go +++ b/pkg/pillar/hypervisor/containerd.go @@ -231,12 +231,19 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) { defer done() effectiveDomainID, exit, status, err := ctx.ctrdClient.CtrContainerInfo(ctrdCtx, domainName) if err != nil { - return 0, types.UNKNOWN, logError("containerd looking up domain %s resulted in %v", domainName, err) + return 0, 0, logError("containerd looking up domain %s resulted in %v", domainName, err) } if status == "stopped" && exit != 0 { return 0, types.BROKEN, logError("task broke with exit status %d", exit) } + // When the status is "unknown", it typically indicates a 
communication issue between containerd and the task. + // This is generally a temporary state, so rather than returning an error here, the status is only logged and then mapped to types.UNKNOWN by the stateMap lookup below; callers are presumably expected to keep the last known valid state (not visible in this hunk). + // The goal is to keep the application running without marking it as broken or terminating it unnecessarily. + // TODO: Send an alert to the user that the task is in an unknown state, even after the retries. + if status == "unknown" { + logrus.Errorf("task %s is in %s state", domainName, status) + } stateMap := map[string]types.SwState{ "created": types.INSTALLED, @@ -244,7 +251,9 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) { "pausing": types.PAUSING, "paused": types.PAUSED, "stopped": types.HALTED, + "unknown": types.UNKNOWN, } + if effectiveDomainState, matched := stateMap[status]; !matched { err := fmt.Errorf("task %s happens to be in an unexpected state %s", domainName, status)