diff --git a/pkg/pillar/containerd/containerd.go b/pkg/pillar/containerd/containerd.go index b8ebfbfa19..a7cca0a0e9 100644 --- a/pkg/pillar/containerd/containerd.go +++ b/pkg/pillar/containerd/containerd.go @@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err) } + if stat.Status == "unknown" { + logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t) + } return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil } diff --git a/pkg/pillar/hypervisor/containerd.go b/pkg/pillar/hypervisor/containerd.go index a1ed69eca3..0e229c9afe 100644 --- a/pkg/pillar/hypervisor/containerd.go +++ b/pkg/pillar/hypervisor/containerd.go @@ -23,6 +23,9 @@ const ( //ContainerdHypervisorName is a name of containerd hypervisor ContainerdHypervisorName = "containerd" + + // retryCount is the number of times to retry getting the state of a task + retryCount = 10 ) type ctrdContext struct { @@ -231,12 +234,36 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) { defer done() effectiveDomainID, exit, status, err := ctx.ctrdClient.CtrContainerInfo(ctrdCtx, domainName) if err != nil { - return 0, types.UNKNOWN, logError("containerd looking up domain %s resulted in %v", domainName, err) + return 0, 0, logError("containerd looking up domain %s resulted in %v", domainName, err) + } + + // if the VM state is unknown, we will retry 10 times to get the state. This is to handle the case + // where the VM is in a transient state. 
+ if status == "unknown" { + for i := 1; i <= retryCount; i++ { + time.Sleep(2 * time.Second) + logrus.Infof("task %s is in %s state, retrying %d from %d", domainName, status, i, retryCount) + effectiveDomainID, exit, status, err = ctx.ctrdClient.CtrContainerInfo(ctrdCtx, domainName) + if err != nil { + return 0, 0, logError("containerd looking up domain %s resulted in %v", domainName, err) + } + // If the task state is no longer unknown, exit the loop. + if status != "unknown" { + break + } + } } if status == "stopped" && exit != 0 { return 0, types.BROKEN, logError("task broke with exit status %d", exit) } + // When the status is "unknown", it typically indicates a communication issue between containerd and the task. + // This is generally a temporary state, so rather than returning an error, we'll maintain the last known valid state. + // The goal is to keep the application running without marking it as broken or terminating it unnecessarily. + // TODO: Send an alert to the user that the task is in an unknown state, even after the retries. + if status == "unknown" { + logrus.Errorf("task %s is in %s state, after %d retries", domainName, status, retryCount) + } stateMap := map[string]types.SwState{ "created": types.INSTALLED, @@ -244,7 +271,9 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) { "pausing": types.PAUSING, "paused": types.PAUSED, "stopped": types.HALTED, + "unknown": types.UNKNOWN, } + if effectiveDomainState, matched := stateMap[status]; !matched { err := fmt.Errorf("task %s happens to be in an unexpected state %s", domainName, status)