Skip to content

Commit

Permalink
Keep the applications alive when getting UNKNOWN state from the conta…
Browse files Browse the repository at this point in the history
…inerd tasks

When a containerd task status for a VM transitions to UNKNOWN, Pillar sends a graceful shutdown command to that VM, even though the VM is still functioning properly.
This behavior causes unnecessary interruptions. Upon investigation, we confirmed via strace that the VM processes remain stable until the shutdown signal is issued.

To address this, we implemented a patch that recognizes the UNKNOWN status without shutting down the VMs.
Additionally, we introduced a retry mechanism (up to 10 attempts, 2 seconds apart) to check whether the UNKNOWN status is only transient before taking any action.

Signed-off-by: Konstantinos Perakis <konstantinos@zededa.com>
  • Loading branch information
cperakis authored and kperakis-zededa committed Jan 30, 2025
1 parent c1fa6c1 commit 951bfdd
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pkg/pillar/containerd/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i
return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err)
}

if stat.Status == "unknown" {
logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t)
}
return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil
}

Expand Down
31 changes: 30 additions & 1 deletion pkg/pillar/hypervisor/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ const (

//ContainerdHypervisorName is a name of containerd hypervisor
ContainerdHypervisorName = "containerd"

// retryCount is the number of times to retry getting the state of a task
retryCount = 10
)

type ctrdContext struct {
Expand Down Expand Up @@ -231,20 +234,46 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) {
defer done()
effectiveDomainID, exit, status, err := ctx.ctrdClient.CtrContainerInfo(ctrdCtx, domainName)
if err != nil {
return 0, types.UNKNOWN, logError("containerd looking up domain %s resulted in %v", domainName, err)
return 0, 0, logError("containerd looking up domain %s resulted in %v", domainName, err)
}

// If the VM state is unknown, retry up to retryCount times to get the state. This handles the case
// where the unknown state is only transient.
if status == "unknown" {
for i := 1; i <= retryCount; i++ {
time.Sleep(2 * time.Second)
logrus.Infof("task %s is in %s state, retrying %d from %d", domainName, status, i, retryCount)
effectiveDomainID, exit, status, err = ctx.ctrdClient.CtrContainerInfo(ctrdCtx, domainName)
if err != nil {
return 0, 0, logError("containerd looking up domain %s resulted in %v", domainName, err)
}
// If the VM state is no longer unknown, exit the loop.
if status != "unknown" {
break
}
}
}

if status == "stopped" && exit != 0 {
return 0, types.BROKEN, logError("task broke with exit status %d", exit)
}
// When the status is "unknown", it typically indicates a communication issue between containerd and the task.
// This is generally a temporary state, so rather than returning an error, we’ll maintain the last known valid state.
// The goal is to keep the application running without marking it as broken or terminating it unnecessarily.
// TODO: Send an alert to the user when the task is still in an unknown state after all retries.
if status == "unknown" {
logrus.Errorf("task %s is in %s state, after %d retries", domainName, status, retryCount)
}

stateMap := map[string]types.SwState{
"created": types.INSTALLED,
"running": types.RUNNING,
"pausing": types.PAUSING,
"paused": types.PAUSED,
"stopped": types.HALTED,
"unknown": types.UNKNOWN,
}

if effectiveDomainState, matched := stateMap[status]; !matched {
err := fmt.Errorf("task %s happens to be in an unexpected state %s",
domainName, status)
Expand Down

0 comments on commit 951bfdd

Please sign in to comment.