Skip to content

Commit

Permalink
Keep the applications alive when getting UNKNOWN state from the containerd tasks
Browse files Browse the repository at this point in the history

When a containerd task status for a VM transitions to UNKNOWN, Pillar sends a graceful shutdown command to that VM, even though the VM is still functioning properly.
This behavior causes unnecessary interruptions. Upon investigation, we confirmed via strace that the VM processes remain stable until the shutdown signal is issued.

To address this, we implemented a patch that recognizes the UNKNOWN status without shutting down the VMs.
Additionally, we introduced a retry mechanism that re-checks the task status several times to confirm whether UNKNOWN is only a transient condition before taking any action.

Signed-off-by: Konstantinos Perakis <konstantinos@zededa.com>
  • Loading branch information
cperakis authored and kperakis-zededa committed Jan 31, 2025
1 parent c1fa6c1 commit 09d95fb
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 4 deletions.
26 changes: 22 additions & 4 deletions pkg/pillar/cmd/domainmgr/domainmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ const (
ciDirname = runDirname + "/cloudinit" // For cloud-init images

// Time limits for event loop handlers
errorTime = 3 * time.Minute
warningTime = 40 * time.Second
casClientType = "containerd"
errorTime = 3 * time.Minute
warningTime = 40 * time.Second
casClientType = "containerd"
unknownStateRetries = 10
)

// Really a constant
Expand Down Expand Up @@ -1799,7 +1800,23 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
status.VifList = checkIfEmu(status.VifList)

status.State = types.RUNNING
domainID, state, err := hyper.Task(status).Info(status.DomainName)
var state types.SwState
// If the state is UNKNOWN, query the task info again (up to unknownStateRetries times); if it is still UNKNOWN afterwards, an error is set below.
for retry := 0; retry < unknownStateRetries; retry++ {
ctx.ps.StillRunning(agentName, warningTime, errorTime)
domainID, state, err = hyper.Task(status).Info(status.DomainName)
if err != nil || state != types.UNKNOWN {
break
}
log.Warnf("doActivateTail(%v) for %s: state is UNKNOWN, retry %d", status.UUIDandVersion, status.DisplayName, retry)
time.Sleep(2 * time.Second)
}

// if the state is still unknown after the retries we set an error, because we
// cannot guarantee that the domain is running.
if state == types.UNKNOWN && err == nil {
err = fmt.Errorf("The domain state is still unknown after %d retries", unknownStateRetries)
}

if err != nil {
// Immediate failure treat as above
Expand Down Expand Up @@ -1827,6 +1844,7 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
status.DomainId, status.BootTime.Format(time.RFC3339Nano),
status.Key())
}

status.Activated = true
log.Functionf("doActivateTail(%v) done for %s",
status.UUIDandVersion, status.DisplayName)
Expand Down
3 changes: 3 additions & 0 deletions pkg/pillar/containerd/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i
return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err)
}

if stat.Status == "unknown" {
logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t)
}
return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil
}

Expand Down
9 changes: 9 additions & 0 deletions pkg/pillar/hypervisor/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,14 +237,23 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) {
if status == "stopped" && exit != 0 {
return 0, types.BROKEN, logError("task broke with exit status %d", exit)
}
// When the status is "unknown", it typically indicates a communication issue between containerd and the task.
// This is generally a temporary state, so rather than returning an error, we’ll maintain the last known valid state.
// The goal is to keep the application running without marking it as broken or terminating it unnecessarily.
// TODO: send an alert to the user when the task is still in an unknown state after the retries.
if status == "unknown" {
logrus.Errorf("task %s is in %s state", domainName, status)
}

stateMap := map[string]types.SwState{
"created": types.INSTALLED,
"running": types.RUNNING,
"pausing": types.PAUSING,
"paused": types.PAUSED,
"stopped": types.HALTED,
"unknown": types.UNKNOWN,
}

if effectiveDomainState, matched := stateMap[status]; !matched {
err := fmt.Errorf("task %s happens to be in an unexpected state %s",
domainName, status)
Expand Down

0 comments on commit 09d95fb

Please sign in to comment.