Skip to content

Commit

Permalink
Keep the applications alive when getting UNKNOWN state from the containerd tasks
Browse files Browse the repository at this point in the history

When a containerd task status for a VM transitions to UNKNOWN, Pillar sends a graceful shutdown command to that VM, even though the VM is still functioning properly.
This behavior causes unnecessary interruptions. Upon investigation, we confirmed via strace that the VM processes remain stable until the shutdown signal is issued.

To address this, we implemented a patch that recognizes the UNKNOWN status and keeps the VMs running instead of shutting them down.
Additionally, we introduced a retry mechanism to confirm whether the UNKNOWN status is only a transient change before taking any action.

Signed-off-by: Konstantinos Perakis <konstantinos@zededa.com>
  • Loading branch information
cperakis authored and kperakis-zededa committed Jan 31, 2025

Verified

This commit was signed with the committer’s verified signature.
jborean93 Jordan Borean
1 parent c1fa6c1 commit 09d5f92
Showing 3 changed files with 36 additions and 3 deletions.
24 changes: 21 additions & 3 deletions pkg/pillar/cmd/domainmgr/domainmgr.go
Original file line number Diff line number Diff line change
@@ -60,9 +60,10 @@ const (
ciDirname = runDirname + "/cloudinit" // For cloud-init images

// Time limits for event loop handlers
errorTime = 3 * time.Minute
warningTime = 40 * time.Second
casClientType = "containerd"
errorTime = 3 * time.Minute
warningTime = 40 * time.Second
casClientType = "containerd"
unknownStateRetries = 10
)

// Really a constant
@@ -1800,6 +1801,22 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,

status.State = types.RUNNING
domainID, state, err := hyper.Task(status).Info(status.DomainName)
// If status is unknown, get info again for some retries and if it will not be activated return error.
for retry := 0; retry <= unknownStateRetries; retry++ {
if err == nil && state == types.UNKNOWN {
log.Warnf("doActivateTail(%v) for %s: state is UNKNOWN, retry %d", status.UUIDandVersion, status.DisplayName, retry)
time.Sleep(2 * time.Second)
domainID, state, err = hyper.Task(status).Info(status.DomainName)
} else {
break
}
}

// if the state is still unknown after the retries we set an error, because we
// cannot guarantee that the domain is running.
if state == types.UNKNOWN && err == nil {
err = fmt.Errorf("The domain state is still unknown after %d retries", unknownStateRetries)
}

if err != nil {
// Immediate failure treat as above
@@ -1827,6 +1844,7 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
status.DomainId, status.BootTime.Format(time.RFC3339Nano),
status.Key())
}

status.Activated = true
log.Functionf("doActivateTail(%v) done for %s",
status.UUIDandVersion, status.DisplayName)
3 changes: 3 additions & 0 deletions pkg/pillar/containerd/containerd.go
Original file line number Diff line number Diff line change
@@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i
return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err)
}

if stat.Status == "unknown" {
logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t)
}
return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil
}

12 changes: 12 additions & 0 deletions pkg/pillar/hypervisor/containerd.go
Original file line number Diff line number Diff line change
@@ -23,6 +23,9 @@ const (

//ContainerdHypervisorName is a name of containerd hypervisor
ContainerdHypervisorName = "containerd"

// retryCount is the number of times to retry getting the state of a task
retryCount = 10
)

type ctrdContext struct {
@@ -237,14 +240,23 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) {
if status == "stopped" && exit != 0 {
return 0, types.BROKEN, logError("task broke with exit status %d", exit)
}
// When the status is "unknown", it typically indicates a communication issue between containerd and the task.
// This is generally a temporary state, so rather than returning an error, we’ll maintain the last known valid state.
// The goal is to keep the application running without marking it as broken or terminating it unnecessarily.
// Todo: Send an alert to the user that the task is in an unknown state, even after the retries.
if status == "unknown" {
logrus.Errorf("task %s is in %s state", domainName, status)
}

stateMap := map[string]types.SwState{
"created": types.INSTALLED,
"running": types.RUNNING,
"pausing": types.PAUSING,
"paused": types.PAUSED,
"stopped": types.HALTED,
"unknown": types.UNKNOWN,
}

if effectiveDomainState, matched := stateMap[status]; !matched {
err := fmt.Errorf("task %s happens to be in an unexpected state %s",
domainName, status)

0 comments on commit 09d5f92

Please sign in to comment.