Keep the applications alive when getting UNKNOWN state from the containerd tasks

When a containerd task status for a VM transitions to UNKNOWN, Pillar sends a graceful shutdown command to that VM, even though the VM is still functioning properly. This behavior causes unnecessary interruptions. Upon investigation, we confirmed via strace that the VM processes remain stable until the shutdown signal is issued. To address this, we implemented a patch that recognizes the UNKNOWN status without shutting down the VMs. Additionally, we introduced a retry mechanism to confirm whether the status is a temporary, momentary change before taking any action.

Signed-off-by: Konstantinos Perakis <konstantinos@zededa.com>
lf-edge · Jan 31, 2025 · 09d95fb · 09d95fb
1 parent c1fa6c1
commit 09d95fb
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 4 deletions.
diff --git a/pkg/pillar/cmd/domainmgr/domainmgr.go b/pkg/pillar/cmd/domainmgr/domainmgr.go
@@ -60,9 +60,10 @@ const (
 	ciDirname  = runDirname + "/cloudinit" // For cloud-init images
 
 	// Time limits for event loop handlers
-	errorTime     = 3 * time.Minute
-	warningTime   = 40 * time.Second
-	casClientType = "containerd"
+	errorTime           = 3 * time.Minute
+	warningTime         = 40 * time.Second
+	casClientType       = "containerd"
+	unknownStateRetries = 10
 )
 
 // Really a constant
@@ -1799,7 +1800,23 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
 	status.VifList = checkIfEmu(status.VifList)
 
 	status.State = types.RUNNING
-	domainID, state, err := hyper.Task(status).Info(status.DomainName)
+	var state types.SwState
+	// If status is unknown, get info again for some retries and if it will not be activated return error.
+	for retry := 0; retry < unknownStateRetries; retry++ {
+		ctx.ps.StillRunning(agentName, warningTime, errorTime)
+		domainID, state, err = hyper.Task(status).Info(status.DomainName)
+		if err != nil || state != types.UNKNOWN {
+			break
+		}
+		log.Warnf("doActivateTail(%v) for %s: state is UNKNOWN, retry %d", status.UUIDandVersion, status.DisplayName, retry)
+		time.Sleep(2 * time.Second)
+	}
+
+	// if the state is still unknown after the retries we set an error, because we
+	// cannot guarantee that the domain is running.
+	if state == types.UNKNOWN && err == nil {
+		err = fmt.Errorf("The domain state is still unknown after %d retries", unknownStateRetries)
+	}
 
 	if err != nil {
 		// Immediate failure treat as above
@@ -1827,6 +1844,7 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
 			status.DomainId, status.BootTime.Format(time.RFC3339Nano),
 			status.Key())
 	}
+
 	status.Activated = true
 	log.Functionf("doActivateTail(%v) done for %s",
 		status.UUIDandVersion, status.DisplayName)

diff --git a/pkg/pillar/containerd/containerd.go b/pkg/pillar/containerd/containerd.go
@@ -513,6 +513,9 @@ func (client *Client) CtrContainerInfo(ctx context.Context, name string) (int, i
 		return 0, 0, "", fmt.Errorf("CtrContainerInfo: couldn't determine task status for container %s: %v", name, err)
 	}
 
+	if stat.Status == "unknown" {
+		logrus.Infof("CtrContainerInfo: PID of the task in container %s is %d, exit code is %d, status is %s and the task object (%v)", name, int(t.Pid()), int(stat.ExitStatus), stat.Status, t)
+	}
 	return int(t.Pid()), int(stat.ExitStatus), string(stat.Status), nil
 }
 

diff --git a/pkg/pillar/hypervisor/containerd.go b/pkg/pillar/hypervisor/containerd.go
@@ -237,14 +237,23 @@ func (ctx ctrdContext) Info(domainName string) (int, types.SwState, error) {
 	if status == "stopped" && exit != 0 {
 		return 0, types.BROKEN, logError("task broke with exit status %d", exit)
 	}
+	// When the status is "unknown", it typically indicates a communication issue between containerd and the task.
+	// This is generally a temporary state, so rather than returning an error, we’ll maintain the last known valid state.
+	// The goal is to keep the application running without marking it as broken or terminating it unnecessarily.
+	// Todo: Send an alert to the user that the task is in an unknown state, even after the retries.
+	if status == "unknown" {
+		logrus.Errorf("task %s is in %s state", domainName, status)
+	}
 
 	stateMap := map[string]types.SwState{
 		"created": types.INSTALLED,
 		"running": types.RUNNING,
 		"pausing": types.PAUSING,
 		"paused":  types.PAUSED,
 		"stopped": types.HALTED,
+		"unknown": types.UNKNOWN,
 	}
+
 	if effectiveDomainState, matched := stateMap[status]; !matched {
 		err := fmt.Errorf("task %s happens to be in an unexpected state %s",
 			domainName, status)