Skip to content

Commit

Permalink
Allow device to have multiple reasons for being in maintenance mode
Browse files Browse the repository at this point in the history
This change allows the device to have multiple reasons for being in
maintenance mode. This is useful when multiple conditions are met that
require the device to be in maintenance mode. For example, if the TPM
is in error and the device disk is full, the device can be in
maintenance mode for both reasons. Clearing one of the reasons will not
take the device out of maintenance mode if there are other reasons for
it to be in maintenance mode.

Signed-off-by: Shahriyar Jalayeri <shahriyar@zededa.com>
  • Loading branch information
shjala committed Jan 22, 2025
1 parent 2c2d305 commit b2e4070
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 61 deletions.
5 changes: 1 addition & 4 deletions pkg/pillar/cmd/nodeagent/handletimers.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,9 @@ func handleRebootOnVaultLocked(ctxPtr *nodeagentContext) {
scheduleNodeOperation(ctxPtr, errStr, types.BootReasonVaultFailure,
types.DeviceOperationReboot)
} else {
log.Noticef("Setting %s",
types.MaintenanceModeReasonVaultLockedUp)
// there is no image update in progress, this happened after a normal
// reboot. enter maintenance mode
ctxPtr.maintMode = true
ctxPtr.maintModeReason = types.MaintenanceModeReasonVaultLockedUp
setMaintenanceModeReason(ctxPtr, types.MaintenanceModeReasonVaultLockedUp, "handleRebootOnVaultLocked")
publishNodeAgentStatus(ctxPtr)
}
} else {
Expand Down
66 changes: 30 additions & 36 deletions pkg/pillar/cmd/nodeagent/nodeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -780,14 +780,8 @@ func handleVaultStatusImpl(ctxArg interface{}, key string,
if vault.ConversionComplete {
ctx.vaultOperational = types.TS_ENABLED
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonVaultLockedUp {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonVaultLockedUp)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
publishNodeAgentStatus(ctx)
}
maybeClearMaintenanceModeReason(ctx, types.MaintenanceModeReasonVaultLockedUp, "handleVaultStatusImpl")
publishNodeAgentStatus(ctx)
} else {
ctx.vaultOperational = types.TS_NONE
}
Expand All @@ -811,34 +805,19 @@ func handleVolumeMgrStatusImpl(ctxArg interface{}, key string,

ctx := ctxArg.(*nodeagentContext)
vms := statusArg.(types.VolumeMgrStatus)
changed := false
// This RemainingSpace takes into account the space reserved for
// /persist/newlog plus the percentage/minimum reserved for the rest
// of EVE-OS. Thus it can never go negative, but zero means that
// we neither have space to download new images nor space to deploy
// a tiny app instance.
if vms.RemainingSpace == 0 {
log.Warnf("MaintenanceMode due to no remaining diskspace")
// Do not overwrite a vault maintenance mode
if !ctx.maintMode {
log.Noticef("Setting %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintModeReason = types.MaintenanceModeReasonNoDiskSpace
ctx.maintMode = true
changed = true
}
// Add to maintenance mode reasons
setMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
} else {
// Do we need to clear maintenance?
if ctx.maintMode &&
ctx.maintModeReason == types.MaintenanceModeReasonNoDiskSpace {
log.Noticef("Clearing %s",
types.MaintenanceModeReasonNoDiskSpace)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
changed = true
}
}
if changed {
maybeClearMaintenanceModeReason(ctx, types.MaintenanceModeReasonNoDiskSpace, "handleVolumeMgrStatusImpl")
publishNodeAgentStatus(ctx)
}
}
Expand Down Expand Up @@ -880,16 +859,31 @@ func handleTpmStatusImpl(ctxArg interface{}, key string,

if tpm.Status == types.MaintenanceModeReasonTpmEncFailure {
log.Errorf("handleTpmStatusImpl: TPM manager reported TPM error : %s", tpm.Error)
log.Noticef("Setting %s", types.MaintenanceModeReasonTpmEncFailure)
ctx.maintMode = true
ctx.maintModeReason = types.MaintenanceModeReasonTpmEncFailure
setMaintenanceModeReason(ctx, types.MaintenanceModeReasonTpmEncFailure, "handleTpmStatusImpl")
publishNodeAgentStatus(ctx)
} else {
if ctx.maintMode && ctx.maintModeReason == types.MaintenanceModeReasonTpmEncFailure {
log.Noticef("Clearing %s", types.MaintenanceModeReasonTpmEncFailure)
ctx.maintMode = false
ctx.maintModeReason = types.MaintenanceModeReasonNone
publishNodeAgentStatus(ctx)
}
maybeClearMaintenanceModeReason(ctx, types.MaintenanceModeReasonTpmEncFailure, "handleTpmStatusImpl")
publishNodeAgentStatus(ctx)
}
}

// maybeClearMaintenanceModeReason drops reason from the maintenance mode
// reasons recorded in ctx, but only when every bit of reason is currently
// set in ctx.maintModeReason; otherwise it leaves ctx untouched.
// caller names the invoking function and is forwarded to
// clearMaintenanceModeReason.
func maybeClearMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	if ctx.maintModeReason&reason != reason {
		// reason is not fully recorded, so there is nothing to clear
		return
	}
	clearMaintenanceModeReason(ctx, reason, caller)
}

// setMaintenanceModeReason records reason as one of the reasons for being in
// maintenance mode by OR-ing it into ctx.maintModeReason, and flags ctx as
// being in maintenance mode. Reasons already recorded are kept.
// caller names the invoking function and only appears in the log message.
func setMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s setting %s", caller, reason)
	ctx.maintMode = true
	ctx.maintModeReason = ctx.maintModeReason | reason
}

// clearMaintenanceModeReason removes the bits of reason from
// ctx.maintModeReason. When the result equals
// types.MaintenanceModeReasonNone it also resets ctx.maintMode to false;
// otherwise ctx.maintMode is left as it was.
// caller names the invoking function and only appears in the log messages.
func clearMaintenanceModeReason(ctx *nodeagentContext, reason types.MaintenanceModeReason, caller string) {
	log.Noticef("%s clearing %s", caller, reason)
	remaining := ctx.maintModeReason &^ reason
	ctx.maintModeReason = remaining
	if remaining != types.MaintenanceModeReasonNone {
		// some other reason is still recorded, leave maintMode alone
		return
	}
	log.Noticef("%s : No reason to be in maintenance mode, clearing maintenance mode", caller)
	ctx.maintMode = false
}
12 changes: 6 additions & 6 deletions pkg/pillar/cmd/zedagent/parseconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
// Did MaintenanceMode change?
if ctx.apiMaintenanceMode != config.MaintenanceMode {
ctx.apiMaintenanceMode = config.MaintenanceMode
mergeMaintenanceMode(ctx)
mergeMaintenanceMode(ctx, "parseConfig")
}

// Did the ForceFallbackCounter change? If so we publish for
Expand Down Expand Up @@ -2733,7 +2733,7 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,
newMaintenanceMode := newGlobalConfig.GlobalValueTriState(types.MaintenanceMode)
if oldMaintenanceMode != newMaintenanceMode {
ctx.zedagentCtx.gcpMaintenanceMode = newMaintenanceMode
mergeMaintenanceMode(ctx.zedagentCtx)
mergeMaintenanceMode(ctx.zedagentCtx, "parseConfigItems")
}

pub := ctx.zedagentCtx.pubGlobalConfig
Expand All @@ -2749,7 +2749,7 @@ func parseConfigItems(ctx *getconfigContext, config *zconfig.EdgeDevConfig,

// mergeMaintenanceMode handles the configItem override (unless NONE)
// and the API setting
func mergeMaintenanceMode(ctx *zedagentContext) {
func mergeMaintenanceMode(ctx *zedagentContext, caller string) {
switch ctx.gcpMaintenanceMode {
case types.TS_ENABLED:
// Overrides everything, and sets maintenance mode
Expand All @@ -2765,13 +2765,13 @@ func mergeMaintenanceMode(ctx *zedagentContext) {
if ctx.apiMaintenanceMode {
// set reason as user requested
ctx.maintModeReason = types.MaintenanceModeReasonUserRequested
} else if ctx.localMaintenanceMode {
} else if ctx.maintModeReason != ctx.localMaintModeReason {
// set reason to reflect exact local reason
ctx.maintModeReason = ctx.localMaintModeReason
}
}
log.Noticef("Changed maintenanceMode to %t, with reason as %s, considering {%v, %v, %v}",
ctx.maintenanceMode, ctx.maintModeReason.String(), ctx.gcpMaintenanceMode,
log.Noticef("%s changed maintenanceMode to %t, with reason as %s, considering {%v, %v, %v}",
caller, ctx.maintenanceMode, ctx.maintModeReason.String(), ctx.gcpMaintenanceMode,
ctx.apiMaintenanceMode, ctx.localMaintenanceMode)
}

Expand Down
8 changes: 5 additions & 3 deletions pkg/pillar/cmd/zedagent/zedagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -2395,7 +2395,7 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
ctx.globalConfig = *gcp
ctx.GCInitialized = true
ctx.gcpMaintenanceMode = gcp.GlobalValueTriState(types.MaintenanceMode)
mergeMaintenanceMode(ctx)
mergeMaintenanceMode(ctx, "handleGlobalConfigImpl")
reinitNetdumper(ctx)
}

Expand Down Expand Up @@ -2520,10 +2520,12 @@ func handleNodeAgentStatusImpl(ctxArg interface{}, key string,
if status.DevicePoweroff {
handleDeviceOperation(ctx, types.DeviceOperationPoweroff)
}
if ctx.localMaintenanceMode != status.LocalMaintenanceMode {
if ctx.localMaintenanceMode != status.LocalMaintenanceMode ||
ctx.localMaintModeReason != status.LocalMaintenanceModeReason {
ctx.localMaintenanceMode = status.LocalMaintenanceMode
ctx.localMaintModeReason = status.LocalMaintenanceModeReason
mergeMaintenanceMode(ctx)

mergeMaintenanceMode(ctx, "handleNodeAgentStatusImpl")
}

if naHasRealChange(*getconfigCtx.NodeAgentStatus, status) {
Expand Down
31 changes: 19 additions & 12 deletions pkg/pillar/types/zedagenttypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,20 +401,27 @@ const (

// String returns the verbose equivalent of MaintenanceModeReason code
func (mmr MaintenanceModeReason) String() string {
switch mmr {
case MaintenanceModeReasonNone:
reason := []string{}
if mmr == MaintenanceModeReasonNone {
return "MaintenanceModeReasonNone"
case MaintenanceModeReasonUserRequested:
return "MaintenanceModeReasonUserRequested"
case MaintenanceModeReasonVaultLockedUp:
return "MaintenanceModeReasonVaultLockedUp"
case MaintenanceModeReasonNoDiskSpace:
return "MaintenanceModeReasonNoDiskSpace"
case MaintenanceModeReasonTpmEncFailure:
return "MaintenanceModeReasonTpmEncFailure"
default:
return fmt.Sprintf("Unknown MaintenanceModeReason %d", mmr)
}
if (mmr & MaintenanceModeReasonUserRequested) == MaintenanceModeReasonUserRequested {
reason = append(reason, "MaintenanceModeReasonUserRequested")
}
if (mmr & MaintenanceModeReasonVaultLockedUp) == MaintenanceModeReasonVaultLockedUp {
reason = append(reason, "MaintenanceModeReasonVaultLockedUp")
}
if (mmr & MaintenanceModeReasonNoDiskSpace) == MaintenanceModeReasonNoDiskSpace {
reason = append(reason, "MaintenanceModeReasonNoDiskSpace")
}
if (mmr & MaintenanceModeReasonTpmEncFailure) == MaintenanceModeReasonTpmEncFailure {
reason = append(reason, "MaintenanceModeReasonTpmEncFailure")
}
if len(reason) == 0 {
return "Unknown MaintenanceModeReason"
}

return strings.Join(reason, "|")
}

// NodeAgentStatus :
Expand Down

0 comments on commit b2e4070

Please sign in to comment.