diff --git a/pkg/pillar/cmd/volumemgr/handlevolume.go b/pkg/pillar/cmd/volumemgr/handlevolume.go
index bdcee3789e..bd9a6d86f0 100644
--- a/pkg/pillar/cmd/volumemgr/handlevolume.go
+++ b/pkg/pillar/cmd/volumemgr/handlevolume.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"time"
 
+	zconfig "github.com/lf-edge/eve-api/go/config"
 	"github.com/lf-edge/eve/pkg/pillar/types"
 	"github.com/lf-edge/eve/pkg/pillar/vault"
 	"github.com/lf-edge/eve/pkg/pillar/volumehandlers"
@@ -35,6 +36,7 @@ func handleVolumeModify(ctxArg interface{}, key string,
 	log.Functionf("handleVolumeModify(%s)", key)
 	config := configArg.(types.VolumeConfig)
 	ctx := ctxArg.(*volumemgrContext)
+
 	if _, deferred := ctx.volumeConfigCreateDeferredMap[key]; deferred {
 		//update deferred creation if exists
 		ctx.volumeConfigCreateDeferredMap[key] = &config
@@ -77,6 +79,7 @@ func handleVolumeDelete(ctxArg interface{}, key string,
 	log.Functionf("handleVolumeDelete(%s)", key)
 	config := configArg.(types.VolumeConfig)
 	ctx := ctxArg.(*volumemgrContext)
+
 	if _, deferred := ctx.volumeConfigCreateDeferredMap[key]; deferred {
 		//remove deferred creation if exists
 		delete(ctx.volumeConfigCreateDeferredMap, key)
@@ -98,6 +101,10 @@ func handleDeferredVolumeCreate(ctx *volumemgrContext, key string, config *types
 	log.Tracef("handleDeferredVolumeCreate(%s)", key)
 	status := ctx.LookupVolumeStatus(config.Key())
 	if status != nil {
+		if config.IsReplicated {
+			// Objects are replicated across cluster nodes, just exit.
+			return
+		}
 		log.Fatalf("status exists at handleVolumeCreate for %s", config.Key())
 	}
 	status = &types.VolumeStatus{
@@ -116,6 +123,8 @@ func handleDeferredVolumeCreate(ctx *volumemgrContext, key string, config *types
 		LastRefCountChangeTime: time.Now(),
 		LastUse:                time.Now(),
 		State:                  types.INITIAL,
+		IsReplicated:           config.IsReplicated,
+		IsNativeContainer:      config.IsNativeContainer,
 	}
 	updateVolumeStatusRefCount(ctx, status)
 	log.Noticef("handleDeferredVolumeCreate(%s) setting contentFormat to %s", key, volumeFormat[status.Key()])
@@ -152,6 +161,18 @@ func handleDeferredVolumeCreate(ctx *volumemgrContext, key string, config *types
 		status.TotalSize = int64(actualSize)
 		status.CurrentSize = int64(actualSize)
 	}
+
+	// Fill the ReferenceName which will be used by domainmgr to launch native containers.
+	ctStatus := ctx.LookupContentTreeStatus(status.ContentID.String())
+
+	if ctStatus != nil {
+		status.ReferenceName = ctStatus.ReferenceID()
+		// In kubevirt EVE, even though we created a PVC from the container image, we still set the content format to container.
+		// This helps domainmgr load the external boot kernel (to support shim-VM containers).
+		if ctStatus.Format == zconfig.Format_CONTAINER {
+			status.ContentFormat = zconfig.Format_CONTAINER
+		}
+	}
 	publishVolumeStatus(ctx, status)
 	updateVolumeRefStatus(ctx, status)
 	if err := createOrUpdateAppDiskMetrics(ctx, status); err != nil {
diff --git a/pkg/pillar/cmd/zedagent/handlecontent.go b/pkg/pillar/cmd/zedagent/handlecontent.go
index 8317e8387a..5c02b73822 100644
--- a/pkg/pillar/cmd/zedagent/handlecontent.go
+++ b/pkg/pillar/cmd/zedagent/handlecontent.go
@@ -96,6 +96,24 @@ func parseContentInfoConfig(ctx *getconfigContext,
 		contentConfig.MaxDownloadSize = cfgContentTree.GetMaxSizeBytes()
 		contentConfig.DisplayName = cfgContentTree.GetDisplayName()
 		contentConfig.CustomMeta = cfgContentTree.GetCustomMetaData()
+		contentConfig.IsLocal = true
+		controllerDNID := cfgContentTree.GetDesignatedNodeId()
+		// If this node is not the designated node, set IsLocal to false.
+		// Content will be downloaded only to the designated node of that content tree.
+		// So on the other nodes in the cluster mark the content tree as non-local.
+		// On single node EVE, either kvm or kubevirt based, this node will always be the designated node.
+		// But if this is the content tree of a container, we download it to all nodes of the cluster,
+		// so set IsLocal to true in that case.
+		if controllerDNID != "" && controllerDNID != devUUID.String() {
+			if contentConfig.Format == zconfig.Format_CONTAINER {
+				contentConfig.IsLocal = true
+			} else {
+				contentConfig.IsLocal = false
+			}
+		}
+
+		log.Noticef("parseContentInfo designated ID copy from content config: %v, contentid %v, url %s", controllerDNID, contentConfig.ContentID, contentConfig.RelativeURL)
+
 		publishContentTreeConfig(ctx, *contentConfig)
 	}
 	ctx.pubContentTreeConfig.SignalRestarted()
diff --git a/pkg/pillar/cmd/zedagent/handlemetrics.go b/pkg/pillar/cmd/zedagent/handlemetrics.go
index 68cc88aabe..52967f0a99 100644
--- a/pkg/pillar/cmd/zedagent/handlemetrics.go
+++ b/pkg/pillar/cmd/zedagent/handlemetrics.go
@@ -1040,6 +1040,7 @@ func PublishAppInfoToZedCloud(ctx *zedagentContext, uuid string,
 	ReportAppInfo.AppID = uuid
 	ReportAppInfo.SystemApp = false
+	ReportAppInfo.ClusterAppRunning = false
 
 	if aiStatus != nil {
 		// In cluster mode, if ENClusterAppStatus reports the app is not scheduled on the node,
@@ -1152,6 +1153,10 @@ func PublishAppInfoToZedCloud(ctx *zedagentContext, uuid string,
 			snapInfo.SnapErr = encodeErrorInfo(snap.Error)
 			ReportAppInfo.Snapshots = append(ReportAppInfo.Snapshots, snapInfo)
 		}
+
+		// For clustered apps on HV=kubevirt, 'ClusterAppRunning' designates that
+		// the app is running on this node, either naturally or after some failover event.
+		ReportAppInfo.ClusterAppRunning = aiStatus.Activated
 	}
 
 	ReportInfo.InfoContent = new(info.ZInfoMsg_Ainfo)
diff --git a/pkg/pillar/cmd/zedagent/handlevolume.go b/pkg/pillar/cmd/zedagent/handlevolume.go
index db05a92e15..95cecb97c0 100644
--- a/pkg/pillar/cmd/zedagent/handlevolume.go
+++ b/pkg/pillar/cmd/zedagent/handlevolume.go
@@ -106,6 +106,29 @@ func parseVolumeConfig(ctx *getconfigContext,
 		// Add config submitted via local profile server.
 		addLocalVolumeConfig(ctx, volumeConfig)
 
+		controllerDNID := cfgVolume.GetDesignatedNodeId()
+		// If this node is the designated node, set IsReplicated to false.
+		// On single node EVE, either kvm or kubevirt based, this node will always be the designated node.
+		if controllerDNID != "" && controllerDNID != devUUID.String() {
+			volumeConfig.IsReplicated = true
+		} else {
+			volumeConfig.IsReplicated = false
+		}
+
+		// Iterate through the app config and check if this volume belongs to a native container deployment.
+		// Look for the NOHYPER type in VirtualizationMode.
+		appInstanceList := config.GetApps()
+		for _, ai := range appInstanceList {
+			if ai.Fixedresources.VirtualizationMode == zconfig.VmMode_NOHYPER {
+				for _, vr := range ai.VolumeRefList {
+					if vr.Uuid == volumeConfig.VolumeID.String() && volumeConfig.ContentID != uuid.Nil {
+						volumeConfig.IsNativeContainer = true
+						log.Noticef("parseVolumeConfig: setting IsNativeContainer for %s", volumeConfig.VolumeID.String())
+						break
+					}
+				}
+			}
+		}
 		publishVolumeConfig(ctx, *volumeConfig)
 	}
diff --git a/pkg/pillar/cmd/zedagent/parseconfig.go b/pkg/pillar/cmd/zedagent/parseconfig.go
index d68f15ead2..eeb43a6658 100644
--- a/pkg/pillar/cmd/zedagent/parseconfig.go
+++ b/pkg/pillar/cmd/zedagent/parseconfig.go
@@ -651,6 +651,8 @@ func parseAppInstanceConfig(getconfigCtx *getconfigContext,
 		appinstancePrevConfigHash, configHash, Apps)
 	appinstancePrevConfigHash = configHash
 
+	devUUIDStr := config.GetId().Uuid
+
 	// First look for deleted ones
 	items := getconfigCtx.pubAppInstanceConfig.GetAll()
 	for uuidStr := range items {
@@ -768,9 +770,14 @@ func parseAppInstanceConfig(getconfigCtx *getconfigContext,
 		// Add config submitted via local profile server.
 		addLocalAppConfig(getconfigCtx, &appInstance)
 
-		// XXX add Designated ID to the appInstance
-		// XXX Keep this here for now to allow the kubevirt single-node working, the later PR to EVE main will remove this
-		appInstance.DesignatedNodeID = devUUID
+		controllerDNID := cfgApp.GetDesignatedNodeId()
+		// If this node is the designated node, set IsDesignatedNodeID to true, else false.
+		// On single node EVE, either kvm or kubevirt based, this node will always be the designated node.
+		if controllerDNID != "" && controllerDNID != devUUIDStr {
+			appInstance.IsDesignatedNodeID = false
+		} else {
+			appInstance.IsDesignatedNodeID = true
+		}
 
 		// Verify that it fits and if not publish with error
 		checkAndPublishAppInstanceConfig(getconfigCtx, appInstance)
diff --git a/pkg/pillar/cmd/zedagent/reportinfo.go b/pkg/pillar/cmd/zedagent/reportinfo.go
index 75a7ac5a75..039c3f12bc 100644
--- a/pkg/pillar/cmd/zedagent/reportinfo.go
+++ b/pkg/pillar/cmd/zedagent/reportinfo.go
@@ -128,10 +128,13 @@ func objectInfoTask(ctxPtr *zedagentContext, triggerInfo <-chan infoForObjectKey
 			sub := ctxPtr.getconfigCtx.subContentTreeStatus
 			if c, err = sub.Get(infoForKeyMessage.objectKey); err == nil {
 				ctStatus := c.(types.ContentTreeStatus)
-				uuidStr := ctStatus.Key()
-				PublishContentInfoToZedCloud(ctxPtr, uuidStr, &ctStatus,
-					ctxPtr.iteration, infoDest)
-				ctxPtr.iteration++
+				// We publish the info to zedcloud only if it is a local content tree
+				if ctStatus.IsLocal {
+					uuidStr := ctStatus.Key()
+					PublishContentInfoToZedCloud(ctxPtr, uuidStr, &ctStatus,
+						ctxPtr.iteration, infoDest)
+					ctxPtr.iteration++
+				}
 			}
 		case info.ZInfoTypes_ZiBlobList: // publish blob info
diff --git a/pkg/pillar/cmd/zedkube/applogs.go b/pkg/pillar/cmd/zedkube/applogs.go
index 85b1b2900d..a994ad0b6d 100644
--- a/pkg/pillar/cmd/zedkube/applogs.go
+++ b/pkg/pillar/cmd/zedkube/applogs.go
@@ -17,7 +17,6 @@ import (
 	"github.com/lf-edge/eve/pkg/pillar/base"
 	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
 	"github.com/lf-edge/eve/pkg/pillar/types"
-	uuid "github.com/satori/go.uuid"
 	"github.com/sirupsen/logrus"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -48,7 +47,7 @@ func (z *zedkube) collectAppLogs() {
 		if aiconfig.FixedResources.VirtualizationMode != types.NOHYPER {
 			continue
 		}
-		if aiconfig.DesignatedNodeID != uuid.Nil && aiconfig.DesignatedNodeID.String() != z.nodeuuid {
+		if !aiconfig.IsDesignatedNodeID {
 			continue
 		}
 		kubeName := base.GetAppKubeName(aiconfig.DisplayName, aiconfig.UUIDandVersion.UUID)
@@ -121,11 +120,6 @@ func (z *zedkube) checkAppsStatus() {
 		return
 	}
 
-	u, err := uuid.FromString(z.nodeuuid)
-	if err != nil {
-		return
-	}
-
 	clientset, err := getKubeClientSet()
 	if err != nil {
 		log.Errorf("checkAppsStatus: can't get clientset %v", err)
@@ -146,12 +140,12 @@ func (z *zedkube) checkAppsStatus() {
 	var oldStatus *types.ENClusterAppStatus
 	for _, item := range items {
 		aiconfig := item.(types.AppInstanceConfig)
-		if aiconfig.DesignatedNodeID == uuid.Nil { // if not for cluster app, skip
+		if !aiconfig.IsDesignatedNodeID { // if not for cluster app, skip
 			continue
 		}
 		encAppStatus := types.ENClusterAppStatus{
 			AppUUID:    aiconfig.UUIDandVersion.UUID,
-			IsDNidNode: aiconfig.DesignatedNodeID == u,
+			IsDNidNode: aiconfig.IsDesignatedNodeID,
 		}
 
 		contName := base.GetAppKubeName(aiconfig.DisplayName, aiconfig.UUIDandVersion.UUID)
diff --git a/pkg/pillar/cmd/zedmanager/handleclusterapp.go b/pkg/pillar/cmd/zedmanager/handleclusterapp.go
index 4a67c3415a..7337e9a04a 100644
--- a/pkg/pillar/cmd/zedmanager/handleclusterapp.go
+++ b/pkg/pillar/cmd/zedmanager/handleclusterapp.go
@@ -40,12 +40,11 @@ func handleENClusterAppStatusImpl(ctx *zedmanagerContext, key string, status *ty
 			log.Errorf("handleENClusterAppStatusImpl(%s) AppInstanceConfig missing for app", key)
 			return
 		}
-		// XXX this will be handled in later PR in clustering and zedmanager code
-		//handleCreateAppInstanceStatus(ctx, *aiConfig)
+		handleCreateAppInstanceStatus(ctx, *aiConfig)
 	} else {
-		// re-publish the aiStatus, in case the cluster status has changed.
+
+		activateAIStatusUUID(ctx, key)
 		log.Functionf("handleENClusterAppStatusImpl(%s) for app-status %v aiStatus %v", key, status, aiStatus)
-		publishAppInstanceStatus(ctx, aiStatus)
 		return
 	}
 } else { // not scheduled here.
@@ -54,7 +53,9 @@ func handleENClusterAppStatusImpl(ctx *zedmanagerContext, key string, status *ty
 	if aiStatus != nil {
 		// If I am not scheduled here, modify and publish the aiStatus with NoUploadStatsToController set.
 		publishAppInstanceStatus(ctx, aiStatus)
+		publishAppInstanceSummary(ctx)
 	}
 }
+
 }
diff --git a/pkg/pillar/cmd/zedmanager/handledomainmgr.go b/pkg/pillar/cmd/zedmanager/handledomainmgr.go
index 8bc6d59e66..acba380780 100644
--- a/pkg/pillar/cmd/zedmanager/handledomainmgr.go
+++ b/pkg/pillar/cmd/zedmanager/handledomainmgr.go
@@ -11,7 +11,6 @@ import (
 	"strings"
 
 	"github.com/lf-edge/eve/pkg/pillar/types"
-	uuid "github.com/satori/go.uuid"
 )
 
 const (
@@ -48,10 +47,6 @@ func MaybeAddDomainConfig(ctx *zedmanagerContext,
 		AppNum = ns.AppNum
 	}
 
-	isDNiDnode := false
-	if aiConfig.DesignatedNodeID != uuid.Nil && aiConfig.DesignatedNodeID == ctx.nodeUUID {
-		isDNiDnode = true
-	}
 	effectiveActivate := effectiveActivateCombined(aiConfig, ctx)
 	dc := types.DomainConfig{
 		UUIDandVersion: aiConfig.UUIDandVersion,
@@ -68,7 +63,7 @@ func MaybeAddDomainConfig(ctx *zedmanagerContext,
 		CloudInitVersion: aiConfig.CloudInitVersion,
 		// This isDNiDnode will be set to true even if the App is not in cluster mode,
 		// This will be set in zedagent parseConfig for the case of single node/device App case.
-		IsDNidNode: isDNiDnode,
+		IsDNidNode: aiConfig.IsDesignatedNodeID,
 	}
 
 	dc.DiskConfigList = make([]types.DiskConfig, 0, len(aiStatus.VolumeRefStatusList))
diff --git a/pkg/pillar/cmd/zedmanager/updatestatus.go b/pkg/pillar/cmd/zedmanager/updatestatus.go
index 0a3da0c25c..04d22c5f1a 100644
--- a/pkg/pillar/cmd/zedmanager/updatestatus.go
+++ b/pkg/pillar/cmd/zedmanager/updatestatus.go
@@ -39,6 +39,31 @@ func updateAIStatusUUID(ctx *zedmanagerContext, uuidStr string) {
 	}
 }
 
+// Activate this AppInstanceStatus and generate config updates to
+// the microservices
+func activateAIStatusUUID(ctx *zedmanagerContext, uuidStr string) {
+
+	log.Functionf("activateAIStatusUUID(%s)", uuidStr)
+	status := lookupAppInstanceStatus(ctx, uuidStr)
+	if status == nil {
+		log.Functionf("activateAIStatusUUID for %s: Missing AppInstanceStatus",
+			uuidStr)
+		return
+	}
+	config := lookupAppInstanceConfig(ctx, uuidStr, true)
+	if config == nil || (status.PurgeInprogress == types.BringDown) {
+		removeAIStatus(ctx, status)
+		return
+	}
+	doActivate(ctx, uuidStr, *config, status)
+
+	log.Functionf("activateAIStatusUUID status %d for %s",
+		status.State, uuidStr)
+	publishAppInstanceStatus(ctx, status)
+	publishAppInstanceSummary(ctx)
+
+}
+
 // Remove this AppInstanceStatus and generate config removes for
 // the microservices
 func removeAIStatusUUID(ctx *zedmanagerContext, uuidStr string) {
diff --git a/pkg/pillar/cmd/zedmanager/zedmanager.go b/pkg/pillar/cmd/zedmanager/zedmanager.go
index f935db5df8..627e990319 100644
--- a/pkg/pillar/cmd/zedmanager/zedmanager.go
+++ b/pkg/pillar/cmd/zedmanager/zedmanager.go
@@ -1098,11 +1098,19 @@ func handleCreate(ctxArg interface{}, key string,
 	log.Functionf("handleCreate(%v) for %s",
 		config.UUIDandVersion, config.DisplayName)
 
+	handleCreateAppInstanceStatus(ctx, config)
+}
+
+func handleCreateAppInstanceStatus(ctx *zedmanagerContext, config types.AppInstanceConfig) {
+	log.Functionf("handleCreateAppInstanceStatus(%v) for %s",
+		config.UUIDandVersion, config.DisplayName)
+
 	status := types.AppInstanceStatus{
-		UUIDandVersion: config.UUIDandVersion,
-		DisplayName:    config.DisplayName,
-		FixedResources: config.FixedResources,
-		State:          types.INITIAL,
+		UUIDandVersion:     config.UUIDandVersion,
+		DisplayName:        config.DisplayName,
+		FixedResources:     config.FixedResources,
+		State:              types.INITIAL,
+		IsDesignatedNodeID: config.IsDesignatedNodeID,
 	}
 
 	// Calculate the moment when the application should start, taking into account the configured delay
@@ -1119,10 +1127,10 @@ func handleCreate(ctxArg interface{}, key string,
 	configCounter := int(config.PurgeCmd.Counter + config.LocalPurgeCmd.Counter)
 	if err == nil {
 		if persistedCounter == configCounter {
-			log.Functionf("handleCreate(%v) for %s found matching purge counter %d",
+			log.Functionf("handleCreateAppInstanceStatus(%v) for %s found matching purge counter %d",
 				config.UUIDandVersion, config.DisplayName, persistedCounter)
 		} else {
-			log.Warnf("handleCreate(%v) for %s found different purge counter %d vs. %d",
+			log.Warnf("handleCreateAppInstanceStatus(%v) for %s found different purge counter %d vs. %d",
%d", config.UUIDandVersion, config.DisplayName, persistedCounter, configCounter) status.PurgeInprogress = types.DownloadAndVerify status.PurgeStartedAt = time.Now() @@ -1131,7 +1139,7 @@ func handleCreate(ctxArg interface{}, key string, } } else { // Save this PurgeCmd.Counter as the baseline - log.Functionf("handleCreate(%v) for %s saving purge counter %d", + log.Functionf("handleCreateAppInstanceStatus(%v) for %s saving purge counter %d", config.UUIDandVersion, config.DisplayName, configCounter) err = ctx.appToPurgeCounterMap.Assign(mapKey, configCounter, true) if err != nil { @@ -1194,7 +1202,7 @@ func handleCreate(ctxArg interface{}, key string, config.DisplayName, config.UUIDandVersion.UUID) publishAppInstanceStatus(ctx, &status) } - log.Functionf("handleCreate done for %s", config.DisplayName) + log.Functionf("handleCreateAppInstanceStatus done for %s", config.DisplayName) } func handleModify(ctxArg interface{}, key string, @@ -1664,7 +1672,7 @@ func effectiveActivateCurrentProfile(config types.AppInstanceConfig, currentProf func getKubeAppActivateStatus(ctx *zedmanagerContext, aiConfig types.AppInstanceConfig, effectiveActivate bool) bool { - if !ctx.hvTypeKube || aiConfig.DesignatedNodeID == uuid.Nil { + if !ctx.hvTypeKube { return effectiveActivate } @@ -1693,9 +1701,9 @@ func getKubeAppActivateStatus(ctx *zedmanagerContext, aiConfig types.AppInstance } } - log.Functionf("getKubeAppActivateStatus: ai %s, node %s, onTheDevice %v, statusRunning %v", - aiConfig.DesignatedNodeID.String(), ctx.nodeUUID, onTheDevice, statusRunning) - if aiConfig.DesignatedNodeID == ctx.nodeUUID { + log.Functionf("getKubeAppActivateStatus: is designated node %v, node %s, onTheDevice %v, statusRunning %v", + aiConfig.IsDesignatedNodeID, ctx.nodeUUID, onTheDevice, statusRunning) + if aiConfig.IsDesignatedNodeID { if statusRunning && !onTheDevice { return false } diff --git a/pkg/pillar/docs/failover.md b/pkg/pillar/docs/failover.md new file mode 100644 index 0000000000..0e6482361d --- /dev/null +++ b/pkg/pillar/docs/failover.md @@ -0,0 +1,181 @@ +# Application failover and volume data protection support + +## Overview + +Edge devices can be clustered together if they are installed with version of eve which supports kubevirt virtualization. +The volumes created on those devices are synchronously replicated within the cluster for data protection and high availability. The applications deployed on those devices are automatically failed over to surviving nodes by the underlying kubernetes infrastructure. This document covers the process of failover and the underlying data structures. + +## Components + +### Block Volumes + +In a clustered setup block volumes are treated as a cluster wide objects. Controller picks one of the nodes in the cluster as the designated node id for that volume. Controller sends volume config to all the devices in the cluster with designated node id set to uuid of the device which is supposed to be designated node id for that volume. + +EVE API has been enhanced to include Designated node id as String. +* [config/storage.proto](https://github.com/lf-edge/eve-api/blob/main/proto/config/storage.proto) +message Volume { + .... + // To inform the edge-node if the device receiving this Volume is + // responsible for volume creation, convert PVC, or not + string designated_node_id = 10; +} + +EVE volumetypes has been enhanced to include boolean IsReplicated. A volume is set to IsReplicated=false on a node that is designated node id. On all other nodes in the cluster it will set to true. 
+
+### Network instance
+
+Network instances are cluster-wide too, except that they do not have a designated node id; in other words, they are created on all nodes in a cluster. The controller ensures that the same network instance configuration is sent to all devices in the cluster. This guarantees the NI is ready when an application fails over, so the app starts without any issues.
+
+There are no changes to any existing data structures in EVE or eve-api.
+
+### Content tree
+
+The content tree images EVE downloads are of two types: qcow2/raw format or container image format.
+The qcow2/raw files are converted into PVCs by the volume manager, and hence the content tree config also carries a designated node id.
+This matters most for the non-container format content trees: since the resulting PVC is replicated to all nodes in the cluster, it is not necessary to download the content tree on every node.
+
+The container image format content tree is processed in two different ways depending on the application type:
+
+ 1) If the application type is a Container (i.e. a container launched in a shim VM), only the designated node downloads the content tree; the rest of the nodes get the content through replication of the PVC.
+ 2) If the application type is a Native container (i.e. NOHYPER virtualization mode), the content tree is downloaded to all the nodes in the cluster. Since the container image does not contain the kernel or OS components, it has to be launched natively on the kubernetes infrastructure, and converting it to a PVC is useless in that case.
+
+eve-api has been enhanced to add the designated node id to the ContentTree message:
+
+* [config/storage.proto](https://github.com/lf-edge/eve-api/blob/main/proto/config/storage.proto)
+
+message ContentTree {
+  ....
+  // To inform the edge-node if the device receiving this content tree is
+  // responsible for content tree download or not.
+  string designated_node_id = 12;
+}
+
+EVE contenttreetypes.go has been enhanced to include the IsLocal boolean. A content tree is set to IsLocal=true if the content tree is downloaded on that node, else it is set to false. For single node setups IsLocal is always true. For native containers IsLocal is always true on every node.
+
+* [types/contenttreetypes.go](../types/contenttreetypes.go)
+
+type ContentTreeConfig struct {
+  ....
+  // Do we download on this node?
+  IsLocal bool
+}
+
+type ContentTreeStatus struct {
+  ....
+  IsLocal bool
+}
+
+The zedagent microservice parses the config from the controller and sets the IsLocal field in the ContentTreeConfig struct.
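+
+In code, this choice is made when zedagent parses the ContentTree message. A condensed sketch of the logic from parseContentInfoConfig in this change (cfgContentTree is the ContentTree message from the controller, devUUID the local device UUID):
+
+    contentConfig.IsLocal = true
+    controllerDNID := cfgContentTree.GetDesignatedNodeId()
+    if controllerDNID != "" && controllerDNID != devUUID.String() {
+        if contentConfig.Format == zconfig.Format_CONTAINER {
+            // Container images are needed on every node so that native
+            // containers can be started anywhere in the cluster.
+            contentConfig.IsLocal = true
+        } else {
+            // qcow2/raw content is downloaded only on the designated node;
+            // the other nodes receive the data through PVC replication.
+            contentConfig.IsLocal = false
+        }
+    }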
+
+If the ContentTree is in qcow2/raw or container format (not a native container), it gets converted to a Kubernetes Persistent Volume (PV) and replicated to all other nodes in the cluster.
+
+### Application instance
+
+An application deployed in a cluster setup also has a designated node id. That node is picked by the controller; how it picks the designated node is beyond the scope of EVE. Once the designated node is picked, the controller sends the application instance config to all the devices in the cluster, with the designated node id set to the UUID of the device that is supposed to run that application.
+
+The designated device starts the application and publishes AppInstanceStatus. All the nodes in the cluster publish an AppInstanceStatus, since they all receive the config, but only the node that is running the app has aiStatus.Activated set to true. That is what zedagent uses to build the info message for the controller.
+
+* [zedagent/handlemetrics.go](../cmd/zedagent/handlemetrics.go)
+
+// For clustered apps on HV=kubevirt, 'ClusterAppRunning' designates that
+// the app is running on this node, either naturally or after some failover event.
+ReportAppInfo.ClusterAppRunning = aiStatus.Activated
+
+There is also an additional flag in AppInstanceStatus named NoUploadStatsToController. Zedagent looks at that flag to decide whether or not to upload stats to the controller. The reason for the flag is that the app can move between nodes, and only one node is supposed to upload stats to the controller, so the flag is toggled accordingly.
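+
+A simplified sketch of how zedagent fills these fields when building the app info message (based on PublishAppInfoToZedCloud in this change; the exact point where NoUploadStatsToController short-circuits the upload is an assumption here):
+
+    ReportAppInfo.ClusterAppRunning = false
+    if aiStatus != nil {
+        if aiStatus.NoUploadStatsToController {
+            // Another node currently owns reporting for this app; skip the upload.
+            return
+        }
+        // Activated is true only on the node actually running the workload.
+        ReportAppInfo.ClusterAppRunning = aiStatus.Activated
+    }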
+
+eve-api has been enhanced to add the designated node id to AppInstanceConfig:
+
+* [config/appconfig.proto](https://github.com/lf-edge/eve-api/blob/main/proto/config/appconfig.proto)
+
+message AppInstanceConfig {
+  ....
+  // This edge-node UUID for the Designate Node for the Application
+  string designated_node_id = 26;
+}
+
+* [info/info.proto](https://github.com/lf-edge/eve-api/blob/main/proto/info/info.proto)
+
+message ZInfoApp {
+  ....
+  // Deployed app is scheduled, or rescheduled and launched on this node,
+  // it has the Pod Spec-name of this node, the app can be in any operating state.
+  bool cluster_app_running = 20;
+}
+
+EVE-specific changes to the AppInstanceConfig and AppInstanceStatus structs carry the bool IsDesignatedNodeID:
+
+* [types/zedmanagertypes.go](../types/zedmanagertypes.go)
+
+type AppInstanceConfig struct {
+  ....
+  // Am I the Cluster Designated Node ID for this app
+  IsDesignatedNodeID bool
+}
+
+type AppInstanceStatus struct {
+  ....
+  // Am I the Cluster Designated Node ID for this app
+  IsDesignatedNodeID bool
+}
+
+### Config from controller
+
+The app, network and volume config sent from the controller is exactly the same for all the devices in the cluster.
+The devices (EVE code) behave differently depending on the designated node id set for those objects.
+This design is simple and helps scale the number of nodes in the cluster easily.
+
+### Failover scenarios
+
+There are various scenarios that can trigger an application failover. Some of the most common ones are:
+
+1) Node graceful reboot
+2) Node graceful shutdown
+3) Node abrupt power off
+4) A network failure between the cluster nodes
+5) A physical disk failure on a node
+6) Resource starvation on a node
+
+### Failover handling
+
+We depend on the kubernetes infrastructure to detect and trigger the failover of an application.
+The kubernetes scheduler makes the decision to move the app to some other existing node in the cluster.
+
+Once the application gets scheduled on a particular node after failover, the EVE code does the following:
+
+1) The zedkube microservice has a periodic loop that checks for apps scheduled on that node.
+2) zedkube publishes ENClusterAppStatus, which looks something like this:
+
+    {
+      "AppUUID": "a19baff7-5b6c-4363-9a1c-522b210f5139",
+      "IsDNSet": false,
+      "ScheduledOnThisNode": true,
+      "StatusRunning": true
+    }
+
+3) The zedmanager microservice subscribes to ENClusterAppStatus.
+4) zedmanager then does the following if the app is scheduled on that node (see the sketch below):
+   a) If an AppInstanceStatus does not exist for that app, it calls handleCreateAppInstanceStatus().
+   b) If an AppInstanceStatus exists for that app, it calls activateAIStatusUUID().
+
+   If the app is descheduled on that node (it was scheduled earlier):
+   a) If an AppInstanceStatus exists for that app, it calls publishAppInstanceStatus() to update the flag NoUploadStatsToController = true. This ensures zedagent does not publish the app info to the controller.
+
+The workflow above guarantees that the app is running on only one node in the cluster at a given time, and that the app info is sent to the controller only from the node that is running the app at that time.
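+
+A condensed sketch of that zedmanager logic (based on handleENClusterAppStatusImpl in this change; the config/status lookups and error handling are omitted, and the field names follow the ENClusterAppStatus example above):
+
+    if status.ScheduledOnThisNode {
+        if aiStatus == nil {
+            // The app just landed on this node (failover): build its status from the config.
+            handleCreateAppInstanceStatus(ctx, *aiConfig)
+        } else {
+            // A status already exists (e.g. failback to the original node): re-activate it.
+            activateAIStatusUUID(ctx, key)
+        }
+    } else if aiStatus != nil {
+        // Descheduled from this node: publish the status with NoUploadStatsToController
+        // set so that zedagent stops reporting this app from here.
+        publishAppInstanceStatus(ctx, aiStatus)
+        publishAppInstanceSummary(ctx)
+    }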
+
+### Failback handling
+
+The kubernetes descheduler decides to fail the app back when the original failover scenario is resolved.
+After that, the EVE handling of an app failback is the same as the failover handling described above.
diff --git a/pkg/pillar/types/contenttreetypes.go b/pkg/pillar/types/contenttreetypes.go
index ed4f8a7f50..06c368b75f 100644
--- a/pkg/pillar/types/contenttreetypes.go
+++ b/pkg/pillar/types/contenttreetypes.go
@@ -29,6 +29,8 @@ type ContentTreeConfig struct {
 	GenerationCounter int64
 	DisplayName       string
 	CustomMeta        string
+	// Do we download on this node?
+	IsLocal bool
 }
 
 // Key is content info UUID which will be unique
@@ -136,6 +138,7 @@ type ContentTreeStatus struct {
 	Blobs []string
 
 	HVTypeKube bool
+	IsLocal    bool
 
 	ErrorAndTimeWithSource
 }
diff --git a/pkg/pillar/types/types.go b/pkg/pillar/types/types.go
index 50249a9e35..d3788fb28b 100644
--- a/pkg/pillar/types/types.go
+++ b/pkg/pillar/types/types.go
@@ -51,6 +51,8 @@ const (
 	SCHEDULING
 	// FAILED to start
 	FAILED
+	// REMOTELOADED is used for a content tree which was loaded on another node
+	REMOTELOADED
 	MAXSTATE
 )
 
@@ -105,6 +107,8 @@ func (state SwState) String() string {
 		return "BROKEN"
 	case START_DELAYED:
 		return "START_DELAYED"
+	case REMOTELOADED:
+		return "REMOTELOADED"
 	case UNKNOWN:
 		return "UNKNOWN"
 	default:
@@ -176,6 +180,9 @@ func (state SwState) ZSwState() info.ZSwState {
 		return info.ZSwState_PENDING
 	case SCHEDULING:
 		return info.ZSwState_SCHEDULING
+	case REMOTELOADED:
+		// REMOTELOADED is internal to the EVE cluster; from the API perspective return the LOADED state.
+		return info.ZSwState_LOADED
 
 	// If we ever see UNKNOWN we return RUNNING assuming the state will change to something
 	// known soon.
diff --git a/pkg/pillar/types/volumetypes.go b/pkg/pillar/types/volumetypes.go
index a3493a692d..38e3b78a95 100644
--- a/pkg/pillar/types/volumetypes.go
+++ b/pkg/pillar/types/volumetypes.go
@@ -28,6 +28,11 @@ type VolumeConfig struct {
 	HasNoAppReferences bool
 	Target             zconfig.Target
 	CustomMeta         string
+	// This is a replicated volume
+	IsReplicated bool
+	// This volume is a container image for a native container.
+	// We find this out from the NOHYPER flag in AppInstanceConfig.
+	IsNativeContainer bool
 }
 
 // Key is volume UUID which will be unique
@@ -132,6 +137,12 @@ type VolumeStatus struct {
 	Target     zconfig.Target
 	CustomMeta string
 
+	// Is this a replicated volume
+	IsReplicated bool
+	// Is this volume actually a container image for a native container deployment?
+	// We find that out from the NOHYPER flag set in the app instance.
+	IsNativeContainer bool
+
 	ErrorAndTimeWithSource
 }
diff --git a/pkg/pillar/types/zedmanagertypes.go b/pkg/pillar/types/zedmanagertypes.go
index 2bc5c2482a..6e27532525 100644
--- a/pkg/pillar/types/zedmanagertypes.go
+++ b/pkg/pillar/types/zedmanagertypes.go
@@ -144,8 +144,8 @@ type AppInstanceConfig struct {
 	// allow AppInstance to discover other AppInstances attached to its network instances
 	AllowToDiscover bool
 
-	// Cluster Designated Node Id
-	DesignatedNodeID uuid.UUID
+	// Am I the Cluster Designated Node ID for this app
+	IsDesignatedNodeID bool
 }
 
 type AppInstanceOpsCmd struct {
@@ -277,6 +277,8 @@ type AppInstanceStatus struct {
 	// This is used in cluster-mode to avoid multiple nodes
 	// updating the same app instance status
 	NoUploadStatsToController bool
+	// Am I the Cluster Designated Node ID for this app
+	IsDesignatedNodeID bool
 }
 
 // AppCount is uint8 and it should be sufficient for the number of apps we can support