From b5364d11cfb3430e3d7c45d1fa98cc32dc3ee841 Mon Sep 17 00:00:00 2001
From: Naiming Shen
Date: Fri, 24 Jan 2025 19:38:34 -0800
Subject: [PATCH] Edge-Node Clustering miscellaneous bug fixes

- handle the ENC App Status and the cluster-reachable condition, with an
  error message in the status
- fix an issue in checkAppsStatus() of using a stale oldStatus{}
- in the multiple-applications case there is a bug, since we changed the
  logic to not always publish the ENClusterAppStatus; we need to use the
  correct oldStatus for each application
- fix the token rotation failure and the wait-for-cluster-status bug; also
  fix a bug in waiting for the bootstrap server status, where we can fall
  into the 'else' condition and get a wrong cert
- if not all pods are ready, do not print the misleading 'applying node
  labels' message; log 'Not all pods are ready' instead
- handle the case of converting to single-node and immediately back to
  cluster mode again; we need to wait for the bootstrap 'cluster' status
  before moving on
- try to fix an issue where 'k3s.log' file rotation does not take effect
  once in a while and the file size cannot be truncated; send a HUP signal
  before truncating the file

Signed-off-by: Naiming Shen
---
 pkg/kube/cluster-init.sh               | 97 ++++++++++++++++++++++----
 pkg/kube/cluster-utils.sh              | 10 +++
 pkg/pillar/cmd/zedagent/parseconfig.go |  5 +-
 pkg/pillar/cmd/zedkube/applogs.go      | 50 +++++++++++--
 pkg/pillar/cmd/zedkube/zedkube.go      |  7 ++
 5 files changed, 146 insertions(+), 23 deletions(-)

diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh
index 443bf5bcad..ddfe0bfbfb 100755
--- a/pkg/kube/cluster-init.sh
+++ b/pkg/kube/cluster-init.sh
@@ -21,6 +21,7 @@ INITIAL_WAIT_TIME=5
 MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds, exponential backoff for k3s restart
 current_wait_time=$INITIAL_WAIT_TIME
 CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing"
+All_PODS_READY=true
 
 # shellcheck source=pkg/kube/descheduler-utils.sh
 . /usr/bin/descheduler-utils.sh
@@ -317,6 +318,31 @@ check_start_containerd() {
     fi
 }
 
+# apply the node-uuid label to the node
+apply_node_uuid_label () {
+    if [ "$All_PODS_READY" = true ]; then
+        logmsg "set node label with uuid $DEVUUID"
+    else
+        logmsg "Not all pods are ready, Continue to wait while applying node labels"
+    fi
+    kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
+}
+
+# reapply the node labels
+reapply_node_labels() {
+    apply_node_uuid_label
+    apply_longhorn_disk_config "$HOSTNAME"
+    # Check if the node with both labels exists, don't assume the above apply worked
+    node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length')
+
+    if [ "$node_count" -gt 0 ]; then
+        logmsg "Node labels re-applied successfully"
+        touch /var/lib/node-labels-initialized
+    else
+        logmsg "Failed to re-apply node labels, on $HOSTNAME, uuid $DEVUUID"
+    fi
+}
+
 # Return success if all pods are Running/Succeeded and Ready
 # Used in install time to control api server load
 # Return unix style 0 for success. (Not 0 for false)
@@ -388,7 +414,7 @@ is_bootstrap=""
 join_serverIP=""
 cluster_token=""
 cluster_node_ip=""
-# for bootstrap node, after reboot to get neighbor node to join
+convert_to_single_node=false
 
 # get the EdgeNodeClusterStatus from zedkube publication
 get_enc_status() {
@@ -413,18 +439,44 @@ get_enc_status() {
     fi
 }
 
 # When transitioning from single node to cluster mode, need change the controller
 # provided token for the cluster
+
+rotate_cluster_token() {
+    local token="$1"
+    /usr/bin/k3s token rotate --new-token "$token"
+    local status=$?
+
+    if [ $status -ne 0 ]; then
+        logmsg "Failed to rotate token. Exit status: $status"
+    else
+        logmsg "Token rotated successfully."
+    fi
+    return $status
+}
+
 change_to_new_token() {
     if [ -n "$cluster_token" ]; then
-        /usr/bin/k3s token rotate --new-token "$cluster_token"
+        logmsg "Rotate cluster token size: ${#cluster_token}"
+        rotate_cluster_token "$cluster_token"
+        # Set the starttime before entering the while loop
+        starttime=$(date +%s)
+
         while true; do
             if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then
                 logmsg "Token change has taken effect."
                 break
             else
-                logmsg "Token has not taken effect yet. Sleeping for 2 seconds..."
-                sleep 2
+                currenttime=$(date +%s)
+                elapsed=$((currenttime - starttime))
+                if [ $elapsed -ge 60 ]; then
+                    # Redo the rotate_cluster_token and reset the starttime
+                    rotate_cluster_token "$cluster_token"
+                    logmsg "Rotate cluster token again by k3s."
+                    starttime=$(date +%s)
+                fi
+                logmsg "Token has not taken effect yet. Sleeping for 5 seconds..."
+                sleep 5
             fi
         done
     else
@@ -581,23 +633,27 @@ EOF
     counter=0
     touch "$CLUSTER_WAIT_FILE"
     while true; do
+        counter=$((counter+1))
         if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then
-            counter=$((counter+1))
             #logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
             # if we are here, check the bootstrap server is single or cluster mode
            if ! status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"); then
                 if [ $((counter % 30)) -eq 1 ]; then
                     logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..."
                 fi
-            elif [ "$status" != "cluster" ]; then
-                if [ $((counter % 30)) -eq 1 ]; then
-                    logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
-                fi
-            else
+            elif [ "$status" = "cluster" ]; then
                 logmsg "Server is in 'cluster' status. done"
                 rm "$CLUSTER_WAIT_FILE"
                 break
+            else
+                if [ $((counter % 30)) -eq 1 ]; then
+                    logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
+                fi
             fi
+        else
+            if [ $((counter % 30)) -eq 1 ]; then
+                logmsg "Attempt $counter: curl to Endpoint https://$join_serverIP:6443 failed. Waiting for 10 seconds..."
+            fi
         fi
         sleep 10
     done
@@ -617,8 +673,14 @@ setup_prereqs
 if [ -f /var/lib/convert-to-single-node ]; then
     logmsg "remove /var/lib and copy saved single node /var/lib"
     restore_var_lib
+    logmsg "wiping unreferenced replicas"
+    rm -rf /persist/vault/volumes/replicas/*
     # assign node-ip to multus nodeIP for yaml config file
     assign_multus_nodeip
+    # set the variable 'convert_to_single_node' to true; in case we immediately
+    # convert back to cluster mode, we need to wait for the bootstrap status
+    # before moving on to cluster mode
+    convert_to_single_node=true
 fi
 # since we can wait for long time, always start the containerd first
 check_start_containerd
@@ -658,8 +720,12 @@ else # a restart case, found all_components_initialized
             fi
         done
         # got the cluster config, make the config.ymal now
-        logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
-        provision_cluster_config_file false
+        logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
+
+        # if we just converted to single node, then we need to wait for the bootstrap
+        # 'cluster' status before moving on to cluster mode
+        provision_cluster_config_file $convert_to_single_node
+        convert_to_single_node=false
         logmsg "provision config.yaml done"
     else # single node mode
         logmsg "Single node mode, prepare config.yaml for $HOSTNAME"
@@ -703,11 +769,14 @@ if [ ! -f /var/lib/all_components_initialized ]; then
         fi
 
         # label the node with device uuid
-        apply_node_uuid_lable
+        apply_node_uuid_label
 
         if ! are_all_pods_ready; then
+            All_PODS_READY=false
+            sleep 10
             continue
         fi
+        All_PODS_READY=true
 
         if [ ! -f /var/lib/multus_initialized ]; then
             if [ ! -f /etc/multus-daemonset-new.yaml ]; then
@@ -817,7 +886,7 @@ else
         fi
     else
         if [ ! -f /var/lib/node-labels-initialized ]; then
-            reapply_node_labes
+            reapply_node_labels
         fi
         # Initialize CNI after k3s reboot
         if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
diff --git a/pkg/kube/cluster-utils.sh b/pkg/kube/cluster-utils.sh
index 56799fd31c..43836c8459 100755
--- a/pkg/kube/cluster-utils.sh
+++ b/pkg/kube/cluster-utils.sh
@@ -43,6 +43,16 @@ check_log_file_size() {
         fi
         # keep the original log file's attributes
         cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
+        # Check if the argument passed is "$K3s_LOG_FILE"; sometimes k3s does
+        # not release the file descriptor, so truncating the file may not take
+        # effect. Send a HUP signal to k3s first.
+        if [ "$1" = "$K3s_LOG_FILE" ]; then
+            k3s_pid=$(pgrep -f "k3s server")
+            if [ -n "$k3s_pid" ]; then
+                kill -HUP "$k3s_pid"
+                logmsg "Sent HUP signal to k3s server before truncate k3s.log size"
+            fi
+        fi
         truncate -s 0 "$K3S_LOG_DIR/$1"
         logmsg "k3s logfile $1, size $currentSize rotate"
     fi
diff --git a/pkg/pillar/cmd/zedagent/parseconfig.go b/pkg/pillar/cmd/zedagent/parseconfig.go
index 9bc7139c6b..11b9fa9faa 100644
--- a/pkg/pillar/cmd/zedagent/parseconfig.go
+++ b/pkg/pillar/cmd/zedagent/parseconfig.go
@@ -159,6 +159,9 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
     parseContentInfoConfig(getconfigCtx, config)
     parseVolumeConfig(getconfigCtx, config)
     parseEvConfig(getconfigCtx, config)
+    // several services are waiting for this NodeInfo at startup; even if we
+    // don't have apps, we need to parse this config first
+    parseEdgeNodeInfo(getconfigCtx, config)
 
     // We have handled the volumes, so we can now process the app instances. But we need to check if
     // we are in the middle of a baseOS upgrade, and if so, we need to skip processing the app instances.
@@ -177,8 +180,6 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
 
     parseDisksConfig(getconfigCtx, config)
 
-    parseEdgeNodeInfo(getconfigCtx, config)
-
     parsePatchEnvelopes(getconfigCtx, config)
 }
 
diff --git a/pkg/pillar/cmd/zedkube/applogs.go b/pkg/pillar/cmd/zedkube/applogs.go
index a994ad0b6d..8fd40abf97 100644
--- a/pkg/pillar/cmd/zedkube/applogs.go
+++ b/pkg/pillar/cmd/zedkube/applogs.go
@@ -126,17 +126,19 @@ func (z *zedkube) checkAppsStatus() {
         return
     }
 
-    options := metav1.ListOptions{
-        FieldSelector: fmt.Sprintf("spec.nodeName=%s", z.nodeName),
-    }
-    pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), options)
+    stItems := z.pubENClusterAppStatus.GetAll()
+
+    pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), metav1.ListOptions{})
     if err != nil {
         log.Errorf("checkAppsStatus: can't get pods %v", err)
+        // If we can't get pods, process the error and return
+        z.handleKubePodsGetError(items, stItems)
         return
     }
 
-    pub := z.pubENClusterAppStatus
-    stItems := pub.GetAll()
+    z.getKubePodsError.getKubePodsErrorTime = time.Time{}
+    z.getKubePodsError.processedErrorCondition = false
+
     var oldStatus *types.ENClusterAppStatus
     for _, item := range items {
         aiconfig := item.(types.AppInstanceConfig)
@@ -153,7 +155,9 @@ func (z *zedkube) checkAppsStatus() {
             contVMIName := "virt-launcher-" + contName
             log.Functionf("checkAppsStatus: pod %s, cont %s", pod.Name, contName)
             if strings.HasPrefix(pod.Name, contName) || strings.HasPrefix(pod.Name, contVMIName) {
-                encAppStatus.ScheduledOnThisNode = true
+                if pod.Spec.NodeName == z.nodeName {
+                    encAppStatus.ScheduledOnThisNode = true
+                }
                 if pod.Status.Phase == corev1.PodRunning {
                     encAppStatus.StatusRunning = true
                 }
@@ -170,6 +174,7 @@ func (z *zedkube) checkAppsStatus() {
         }
         log.Functionf("checkAppsStatus: devname %s, pod (%d) status %+v, old %+v",
             z.nodeName, len(pods.Items), encAppStatus, oldStatus)
+        // Publish if there is a status change
         if oldStatus == nil || oldStatus.IsDNidNode != encAppStatus.IsDNidNode ||
             oldStatus.ScheduledOnThisNode != encAppStatus.ScheduledOnThisNode || oldStatus.StatusRunning != encAppStatus.StatusRunning {
             log.Functionf("checkAppsStatus: status differ, publish")
@@ -177,3 +182,34 @@
         }
     }
 }
+
+func (z *zedkube) handleKubePodsGetError(items, stItems map[string]interface{}) {
+    if z.getKubePodsError.getKubePodsErrorTime.IsZero() {
+        now := time.Now()
+        z.getKubePodsError.getKubePodsErrorTime = now
+        log.Noticef("handleKubePodsGetError: can't get pods, set error time")
+    } else if time.Since(z.getKubePodsError.getKubePodsErrorTime) > 2*time.Minute {
+        // Kubernetes marks the node 'NotReady' after it has been unreachable for
+        // 1 minute, and per the replicaSet policy for the POD/VMI, the App is
+        // rescheduled to another node 30 seconds after the node turns 'NotReady'.
+        // So we use 2 minutes as the threshold here.
+        if !z.getKubePodsError.processedErrorCondition {
+            z.getKubePodsError.processedErrorCondition = true
+            for _, item := range items {
+                aiconfig := item.(types.AppInstanceConfig)
+                for _, st := range stItems {
+                    aiStatus := st.(types.ENClusterAppStatus)
+                    if aiStatus.AppUUID == aiconfig.UUIDandVersion.UUID {
+                        // If we previously published that this app is scheduled on
+                        // this node, we need to reset that, since we have lost the
+                        // connection to kubernetes for longer than it takes the app
+                        // to be migrated to another node
+                        if aiStatus.ScheduledOnThisNode {
+                            aiStatus.ScheduledOnThisNode = false
+                            z.pubENClusterAppStatus.Publish(aiconfig.Key(), aiStatus)
+                            log.Noticef("handleKubePodsGetError: can't get pods set ScheduledOnThisNode off for %s, ", aiconfig.DisplayName)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pkg/pillar/cmd/zedkube/zedkube.go b/pkg/pillar/cmd/zedkube/zedkube.go
index 8874fac373..c8497d21d0 100644
--- a/pkg/pillar/cmd/zedkube/zedkube.go
+++ b/pkg/pillar/cmd/zedkube/zedkube.go
@@ -38,6 +38,12 @@ var (
     log *base.LogObject
 )
 
+// GetKubePodsError is used to check and handle errors from getting the kube pods
+type GetKubePodsError struct {
+    getKubePodsErrorTime    time.Time
+    processedErrorCondition bool
+}
+
 type zedkube struct {
     agentbase.AgentBase
     globalConfig *types.ConfigItemValueMap
@@ -78,6 +84,7 @@ type zedkube struct {
     electionStopCh     chan struct{}
     statusServer       *http.Server
     statusServerWG     sync.WaitGroup
+    getKubePodsError   GetKubePodsError
     drainOverrideTimer *time.Timer
 
     // Config Properties for Drain
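
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of how the 2-minute
threshold in handleKubePodsGetError() is meant to behave, assuming a simplified tracker with the same
two fields as the GetKubePodsError struct above. The names podsErrorTracker, onListError and
onListSuccess are illustrative only and do not exist in the zedkube code.

package main

import (
    "fmt"
    "time"
)

// podsErrorTracker mirrors the two fields of GetKubePodsError.
type podsErrorTracker struct {
    errorTime          time.Time // zero value means no outstanding pods-list error
    processedCondition bool      // set once the error condition has been handled
}

// onListError records the first failure time and reports whether the
// 2-minute window has elapsed and has not been handled yet.
func (t *podsErrorTracker) onListError(now time.Time) bool {
    if t.errorTime.IsZero() {
        t.errorTime = now
        return false
    }
    if now.Sub(t.errorTime) > 2*time.Minute && !t.processedCondition {
        // handle once: e.g. clear ScheduledOnThisNode and re-publish the app status
        t.processedCondition = true
        return true
    }
    return false
}

// onListSuccess clears the error state, mirroring the reset done in
// checkAppsStatus() when the pods list succeeds again.
func (t *podsErrorTracker) onListSuccess() {
    t.errorTime = time.Time{}
    t.processedCondition = false
}

func main() {
    var t podsErrorTracker
    start := time.Now()
    fmt.Println(t.onListError(start))                      // false: first failure, just recorded
    fmt.Println(t.onListError(start.Add(1 * time.Minute))) // false: still inside the window
    fmt.Println(t.onListError(start.Add(3 * time.Minute))) // true: threshold crossed, handle once
    t.onListSuccess()                                      // a successful pods list resets the tracker
}

The point of the threshold is that a single failed pods list is ignored; only after the node would
already have been marked 'NotReady' and its workloads rescheduled does zedkube clear
ScheduledOnThisNode and publish the updated ENClusterAppStatus.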