From b5364d11cfb3430e3d7c45d1fa98cc32dc3ee841 Mon Sep 17 00:00:00 2001
From: Naiming Shen
Date: Fri, 24 Jan 2025 19:38:34 -0800
Subject: [PATCH] Edge-Node Clustering miscellaneous bug fixes

- handle the ENC App Status and the cluster-reachable condition, with an
  error message in the status
- fix an issue in checkAppsStatus() of using a stale oldStatus{}
- in the multiple-applications case there is a bug, since we changed the
  logic to not always publish the ENClusterAppStatus; we need to use the
  correct oldStatus for each application
- fix the token rotation failure and the wait-for-cluster-status bug; also
  fix a bug in waiting for the bootstrap server status, where we can fall
  into the 'else' condition and get a wrong cert
- if not all pods are ready, do not print the misleading 'applying node
  labels' message; log 'Not all pods are ready' instead
- handle the case of converting to single-node and immediately back to
  cluster mode again; we need to wait for the bootstrap 'cluster' status
  before moving on
- try to fix an issue where 'k3s.log' file rotation does not take effect
  once in a while and the file size cannot be truncated; send a HUP signal
  before truncating the file

Signed-off-by: Naiming Shen
---
 pkg/kube/cluster-init.sh               | 97 ++++++++++++++++++++++----
 pkg/kube/cluster-utils.sh              | 10 +++
 pkg/pillar/cmd/zedagent/parseconfig.go |  5 +-
 pkg/pillar/cmd/zedkube/applogs.go      | 50 +++++++++++--
 pkg/pillar/cmd/zedkube/zedkube.go      |  7 ++
 5 files changed, 146 insertions(+), 23 deletions(-)

diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh
index 443bf5bcad..ddfe0bfbfb 100755
--- a/pkg/kube/cluster-init.sh
+++ b/pkg/kube/cluster-init.sh
@@ -21,6 +21,7 @@ INITIAL_WAIT_TIME=5
 MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds, exponential backoff for k3s restart
 current_wait_time=$INITIAL_WAIT_TIME
 CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing"
+All_PODS_READY=true
 
 # shellcheck source=pkg/kube/descheduler-utils.sh
 . /usr/bin/descheduler-utils.sh
@@ -317,6 +318,31 @@ check_start_containerd() {
     fi
 }
 
+# apply the node-uuid label to the node
+apply_node_uuid_label () {
+    if [ "$All_PODS_READY" = true ]; then
+        logmsg "set node label with uuid $DEVUUID"
+    else
+        logmsg "Not all pods are ready, Continue to wait while applying node labels"
+    fi
+    kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
+}
+
+# reapply the node labels
+reapply_node_labels() {
+    apply_node_uuid_label
+    apply_longhorn_disk_config "$HOSTNAME"
+    # Check if the node with both labels exists, don't assume the above apply worked
+    node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length')
+
+    if [ "$node_count" -gt 0 ]; then
+        logmsg "Node labels re-applied successfully"
+        touch /var/lib/node-labels-initialized
+    else
+        logmsg "Failed to re-apply node labels, on $HOSTNAME, uuid $DEVUUID"
+    fi
+}
+
 # Return success if all pods are Running/Succeeded and Ready
 # Used in install time to control api server load
 # Return unix style 0 for success. (Not 0 for false)
@@ -388,7 +414,7 @@ is_bootstrap=""
 join_serverIP=""
 cluster_token=""
 cluster_node_ip=""
-# for bootstrap node, after reboot to get neighbor node to join
+convert_to_single_node=false
 
 # get the EdgeNodeClusterStatus from zedkube publication
 get_enc_status() {
@@ -413,18 +439,44 @@ get_enc_status() {
     fi
 }
 
 # When transitioning from single node to cluster mode, need change the controller
 # provided token for the cluster
+
+rotate_cluster_token() {
+    local token="$1"
+    /usr/bin/k3s token rotate --new-token "$token"
+    local status=$?
+
+    if [ $status -ne 0 ]; then
+        logmsg "Failed to rotate token. Exit status: $status"
+    else
+        logmsg "Token rotated successfully."
+    fi
+    return $status
+}
+
 change_to_new_token() {
     if [ -n "$cluster_token" ]; then
-        /usr/bin/k3s token rotate --new-token "$cluster_token"
+        logmsg "Rotate cluster token size: ${#cluster_token}"
+        rotate_cluster_token "$cluster_token"
+        # Set the starttime before entering the while loop
+        starttime=$(date +%s)
+
         while true; do
             if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then
                 logmsg "Token change has taken effect."
                 break
             else
-                logmsg "Token has not taken effect yet. Sleeping for 2 seconds..."
-                sleep 2
+                currenttime=$(date +%s)
+                elapsed=$((currenttime - starttime))
+                if [ $elapsed -ge 60 ]; then
+                    # Redo the rotate_cluster_token and reset the starttime
+                    rotate_cluster_token "$cluster_token"
+                    logmsg "Rotate cluster token again by k3s."
+                    starttime=$(date +%s)
+                fi
+                logmsg "Token has not taken effect yet. Sleeping for 5 seconds..."
+                sleep 5
             fi
         done
     else
@@ -581,23 +633,27 @@ EOF
     counter=0
     touch "$CLUSTER_WAIT_FILE"
     while true; do
+        counter=$((counter+1))
         if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then
-            counter=$((counter+1))
             #logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
             # if we are here, check the bootstrap server is single or cluster mode
            if ! status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"); then
                 if [ $((counter % 30)) -eq 1 ]; then
                     logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..."
                 fi
-            elif [ "$status" != "cluster" ]; then
-                if [ $((counter % 30)) -eq 1 ]; then
-                    logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
-                fi
-            else
+            elif [ "$status" = "cluster" ]; then
                 logmsg "Server is in 'cluster' status. done"
                 rm "$CLUSTER_WAIT_FILE"
                 break
+            else
+                if [ $((counter % 30)) -eq 1 ]; then
+                    logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
+                fi
             fi
+        else
+            if [ $((counter % 30)) -eq 1 ]; then
+                logmsg "Attempt $counter: curl to Endpoint https://$join_serverIP:6443 failed. Waiting for 10 seconds..."
+            fi
         fi
         sleep 10
     done
@@ -617,8 +673,14 @@ setup_prereqs
 if [ -f /var/lib/convert-to-single-node ]; then
     logmsg "remove /var/lib and copy saved single node /var/lib"
     restore_var_lib
+    logmsg "wiping unreferenced replicas"
+    rm -rf /persist/vault/volumes/replicas/*
     # assign node-ip to multus nodeIP for yaml config file
     assign_multus_nodeip
+    # set the variable 'convert_to_single_node' to true; in case we immediately
+    # convert back to cluster mode, we need to wait for the bootstrap status
+    # before moving on to cluster mode
+    convert_to_single_node=true
 fi
 # since we can wait for long time, always start the containerd first
 check_start_containerd
@@ -658,8 +720,12 @@ else # a restart case, found all_components_initialized
             fi
         done
         # got the cluster config, make the config.ymal now
-        logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
-        provision_cluster_config_file false
+        logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
+
+        # if we just converted to single node, then we need to wait for the bootstrap
+        # 'cluster' status before moving on to cluster mode
+        provision_cluster_config_file $convert_to_single_node
+        convert_to_single_node=false
         logmsg "provision config.yaml done"
     else # single node mode
         logmsg "Single node mode, prepare config.yaml for $HOSTNAME"
@@ -703,11 +769,14 @@ if [ ! -f /var/lib/all_components_initialized ]; then
         fi
 
         # label the node with device uuid
-        apply_node_uuid_lable
+        apply_node_uuid_label
 
         if ! are_all_pods_ready; then
+            All_PODS_READY=false
+            sleep 10
             continue
         fi
+        All_PODS_READY=true
 
         if [ ! -f /var/lib/multus_initialized ]; then
             if [ ! -f /etc/multus-daemonset-new.yaml ]; then
@@ -817,7 +886,7 @@ else
         fi
     else
         if [ ! -f /var/lib/node-labels-initialized ]; then
-            reapply_node_labes
+            reapply_node_labels
         fi
         # Initialize CNI after k3s reboot
         if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
diff --git a/pkg/kube/cluster-utils.sh b/pkg/kube/cluster-utils.sh
index 56799fd31c..43836c8459 100755
--- a/pkg/kube/cluster-utils.sh
+++ b/pkg/kube/cluster-utils.sh
@@ -43,6 +43,16 @@ check_log_file_size() {
         fi
         # keep the original log file's attributes
         cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
+        # Check if the argument passed is "$K3s_LOG_FILE"; sometimes k3s does
+        # not release the file descriptor, so truncating the file may not take
+        # effect. Send a HUP signal to k3s first.
+        if [ "$1" = "$K3s_LOG_FILE" ]; then
+            k3s_pid=$(pgrep -f "k3s server")
+            if [ -n "$k3s_pid" ]; then
+                kill -HUP "$k3s_pid"
+                logmsg "Sent HUP signal to k3s server before truncate k3s.log size"
+            fi
+        fi
         truncate -s 0 "$K3S_LOG_DIR/$1"
         logmsg "k3s logfile $1, size $currentSize rotate"
     fi
diff --git a/pkg/pillar/cmd/zedagent/parseconfig.go b/pkg/pillar/cmd/zedagent/parseconfig.go
index 9bc7139c6b..11b9fa9faa 100644
--- a/pkg/pillar/cmd/zedagent/parseconfig.go
+++ b/pkg/pillar/cmd/zedagent/parseconfig.go
@@ -159,6 +159,9 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
     parseContentInfoConfig(getconfigCtx, config)
     parseVolumeConfig(getconfigCtx, config)
     parseEvConfig(getconfigCtx, config)
+    // several services are waiting for this NodeInfo at startup; even if we
+    // don't have apps, we need to parse this config first
+    parseEdgeNodeInfo(getconfigCtx, config)
 
     // We have handled the volumes, so we can now process the app instances. But we need to check if
     // we are in the middle of a baseOS upgrade, and if so, we need to skip processing the app instances.
@@ -177,8 +180,6 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
 
     parseDisksConfig(getconfigCtx, config)
 
-    parseEdgeNodeInfo(getconfigCtx, config)
-
     parsePatchEnvelopes(getconfigCtx, config)
 }
 
diff --git a/pkg/pillar/cmd/zedkube/applogs.go b/pkg/pillar/cmd/zedkube/applogs.go
index a994ad0b6d..8fd40abf97 100644
--- a/pkg/pillar/cmd/zedkube/applogs.go
+++ b/pkg/pillar/cmd/zedkube/applogs.go
@@ -126,17 +126,19 @@ func (z *zedkube) checkAppsStatus() {
         return
     }
 
-    options := metav1.ListOptions{
-        FieldSelector: fmt.Sprintf("spec.nodeName=%s", z.nodeName),
-    }
-    pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), options)
+    stItems := z.pubENClusterAppStatus.GetAll()
+
+    pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), metav1.ListOptions{})
     if err != nil {
         log.Errorf("checkAppsStatus: can't get pods %v", err)
+        // If we can't get pods, process the error and return
+        z.handleKubePodsGetError(items, stItems)
         return
     }
 
-    pub := z.pubENClusterAppStatus
-    stItems := pub.GetAll()
+    z.getKubePodsError.getKubePodsErrorTime = time.Time{}
+    z.getKubePodsError.processedErrorCondition = false
+
     var oldStatus *types.ENClusterAppStatus
     for _, item := range items {
         aiconfig := item.(types.AppInstanceConfig)
@@ -153,7 +155,9 @@ func (z *zedkube) checkAppsStatus() {
             contVMIName := "virt-launcher-" + contName
             log.Functionf("checkAppsStatus: pod %s, cont %s", pod.Name, contName)
             if strings.HasPrefix(pod.Name, contName) || strings.HasPrefix(pod.Name, contVMIName) {
-                encAppStatus.ScheduledOnThisNode = true
+                if pod.Spec.NodeName == z.nodeName {
+                    encAppStatus.ScheduledOnThisNode = true
+                }
                 if pod.Status.Phase == corev1.PodRunning {
                     encAppStatus.StatusRunning = true
                 }
@@ -170,6 +174,7 @@ func (z *zedkube) checkAppsStatus() {
         }
         log.Functionf("checkAppsStatus: devname %s, pod (%d) status %+v, old %+v",
             z.nodeName, len(pods.Items), encAppStatus, oldStatus)
+        // Publish if there is a status change
         if oldStatus == nil || oldStatus.IsDNidNode != encAppStatus.IsDNidNode ||
             oldStatus.ScheduledOnThisNode != encAppStatus.ScheduledOnThisNode || oldStatus.StatusRunning != encAppStatus.StatusRunning {
             log.Functionf("checkAppsStatus: status differ, publish")
@@ -177,3 +182,34 @@
         }
     }
 }
+
+func (z *zedkube) handleKubePodsGetError(items, stItems map[string]interface{}) {
+    if z.getKubePodsError.getKubePodsErrorTime.IsZero() {
+        now := time.Now()
+        z.getKubePodsError.getKubePodsErrorTime = now
+        log.Noticef("handleKubePodsGetError: can't get pods, set error time")
+    } else if time.Since(z.getKubePodsError.getKubePodsErrorTime) > 2*time.Minute {
+        // Kubernetes marks the node 'NotReady' after it has been unreachable for
+        // 1 minute, and per the replicaSet policy for the POD/VMI, the App is
+        // rescheduled to another node 30 seconds after the node turns 'NotReady'.
+        // So we use 2 minutes as the threshold here.
+        if !z.getKubePodsError.processedErrorCondition {
+            z.getKubePodsError.processedErrorCondition = true
+            for _, item := range items {
+                aiconfig := item.(types.AppInstanceConfig)
+                for _, st := range stItems {
+                    aiStatus := st.(types.ENClusterAppStatus)
+                    if aiStatus.AppUUID == aiconfig.UUIDandVersion.UUID {
+                        // If we previously published that this app is scheduled on
+                        // this node, we need to reset that, since we have lost the
+                        // connection to kubernetes for longer than it takes the app
+                        // to be migrated to another node
+                        if aiStatus.ScheduledOnThisNode {
+                            aiStatus.ScheduledOnThisNode = false
+                            z.pubENClusterAppStatus.Publish(aiconfig.Key(), aiStatus)
+                            log.Noticef("handleKubePodsGetError: can't get pods set ScheduledOnThisNode off for %s, ", aiconfig.DisplayName)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/pkg/pillar/cmd/zedkube/zedkube.go b/pkg/pillar/cmd/zedkube/zedkube.go
index 8874fac373..c8497d21d0 100644
--- a/pkg/pillar/cmd/zedkube/zedkube.go
+++ b/pkg/pillar/cmd/zedkube/zedkube.go
@@ -38,6 +38,12 @@ var (
     log *base.LogObject
 )
 
+// GetKubePodsError is used to check and handle errors from getting the kube pods
+type GetKubePodsError struct {
+    getKubePodsErrorTime    time.Time
+    processedErrorCondition bool
+}
+
 type zedkube struct {
     agentbase.AgentBase
     globalConfig *types.ConfigItemValueMap
@@ -78,6 +84,7 @@ type zedkube struct {
     electionStopCh     chan struct{}
     statusServer       *http.Server
     statusServerWG     sync.WaitGroup
+    getKubePodsError   GetKubePodsError
     drainOverrideTimer *time.Timer
 
     // Config Properties for Drain
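
Reviewer note, not part of the patch: below is a minimal, self-contained sketch of how the 2-minute
threshold in handleKubePodsGetError() is meant to behave, assuming a simplified tracker with the same
two fields as the GetKubePodsError struct above. The names podsErrorTracker, onListError and
onListSuccess are illustrative only and do not exist in the zedkube code.

package main

import (
    "fmt"
    "time"
)

// podsErrorTracker mirrors the two fields of GetKubePodsError.
type podsErrorTracker struct {
    errorTime          time.Time // zero value means no outstanding pods-list error
    processedCondition bool      // set once the error condition has been handled
}

// onListError records the first failure time and reports whether the
// 2-minute window has elapsed and has not been handled yet.
func (t *podsErrorTracker) onListError(now time.Time) bool {
    if t.errorTime.IsZero() {
        t.errorTime = now
        return false
    }
    if now.Sub(t.errorTime) > 2*time.Minute && !t.processedCondition {
        // handle once: e.g. clear ScheduledOnThisNode and re-publish the app status
        t.processedCondition = true
        return true
    }
    return false
}

// onListSuccess clears the error state, mirroring the reset done in
// checkAppsStatus() when the pods list succeeds again.
func (t *podsErrorTracker) onListSuccess() {
    t.errorTime = time.Time{}
    t.processedCondition = false
}

func main() {
    var t podsErrorTracker
    start := time.Now()
    fmt.Println(t.onListError(start))                      // false: first failure, just recorded
    fmt.Println(t.onListError(start.Add(1 * time.Minute))) // false: still inside the window
    fmt.Println(t.onListError(start.Add(3 * time.Minute))) // true: threshold crossed, handle once
    t.onListSuccess()                                      // a successful pods list resets the tracker
}

The point of the threshold is that a single failed pods list is ignored; only after the node would
already have been marked 'NotReady' and its workloads rescheduled does zedkube clear
ScheduledOnThisNode and publish the updated ENClusterAppStatus.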