Edge-Node Clustering miscellaneous bug fixes
- handle the ENC App Status and cluster-reachable conditions, and report
  the error message in the status
- fix an issue in checkAppsStatus() where a stale oldStatus{} was used
- with multiple applications there is a bug, since we changed the logic
  to not always publish the ENClusterAppStatus; we need to use the
  correct oldStatus for each application
- fix the token rotation failure and the wait-for-cluster-status bug,
  and fix a bug in waiting for the bootstrap server status: we could fall
  into the 'else' condition and get a wrong cert
- if not all pods are ready, do not print the misleading 'applying node
  labels' message; log 'Not all pods are ready' instead
- handle the case of converting to single-node and immediately back to
  cluster mode again; we need to wait for the bootstrap 'cluster' status
  before moving on
- try to fix an issue where 'k3s.log' file rotation occasionally does not
  take effect and the file size cannot be truncated; send a HUP signal
  before truncating the file

Signed-off-by: Naiming Shen <naiming@zededa.com>
naiming-zededa committed Jan 29, 2025
1 parent c49321d commit b5364d1
Showing 5 changed files with 146 additions and 23 deletions.
97 changes: 83 additions & 14 deletions pkg/kube/cluster-init.sh
@@ -21,6 +21,7 @@ INITIAL_WAIT_TIME=5
MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds, exponential backoff for k3s restart
current_wait_time=$INITIAL_WAIT_TIME
CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing"
All_PODS_READY=true

# shellcheck source=pkg/kube/descheduler-utils.sh
. /usr/bin/descheduler-utils.sh
@@ -317,6 +318,31 @@ check_start_containerd() {
fi
}

# apply the node-uuid label to the node
apply_node_uuid_label () {
if [ "$All_PODS_READY" = true ]; then
logmsg "set node label with uuid $DEVUUID"
else
logmsg "Not all pods are ready, Continue to wait while applying node labels"
fi
kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
}

# reapply the node labels
reapply_node_labels() {
apply_node_uuid_label
apply_longhorn_disk_config "$HOSTNAME"
# Check if the node with both labels exists, don't assume above apply worked
node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length')

if [ "$node_count" -gt 0 ]; then
logmsg "Node labels re-applied successfully"
touch /var/lib/node-labels-initialized
else
logmsg "Failed to re-apply node labels, on $HOSTNAME, uuid $DEVUUID"
fi
}

# Return success if all pods are Running/Succeeded and Ready
# Used in install time to control api server load
# Return unix style 0 for success. (Not 0 for false)
@@ -388,7 +414,7 @@ is_bootstrap=""
join_serverIP=""
cluster_token=""
cluster_node_ip=""
# for bootstrap node, after reboot to get neighbor node to join
convert_to_single_node=false

# get the EdgeNodeClusterStatus from zedkube publication
get_enc_status() {
@@ -413,18 +439,44 @@ get_enc_status() {
fi
}


# When transitioning from single node to cluster mode, we need to change the
# controller-provided token for the cluster

rotate_cluster_token() {
local token="$1"
/usr/bin/k3s token rotate --new-token "$token"
local status=$?
if [ $status -ne 0 ]; then
logmsg "Failed to rotate token. Exit status: $status"
else
logmsg "Token rotated successfully."
fi
return $status
}

change_to_new_token() {
if [ -n "$cluster_token" ]; then
/usr/bin/k3s token rotate --new-token "$cluster_token"
logmsg "Rotate cluster token size: ${#cluster_token}"
rotate_cluster_token "$cluster_token"
# Set the starttime before entering the while loop
starttime=$(date +%s)

while true; do
if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then
logmsg "Token change has taken effect."
break
else
logmsg "Token has not taken effect yet. Sleeping for 2 seconds..."
sleep 2
currenttime=$(date +%s)
elapsed=$((currenttime - starttime))
if [ $elapsed -ge 60 ]; then
# Redo the rotate_cluster_token and reset the starttime
rotate_cluster_token "$cluster_token"
logmsg "Rotate cluster token again by k3s."
starttime=$(date +%s)
fi
logmsg "Token has not taken effect yet. Sleeping for 5 seconds..."
sleep 5
fi
done
else
@@ -581,23 +633,27 @@ EOF
counter=0
touch "$CLUSTER_WAIT_FILE"
while true; do
counter=$((counter+1))
if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then
counter=$((counter+1))
#logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
# if we are here, check the bootstrap server is single or cluster mode
if ! status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"); then
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..."
fi
elif [ "$status" != "cluster" ]; then
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
fi
else
elif [ "$status" = "cluster" ]; then
logmsg "Server is in 'cluster' status. done"
rm "$CLUSTER_WAIT_FILE"
break
else
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
fi
fi
else
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: curl to Endpoint https://$join_serverIP:6443 failed. Waiting for 10 seconds..."
fi
fi
sleep 10
done
@@ -617,8 +673,14 @@ setup_prereqs
if [ -f /var/lib/convert-to-single-node ]; then
logmsg "remove /var/lib and copy saved single node /var/lib"
restore_var_lib
logmsg "wiping unreferenced replicas"
rm -rf /persist/vault/volumes/replicas/*
# assign node-ip to multus nodeIP for yaml config file
assign_multus_nodeip
# set the variable 'convert_to_single_node' to true; in case we immediately
# convert back to cluster mode, we need to wait for the bootstrap status
# before moving on to cluster mode
convert_to_single_node=true
fi
# since we can wait for long time, always start the containerd first
check_start_containerd
@@ -658,8 +720,12 @@ else # a restart case, found all_components_initialized
fi
done
# got the cluster config, make the config.ymal now
logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
provision_cluster_config_file false
logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"

# if we just converted to single node, then we need to wait for the bootstrap
# 'cluster' status before moving on to cluster mode
provision_cluster_config_file $convert_to_single_node
convert_to_single_node=false
logmsg "provision config.yaml done"
else # single node mode
logmsg "Single node mode, prepare config.yaml for $HOSTNAME"
@@ -703,11 +769,14 @@ if [ ! -f /var/lib/all_components_initialized ]; then
fi

# label the node with device uuid
apply_node_uuid_lable
apply_node_uuid_label

if ! are_all_pods_ready; then
All_PODS_READY=false
sleep 10
continue
fi
All_PODS_READY=true

if [ ! -f /var/lib/multus_initialized ]; then
if [ ! -f /etc/multus-daemonset-new.yaml ]; then
@@ -817,7 +886,7 @@ else
fi
else
if [ ! -f /var/lib/node-labels-initialized ]; then
reapply_node_labes
reapply_node_labels
fi
# Initialize CNI after k3s reboot
if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
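Taken together, the rotate_cluster_token() and change_to_new_token() hunks above implement a rotate-then-verify pattern: rotate the token, poll the on-disk k3s server token file until the new value appears, and re-issue the rotation if it has not taken effect within a minute. A minimal standalone sketch of that pattern (the logmsg helper and the default k3s token path are assumptions borrowed from the script, not part of this commit's diff) could look like this:

# Sketch only: rotate the k3s cluster token and wait until it is visible in
# the server token file, re-rotating if it has not landed after 60 seconds.
# Assumed: a logmsg helper, /usr/bin/k3s, and the default token path
# /var/lib/rancher/k3s/server/token.
logmsg() { echo "$(date) $*"; }   # stand-in for the script's logmsg

rotate_and_wait_for_token() {
    token="$1"
    /usr/bin/k3s token rotate --new-token "$token" || logmsg "token rotate failed, will retry"
    starttime=$(date +%s)
    while ! grep -q "server:$token" /var/lib/rancher/k3s/server/token; do
        sleep 2
        if [ $(( $(date +%s) - starttime )) -ge 60 ]; then
            # the rotation did not take effect within a minute; issue it again
            /usr/bin/k3s token rotate --new-token "$token"
            starttime=$(date +%s)
        fi
    done
    logmsg "token change has taken effect"
}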
10 changes: 10 additions & 0 deletions pkg/kube/cluster-utils.sh
@@ -43,6 +43,16 @@ check_log_file_size() {
fi
# keep the original log file's attributes
cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
# Check if the argument passed is "$K3s_LOG_FILE"; sometimes k3s does not
# release the file descriptor, so truncating the file may not take
# effect. Send a HUP signal to handle that.
if [ "$1" = "$K3s_LOG_FILE" ]; then
k3s_pid=$(pgrep -f "k3s server")
if [ -n "$k3s_pid" ]; then
kill -HUP "$k3s_pid"
logmsg "Sent HUP signal to k3s server before truncate k3s.log size"
fi
fi
truncate -s 0 "$K3S_LOG_DIR/$1"
logmsg "k3s logfile $1, size $currentSize rotate"
fi
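The check_log_file_size() change above addresses a rotation that silently fails because the writer keeps its file descriptor open: a copy-then-truncate rotation only shrinks the log if the writer reopens it or reacts to SIGHUP. A minimal sketch of that copytruncate-plus-HUP pattern (the log path and the logmsg helper are assumptions, not taken verbatim from this commit) could look like this:

# Sketch only: copytruncate-style rotation with a HUP nudge to the writer
# so the truncation actually takes effect.
# Assumed: a logmsg helper and a writer process matching "k3s server".
logmsg() { echo "$(date) $*"; }   # stand-in for the script's logmsg

rotate_k3s_log() {
    logfile="$1"                    # full path to the log, e.g. ".../k3s.log" (assumed)
    cp -p "$logfile" "$logfile.1"   # keep the original file's attributes on the copy
    k3s_pid=$(pgrep -f "k3s server")
    if [ -n "$k3s_pid" ]; then
        # ask the writer to handle/reopen its log before we truncate it
        kill -HUP "$k3s_pid"
        logmsg "sent HUP to k3s server ($k3s_pid) before truncating $logfile"
    fi
    truncate -s 0 "$logfile"
}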
5 changes: 3 additions & 2 deletions pkg/pillar/cmd/zedagent/parseconfig.go
@@ -159,6 +159,9 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
parseContentInfoConfig(getconfigCtx, config)
parseVolumeConfig(getconfigCtx, config)
parseEvConfig(getconfigCtx, config)
// several services are waiting for this NodeInfo at startup; even if we don't
// have apps, we need to parse this config first
parseEdgeNodeInfo(getconfigCtx, config)

// We have handled the volumes, so we can now process the app instances. But we need to check if
// we are in the middle of a baseOS upgrade, and if so, we need to skip processing the app instances.
@@ -177,8 +180,6 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,

parseDisksConfig(getconfigCtx, config)

parseEdgeNodeInfo(getconfigCtx, config)

parsePatchEnvelopes(getconfigCtx, config)
}

50 changes: 43 additions & 7 deletions pkg/pillar/cmd/zedkube/applogs.go
@@ -126,17 +126,19 @@ func (z *zedkube) checkAppsStatus() {
return
}

options := metav1.ListOptions{
FieldSelector: fmt.Sprintf("spec.nodeName=%s", z.nodeName),
}
pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), options)
stItems := z.pubENClusterAppStatus.GetAll()

pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
log.Errorf("checkAppsStatus: can't get pods %v", err)
// If we can't get pods, process the error and return
z.handleKubePodsGetError(items, stItems)
return
}

pub := z.pubENClusterAppStatus
stItems := pub.GetAll()
z.getKubePodsError.getKubePodsErrorTime = time.Time{}
z.getKubePodsError.processedErrorCondition = false

var oldStatus *types.ENClusterAppStatus
for _, item := range items {
aiconfig := item.(types.AppInstanceConfig)
@@ -153,7 +155,9 @@ func (z *zedkube) checkAppsStatus() {
contVMIName := "virt-launcher-" + contName
log.Functionf("checkAppsStatus: pod %s, cont %s", pod.Name, contName)
if strings.HasPrefix(pod.Name, contName) || strings.HasPrefix(pod.Name, contVMIName) {
encAppStatus.ScheduledOnThisNode = true
if pod.Spec.NodeName == z.nodeName {
encAppStatus.ScheduledOnThisNode = true
}
if pod.Status.Phase == corev1.PodRunning {
encAppStatus.StatusRunning = true
}
@@ -170,10 +174,42 @@ func (z *zedkube) checkAppsStatus() {
}
log.Functionf("checkAppsStatus: devname %s, pod (%d) status %+v, old %+v", z.nodeName, len(pods.Items), encAppStatus, oldStatus)

// Publish if there is a status change
if oldStatus == nil || oldStatus.IsDNidNode != encAppStatus.IsDNidNode ||
oldStatus.ScheduledOnThisNode != encAppStatus.ScheduledOnThisNode || oldStatus.StatusRunning != encAppStatus.StatusRunning {
log.Functionf("checkAppsStatus: status differ, publish")
z.pubENClusterAppStatus.Publish(aiconfig.Key(), encAppStatus)
}
}
}

func (z *zedkube) handleKubePodsGetError(items, stItems map[string]interface{}) {
if z.getKubePodsError.getKubePodsErrorTime.IsZero() {
now := time.Now()
z.getKubePodsError.getKubePodsErrorTime = now
log.Noticef("handleKubePodsGetError: can't get pods, set error time")
} else if time.Since(z.getKubePodsError.getKubePodsErrorTime) > 2*time.Minute {
// Kubernetes marks the node 'NotReady' after it has been unreachable for 1 minute,
// and the replicaSet policy for a POD/VMI reschedules the App to another node
// 30 seconds after the node goes 'NotReady'. So we use 2 minutes as the threshold
if z.getKubePodsError.processedErrorCondition == false {
z.getKubePodsError.processedErrorCondition = true
for _, item := range items {
aiconfig := item.(types.AppInstanceConfig)
for _, st := range stItems {
aiStatus := st.(types.ENClusterAppStatus)
if aiStatus.AppUUID == aiconfig.UUIDandVersion.UUID {
// if we previously published that this app is scheduled on this node,
// we need to reset that, since we have lost the connection to kubernetes
// for longer than it takes the app to be migrated to another node
if aiStatus.ScheduledOnThisNode {
aiStatus.ScheduledOnThisNode = false
z.pubENClusterAppStatus.Publish(aiconfig.Key(), aiStatus)
log.Noticef("handleKubePodsGetError: can't get pods set ScheduledOnThisNode off for %s, ", aiconfig.DisplayName)
}
}
}
}
}
}
}
7 changes: 7 additions & 0 deletions pkg/pillar/cmd/zedkube/zedkube.go
@@ -38,6 +38,12 @@ var (
log *base.LogObject
)

// GetKubePodsError is used to check and handle get kube pods error
type GetKubePodsError struct {
getKubePodsErrorTime time.Time
processedErrorCondition bool
}

type zedkube struct {
agentbase.AgentBase
globalConfig *types.ConfigItemValueMap
@@ -78,6 +84,7 @@ type zedkube struct {
electionStopCh chan struct{}
statusServer *http.Server
statusServerWG sync.WaitGroup
getKubePodsError GetKubePodsError
drainOverrideTimer *time.Timer

// Config Properties for Drain
