Edge-Node Clustering miscellaneous bug fixes #4549

Merged
97 changes: 83 additions & 14 deletions pkg/kube/cluster-init.sh
@@ -21,6 +21,7 @@ INITIAL_WAIT_TIME=5
MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds, exponential backoff for k3s restart
current_wait_time=$INITIAL_WAIT_TIME
CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing"
All_PODS_READY=true

# shellcheck source=pkg/kube/descheduler-utils.sh
. /usr/bin/descheduler-utils.sh
@@ -317,6 +318,31 @@ check_start_containerd() {
fi
}

# apply the node-uuid label to the node
apply_node_uuid_label () {
if [ "$All_PODS_READY" = true ]; then
logmsg "set node label with uuid $DEVUUID"
else
logmsg "Not all pods are ready, Continue to wait while applying node labels"
fi
kubectl label node "$HOSTNAME" node-uuid="$DEVUUID"
}
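
Note that plain "kubectl label" refuses to change a label key that is already set, so on a node being re-labeled with a new uuid the call above can fail. A minimal hedged sketch of an idempotent variant (the --overwrite flag is standard kubectl, but its use here is a suggestion, not part of this PR):

# hedged sketch: idempotent labeling, safe to repeat with a changed uuid
kubectl label node "$HOSTNAME" node-uuid="$DEVUUID" --overwrite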

# reapply the node labels
reapply_node_labels() {
apply_node_uuid_label
apply_longhorn_disk_config "$HOSTNAME"
# Check if the node with both labels exists, don't assume above apply worked
node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length')

if [ "$node_count" -gt 0 ]; then
logmsg "Node labels re-applied successfully"
touch /var/lib/node-labels-initialized
else
logmsg "Failed to re-apply node labels, on $HOSTNAME, uuid $DEVUUID"
fi
}

# Return success if all pods are Running/Succeeded and Ready
# Used in install time to control api server load
# Returns Unix-style 0 for success (non-zero for failure)
@@ -388,7 +414,7 @@ is_bootstrap=""
join_serverIP=""
cluster_token=""
cluster_node_ip=""
# for bootstrap node, after reboot to get neighbor node to join
convert_to_single_node=false

# get the EdgeNodeClusterStatus from zedkube publication
get_enc_status() {
@@ -413,18 +439,44 @@ get_enc_status() {
fi
}


# When transitioning from single-node to cluster mode, we need to change to the
# controller-provided token for the cluster

rotate_cluster_token() {
local token="$1"
/usr/bin/k3s token rotate --new-token "$token"
local status=$?
if [ $status -ne 0 ]; then
logmsg "Failed to rotate token. Exit status: $status"
else
logmsg "Token rotated successfully."
fi
return $status
}
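
For reference, the polling loop below confirms rotation by grepping the on-disk server token. A hedged sketch of the same check done by hand, assuming the k3s secure-token layout K10<ca-hash>::server:<secret> (an assumption about k3s internals, not stated in this PR; $new_token is hypothetical):

# hedged sketch: confirm the rotated token is visible on disk
grep -q "server:$new_token" /var/lib/rancher/k3s/server/token && echo "rotation applied"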

change_to_new_token() {
if [ -n "$cluster_token" ]; then
/usr/bin/k3s token rotate --new-token "$cluster_token"
logmsg "Rotate cluster token size: ${#cluster_token}"
rotate_cluster_token "$cluster_token"
# Set the starttime before entering the while loop
starttime=$(date +%s)

while true; do
if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then
logmsg "Token change has taken effect."
break
else
logmsg "Token has not taken effect yet. Sleeping for 2 seconds..."
sleep 2
currenttime=$(date +%s)
elapsed=$((currenttime - starttime))
if [ $elapsed -ge 60 ]; then
# Redo the rotate_cluster_token and reset the starttime
rotate_cluster_token "$cluster_token"
logmsg "Rotate cluster token again by k3s."
starttime=$(date +%s)
fi
logmsg "Token has not taken effect yet. Sleeping for 5 seconds..."
sleep 5
fi
done
else
@@ -581,23 +633,27 @@ EOF
counter=0
touch "$CLUSTER_WAIT_FILE"
while true; do
counter=$((counter+1))
if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then
counter=$((counter+1))
#logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status"
# if we are here, check whether the bootstrap server is in single or cluster mode
if ! status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"); then
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..."
fi
elif [ "$status" != "cluster" ]; then
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
fi
else
elif [ "$status" = "cluster" ]; then
logmsg "Server is in 'cluster' status. done"
rm "$CLUSTER_WAIT_FILE"
break
else
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..."
fi
fi
else
if [ $((counter % 30)) -eq 1 ]; then
logmsg "Attempt $counter: curl to Endpoint https://$join_serverIP:6443 failed. Waiting for 10 seconds..."
fi
fi
sleep 10
done
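
When debugging a node stuck in this loop, the same two probes can be run by hand; a hedged sketch using the script's own variables:

# hedged sketch: probe the join server the way the wait loop does
curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1 && echo "k3s API reachable"
curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"  # expect the literal string "cluster"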
@@ -617,8 +673,14 @@ setup_prereqs
if [ -f /var/lib/convert-to-single-node ]; then
logmsg "remove /var/lib and copy saved single node /var/lib"
restore_var_lib
logmsg "wiping unreferenced replicas"
rm -rf /persist/vault/volumes/replicas/*
# assign node-ip to multus nodeIP for yaml config file
assign_multus_nodeip
# set 'convert_to_single_node' to true; if we immediately convert back to
# cluster mode, we need to wait for the bootstrap status before moving on
# to cluster mode
convert_to_single_node=true
fi
# since we can wait for long time, always start the containerd first
check_start_containerd
@@ -658,8 +720,12 @@ else # a restart case, found all_components_initialized
fi
done
# got the cluster config, make the config.yaml now
logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"
provision_cluster_config_file false
logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml"

# if we just converted to single node, then we need to wait for the bootstrap
# 'cluster' status before moving on to cluster mode
provision_cluster_config_file $convert_to_single_node
convert_to_single_node=false
logmsg "provision config.yaml done"
else # single node mode
logmsg "Single node mode, prepare config.yaml for $HOSTNAME"
@@ -703,11 +769,14 @@ if [ ! -f /var/lib/all_components_initialized ]; then
fi

# label the node with device uuid
apply_node_uuid_lable
apply_node_uuid_label

if ! are_all_pods_ready; then
All_PODS_READY=false
sleep 10
continue
fi
All_PODS_READY=true

if [ ! -f /var/lib/multus_initialized ]; then
if [ ! -f /etc/multus-daemonset-new.yaml ]; then
@@ -817,7 +886,7 @@ else
fi
else
if [ ! -f /var/lib/node-labels-initialized ]; then
reapply_node_labes
reapply_node_labels
fi
# Initialize CNI after k3s reboot
if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then
10 changes: 10 additions & 0 deletions pkg/kube/cluster-utils.sh
@@ -43,6 +43,16 @@ check_log_file_size() {
fi
# keep the original log file's attributes
cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1"
# If the argument passed is "$K3s_LOG_FILE": sometimes k3s does not release
# the file descriptor, so truncating the file may not take effect. Send a
# HUP signal to nudge it.
if [ "$1" = "$K3s_LOG_FILE" ]; then
k3s_pid=$(pgrep -f "k3s server")
if [ -n "$k3s_pid" ]; then
kill -HUP "$k3s_pid"
logmsg "Sent HUP signal to k3s server before truncate k3s.log size"
fi
fi
truncate -s 0 "$K3S_LOG_DIR/$1"
logmsg "k3s logfile $1, size $currentSize rotate"
fi
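
Whether the HUP actually causes k3s to release or reopen the log descriptor is the uncertain part of this hunk. A hedged way to verify from a shell is to inspect the process's write offset on the log file; a large pos: value immediately after "truncate -s 0" means the old descriptor is still in use (a generic Linux /proc technique, not k3s-specific):

# hedged sketch: inspect the k3s write offset on k3s.log after truncation
k3s_pid=$(pgrep -f "k3s server")
fd=$(ls -l "/proc/$k3s_pid/fd" 2>/dev/null | awk '/k3s.log/ {print $(NF-2)}')
[ -n "$fd" ] && grep pos: "/proc/$k3s_pid/fdinfo/$fd"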
5 changes: 3 additions & 2 deletions pkg/pillar/cmd/zedagent/parseconfig.go
@@ -159,6 +159,9 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,
parseContentInfoConfig(getconfigCtx, config)
parseVolumeConfig(getconfigCtx, config)
parseEvConfig(getconfigCtx, config)
// several services wait for this NodeInfo at startup; even if we don't
// have apps, we need to parse this config first
parseEdgeNodeInfo(getconfigCtx, config)

// We have handled the volumes, so we can now process the app instances. But we need to check if
// we are in the middle of a baseOS upgrade, and if so, we need to skip processing the app instances.
@@ -177,8 +180,6 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,

parseDisksConfig(getconfigCtx, config)

parseEdgeNodeInfo(getconfigCtx, config)

parsePatchEnvelopes(getconfigCtx, config)
}

50 changes: 43 additions & 7 deletions pkg/pillar/cmd/zedkube/applogs.go
@@ -126,17 +126,19 @@ func (z *zedkube) checkAppsStatus() {
return
}

options := metav1.ListOptions{
FieldSelector: fmt.Sprintf("spec.nodeName=%s", z.nodeName),
}
pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), options)
stItems := z.pubENClusterAppStatus.GetAll()

pods, err := clientset.CoreV1().Pods(kubeapi.EVEKubeNameSpace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
log.Errorf("checkAppsStatus: can't get pods %v", err)
// If we can't get pods, process the error and return
z.handleKubePodsGetError(items, stItems)
return
}

pub := z.pubENClusterAppStatus
stItems := pub.GetAll()
z.getKubePodsError.getKubePodsErrorTime = time.Time{}
z.getKubePodsError.processedErrorCondition = false

var oldStatus *types.ENClusterAppStatus
for _, item := range items {
aiconfig := item.(types.AppInstanceConfig)
@@ -153,7 +155,9 @@
contVMIName := "virt-launcher-" + contName
log.Functionf("checkAppsStatus: pod %s, cont %s", pod.Name, contName)
if strings.HasPrefix(pod.Name, contName) || strings.HasPrefix(pod.Name, contVMIName) {
encAppStatus.ScheduledOnThisNode = true
if pod.Spec.NodeName == z.nodeName {
encAppStatus.ScheduledOnThisNode = true
}
if pod.Status.Phase == corev1.PodRunning {
encAppStatus.StatusRunning = true
}
@@ -170,10 +174,42 @@ }
}
log.Functionf("checkAppsStatus: devname %s, pod (%d) status %+v, old %+v", z.nodeName, len(pods.Items), encAppStatus, oldStatus)

// Publish if there is a status change
if oldStatus == nil || oldStatus.IsDNidNode != encAppStatus.IsDNidNode ||
oldStatus.ScheduledOnThisNode != encAppStatus.ScheduledOnThisNode || oldStatus.StatusRunning != encAppStatus.StatusRunning {
log.Functionf("checkAppsStatus: status differ, publish")
z.pubENClusterAppStatus.Publish(aiconfig.Key(), encAppStatus)
}
}
}
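
The behavioral change in this hunk: the old code asked the API server only for pods on this node via a FieldSelector, while the new code lists every pod in the namespace and compares pod.Spec.NodeName per pod, so apps scheduled on other nodes are also observed. A hedged shell equivalent of the two queries (the namespace is a placeholder for whatever kubeapi.EVEKubeNameSpace resolves to):

# old behavior: only this node's pods
kubectl -n <eve-namespace> get pods --field-selector "spec.nodeName=$NODENAME"
# new behavior: all pods in the namespace; node placement checked per pod
kubectl -n <eve-namespace> get pods -o wide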

func (z *zedkube) handleKubePodsGetError(items, stItems map[string]interface{}) {
if z.getKubePodsError.getKubePodsErrorTime.IsZero() {
now := time.Now()
z.getKubePodsError.getKubePodsErrorTime = now
log.Noticef("handleKubePodsGetError: can't get pods, set error time")
} else if time.Since(z.getKubePodsError.getKubePodsErrorTime) > 2*time.Minute {
// Kubernetes marks the node 'NotReady' after it has been unreachable for 1 minute,
// and the replicaSet policy for a POD/VMI reschedules the app to another node
// 30 seconds after the node goes 'NotReady'. So we use 2 minutes as the threshold
if !z.getKubePodsError.processedErrorCondition {
z.getKubePodsError.processedErrorCondition = true
for _, item := range items {
aiconfig := item.(types.AppInstanceConfig)
for _, st := range stItems {
aiStatus := st.(types.ENClusterAppStatus)
if aiStatus.AppUUID == aiconfig.UUIDandVersion.UUID {
// if we previously published that this app is scheduled on this node,
// reset that now, since we have lost the connection to kubernetes for
// longer than it takes the app to be migrated to another node
if aiStatus.ScheduledOnThisNode {
aiStatus.ScheduledOnThisNode = false
z.pubENClusterAppStatus.Publish(aiconfig.Key(), aiStatus)
log.Noticef("handleKubePodsGetError: can't get pods; set ScheduledOnThisNode off for %s", aiconfig.DisplayName)
}
}
}
}
}
}
}
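
The 2-minute threshold is the sum of the two delays cited in the comment, with slack: about 1 minute for the node to be marked 'NotReady' plus about 30 seconds before the Pod/VMI is rescheduled. A hedged way to observe those timings on a live cluster (the timings are cluster-specific settings, not upstream defaults; the namespace is a placeholder):

# hedged sketch: watch failover timing during a node outage
kubectl get nodes -w                              # Ready -> NotReady after ~1 minute
kubectl -n <eve-namespace> get pods -o wide -w    # rescheduling ~30s after NotReady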
2 changes: 0 additions & 2 deletions pkg/pillar/cmd/zedkube/clusterstatus.go
@@ -370,8 +370,6 @@ func (z *zedkube) clusterAppIDHandler(w http.ResponseWriter, r *http.Request) {
log.Errorf("clusterAppIDHandler: error reading response from %s: %v", host, err)
continue
}

// Replace outermost { and } with [ and ] in remoteAppInfoJSON
combinedJSON = combinedJSON + "," + strings.TrimSuffix(string(remoteAppInfoJSON), "\n")
}
}
7 changes: 7 additions & 0 deletions pkg/pillar/cmd/zedkube/zedkube.go
@@ -38,6 +38,12 @@ var (
log *base.LogObject
)

// GetKubePodsError tracks the state used to detect and handle errors from getting kube pods
type GetKubePodsError struct {
getKubePodsErrorTime time.Time
processedErrorCondition bool
}

type zedkube struct {
agentbase.AgentBase
globalConfig *types.ConfigItemValueMap
@@ -78,6 +84,7 @@ type zedkube struct {
electionStopCh chan struct{}
statusServer *http.Server
statusServerWG sync.WaitGroup
getKubePodsError GetKubePodsError
drainOverrideTimer *time.Timer

// Config Properties for Drain