From 277e5af5735516a7e7df3ff1f0a74eaca16ab9fc Mon Sep 17 00:00:00 2001
From: Qian Sun
Date: Thu, 18 Mar 2021 15:53:26 +0800
Subject: [PATCH] Remove redundant judgment during updating node status

Sometimes when a nsx-node-agent pod is created and has been running for less than 180 seconds, the operator will try to update the node status twice (first set network-unavailable=true, then sleep and try to set network-unavailable=false after 180 seconds). The code has a redundant check before sleeping, and in that check the Get API reads the node status from a cache which may not be synced yet after the first update was executed, so an unexpected "Node condition is not changed" is reported and the taints cannot be removed until the removal logic is accidentally triggered by another event from the nsx-node-agent pod. The following logs show this case:

{"level":"info","ts":"2021-03-08T14:56:37.864Z","logger":"status_manager","msg":"nsx-node-agent-p8ss5/nsx-kube-proxy for node compute-2 started for less than 17.864554094s"}
{"level":"info","ts":"2021-03-08T14:56:37.864Z","logger":"status_manager","msg":"nsx-node-agent-p8ss5/nsx-node-agent for node compute-2 started for less than 17.864554094s"}
{"level":"info","ts":"2021-03-08T14:56:37.864Z","logger":"status_manager","msg":"nsx-node-agent-p8ss5/nsx-ovs for node compute-2 started for less than 17.864554094s"}
{"level":"info","ts":"2021-03-08T14:56:37.864Z","logger":"status_manager","msg":"Setting status NetworkUnavailable to true for node compute-2"}
{"level":"info","ts":"2021-03-08T14:56:37.876Z","logger":"status_manager","msg":"Updated node condition NetworkUnavailable to true for node compute-2"}
{"level":"info","ts":"2021-03-08T14:56:37.876Z","logger":"status_manager","msg":"Node condition is not changed"}
...
{"level":"info","ts":"2021-03-08T15:26:13.541Z","logger":"status_manager","msg":"Setting status NetworkUnavailable to false for node compute-2"}
{"level":"info","ts":"2021-03-08T15:26:13.541Z","logger":"status_manager","msg":"Setting status NetworkUnavailable to false for node compute-2 after -26m53.541741583s"}

This patch removes the redundant check and puts the SetNodeConditionFromPod logic in a periodic resync to ensure the taints can always be deleted.
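For reference, a minimal sketch of the periodic-resync pattern the new Reconcile return value relies on (controller-runtime's RequeueAfter). The ResyncPeriod constant and the reconciler type below are illustrative placeholders, not the operator's actual definitions:

    // Sketch only: shows how returning RequeueAfter causes the same request
    // to be reconciled again later, independent of new pod events.
    package example

    import (
        "time"

        "sigs.k8s.io/controller-runtime/pkg/reconcile"
    )

    // Assumed value for illustration; the operator defines its own ResyncPeriod.
    const ResyncPeriod = 2 * time.Minute

    type podReconciler struct{}

    func (r *podReconciler) Reconcile(request reconcile.Request) (reconcile.Result, error) {
        // ... evaluate the nsx-node-agent pod and set the node condition ...

        // Returning RequeueAfter tells controller-runtime to enqueue the same
        // request again after ResyncPeriod, so the NetworkUnavailable condition
        // (and the taint) keeps being re-checked until it can be cleared.
        return reconcile.Result{RequeueAfter: ResyncPeriod}, nil
    }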
---
 pkg/controller/pod/pod_controller.go       |  8 ++++++++
 pkg/controller/statusmanager/pod_status.go | 13 ++++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/pkg/controller/pod/pod_controller.go b/pkg/controller/pod/pod_controller.go
index de9d1ba..b3b6b9d 100644
--- a/pkg/controller/pod/pod_controller.go
+++ b/pkg/controller/pod/pod_controller.go
@@ -180,6 +180,13 @@ func (r *ReconcilePod) isForNsxNodeAgentPod(request reconcile.Request) bool {
 	if (request.Namespace == operatortypes.NsxNamespace && strings.Contains(
 		request.Name, operatortypes.NsxNodeAgentDsName) &&
 		request.Name != operatortypes.NsxNodeAgentDsName) {
+		// Check if the pod exists to prevent a stale pod from being stuck in an endless periodic resync
+		pod := &corev1.Pod{}
+		err := r.client.Get(context.TODO(), request.NamespacedName, pod)
+		if err != nil {
+			log.Info(fmt.Sprintf("Could not retrieve nsx-node-agent pod %s: %v", request.Name, err))
+			return false
+		}
 		return true
 	}
 	return false
@@ -199,6 +206,7 @@ func (r *ReconcilePod) Reconcile(request reconcile.Request) (reconcile.Result, e
 	if r.isForNsxNodeAgentPod(request) {
 		reqLogger.Info("Reconciling pod update for network status")
 		r.status.SetNodeConditionFromPod(request.NamespacedName, r.sharedInfo, nil)
+		return reconcile.Result{RequeueAfter: ResyncPeriod}, nil
 	}
 	return reconcile.Result{}, nil
 }
diff --git a/pkg/controller/statusmanager/pod_status.go b/pkg/controller/statusmanager/pod_status.go
index 8a7ded5..6307cb6 100644
--- a/pkg/controller/statusmanager/pod_status.go
+++ b/pkg/controller/statusmanager/pod_status.go
@@ -144,12 +144,7 @@ func (status *StatusManager) setNodeNetworkUnavailable(nodeName string, ready bo
 	} else {
 		delete(sharedInfo.LastNetworkAvailable, nodeName)
 	}
-	_, changed := status.combineNode(nodeName, ready, reason, message)
-	if !changed {
-		log.Info("Node condition is not changed")
-		return
-	}
-	log.Info(fmt.Sprintf("Setting status NetworkUnavailable to %v for node %s", !ready, nodeName))
+	log.V(1).Info(fmt.Sprintf("Setting status NetworkUnavailable to %v for node %s", !ready, nodeName))
 	if !ready {
 		status.updateNodeStatus(nodeName, ready, reason, message)
 		return
@@ -165,7 +160,7 @@ func (status *StatusManager) setNodeNetworkUnavailable(nodeName string, ready bo
 				nodeName, *startedAt, sharedInfo.LastNetworkAvailable[nodeName]))
 			return
 		}
-		log.Info(fmt.Sprintf("Setting status NetworkUnavailable to %v for node %s after %v", !ready, nodeName, sleepTime))
+		log.V(1).Info(fmt.Sprintf("Setting status NetworkUnavailable to %v for node %s %v after nsx-node-agent running", !ready, nodeName, time.Now().Sub(*startedAt)))
 		status.updateNodeStatus(nodeName, ready, reason, message)
 		return
 	}()
@@ -202,7 +197,7 @@ func (status *StatusManager) updateNodeStatus(nodeName string, ready bool, reaso
 	// need to retrieve the node info, otherwise, the update may fail because the node has been modified during monitor slept
 	node, changed := status.combineNode(nodeName, ready, reason, message)
 	if !changed {
-		log.Info("Node condition is not changed, skip updating")
+		log.V(1).Info(fmt.Sprintf("Node condition is not changed(nodeName: %s, reason: %s, message: %s), skip updating", nodeName, reason, message))
 		return
 	}
 	err := status.client.Status().Update(context.TODO(), node)
@@ -533,4 +528,4 @@ func (status *StatusManager) CombineConditions(conditions *[]configv1.ClusterOpe
 		}
 	}
 	return changed, messages
-}
\ No newline at end of file
+}