Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-32812: When newly built images rolled out, the update progress is not displaying correctly (went 0 --> 3) #4489

Merged
merged 2 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/controller/common/layered_node_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ func (l *LayeredNodeState) isDesiredImageEqualToBuild(mosc *mcfgv1alpha1.Machine
return l.isImageAnnotationEqualToBuild(daemonconsts.DesiredImageAnnotationKey, mosc)
}

// IsCurrentImageEqualToBuild reports whether the node's current-image
// annotation matches the image built for the given MachineOSConfig. It
// mirrors isDesiredImageEqualToBuild but compares against the node's
// *current* (applied) image rather than its desired one.
func (l *LayeredNodeState) IsCurrentImageEqualToBuild(mosc *mcfgv1alpha1.MachineOSConfig) bool {
	return l.isImageAnnotationEqualToBuild(daemonconsts.CurrentImageAnnotationKey, mosc)
}

func (l *LayeredNodeState) isDesiredMachineConfigEqualToBuild(mosb *mcfgv1alpha1.MachineOSBuild) bool {
return l.node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] == mosb.Spec.DesiredConfig.Name

Expand Down
15 changes: 8 additions & 7 deletions pkg/controller/node/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {

// Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes
if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] &&
isNodeDone(curNode) {
isNodeDone(curNode, false) {
ctrl.logPoolNode(pool, curNode, "Completed update to %s", curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey])
changed = true
} else {
Expand Down Expand Up @@ -1010,10 +1010,10 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
}

klog.V(4).Infof("Continuing updates for layered pool %s", pool.Name)
} else {
klog.V(4).Infof("Pool %s is not layered", pool.Name)
}

klog.V(4).Infof("Pool %s is not layered", pool.Name)

nodes, err := ctrl.getNodesForPool(pool)
if err != nil {
if syncErr := ctrl.syncStatusOnly(pool); syncErr != nil {
Expand Down Expand Up @@ -1166,7 +1166,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
if mosb == nil {
if lns.IsDesiredEqualToPool(pool, layered) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
return nil

}
Expand All @@ -1175,7 +1175,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
} else {
if lns.IsDesiredEqualToBuild(mosc, mosb) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: node %s: no update is needed", pool.Name, nodeName)
return nil
}
// ensure this is happening. it might not be.
Expand Down Expand Up @@ -1207,7 +1207,7 @@ func (ctrl *Controller) updateCandidateNode(mosc *mcfgv1alpha1.MachineOSConfig,
func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig, build *mcfgv1alpha1.MachineOSBuild, pool *mcfgv1.MachineConfigPool, nodesInPool []*corev1.Node, maxUnavailable int) ([]*corev1.Node, uint) {
unavail := getUnavailableMachines(nodesInPool, pool, layered, build)
if len(unavail) >= maxUnavailable {
klog.Infof("No nodes available for updates")
klog.V(4).Infof("Pool %s: No nodes available for updates", pool.Name)
return nil, 0
}
capacity := maxUnavailable - len(unavail)
Expand All @@ -1226,7 +1226,7 @@ func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig,
} else {
if lns.IsDesiredEqualToBuild(config, build) {
// If the node's desired annotations match the pool, return directly without updating the node.
klog.Infof("no update is needed")
klog.V(4).Infof("Pool %s: layered node %s: no update is needed", pool.Name, node.Name)
continue
}
}
Expand All @@ -1235,6 +1235,7 @@ func getAllCandidateMachines(layered bool, config *mcfgv1alpha1.MachineOSConfig,
klog.V(4).Infof("node %s skipped during candidate selection as it is currently unscheduled", node.Name)
continue
}
klog.V(4).Infof("Pool %s: selected candidate node %s", pool.Name, node.Name)
nodes = append(nodes, node)
}
// Nodes which are failing to target this config also count against
Expand Down
16 changes: 10 additions & 6 deletions pkg/controller/node/node_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -725,9 +725,9 @@ func TestGetCandidateMachines(t *testing.T) {
helpers.NewNodeBuilder("node-7").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeReady().Node(),
helpers.NewNodeBuilder("node-8").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeReady().Node(),
},
expected: []string{"node-4", "node-5"},
otherCandidates: []string{"node-6"},
capacity: 2,
expected: []string{"node-4"},
otherCandidates: []string{"node-5", "node-6"},
capacity: 1,
layeredPool: true,
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
Expand All @@ -752,9 +752,9 @@ func TestGetCandidateMachines(t *testing.T) {
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
Node(),
},
expected: nil,
otherCandidates: nil,
capacity: 0,
expected: []string{"node-1"},
otherCandidates: []string{"node-2"},
capacity: 1,
layeredPool: true,
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
Expand Down Expand Up @@ -799,11 +799,15 @@ func TestGetCandidateMachines(t *testing.T) {
pool := pb.MachineConfigPool()

// TODO: Double check that mosb, mosc should be nil here and layered should be false
// NOTE: By doing this, we end up ignoring all the layered checks (only MC diffs will be done to
// determine if a node is available and ready for an update). This will need to be reworked.
got := getCandidateMachines(pool, nil, nil, test.nodes, test.progress, false)
nodeNames := getNamesFromNodes(got)
assert.Equal(t, test.expected, nodeNames)

// TODO: Double check that mosb, mosc should be nil here and layered should be false
// NOTE: By doing this, we end up ignoring all the layered checks (only MC diffs will be done to
// determine if a node is available and ready for an update). This will need to be reworked.
allCandidates, capacity := getAllCandidateMachines(false, nil, nil, pool, test.nodes, test.progress)
assert.Equal(t, test.capacity, capacity)
var otherCandidates []string
Expand Down
119 changes: 79 additions & 40 deletions pkg/controller/node/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,33 +145,50 @@ func (ctrl *Controller) calculateStatus(fg featuregates.FeatureGate, mcs []*mcfg
}
continue
}

if cond.Status == metav1.ConditionUnknown {
switch mcfgalphav1.StateProgress(cond.Type) {
case mcfgalphav1.MachineConfigNodeUpdatePrepared:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
case mcfgalphav1.MachineConfigNodeUpdateExecuted:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdatePostActionComplete:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateComplete:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeResumed:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
readyMachines = append(readyMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateCompatible:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
case mcfgalphav1.MachineConfigNodeUpdateDrained:
unavailableMachines = append(unavailableMachines, ourNode)
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateCordoned:
unavailableMachines = append(unavailableMachines, ourNode)
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdated:
updatedMachines = append(updatedMachines, ourNode)
readyMachines = append(readyMachines, ourNode)
/*
// TODO: (djoshy) Rework this block to use MCN conditions correctly. See: https://issues.redhat.com/browse/MCO-1228

// Specifically, the main concerns for the following block are:
// (i) Why are only unknown conditions being evaluated? Shouldn't True/False conditions be used?
// (ii) Multiple conditions can be unknown at the same time, resulting in certain machines being double counted

// Some background:
// The MCN conditions are used to feed MCP statuses if the machine counts add up correctly to the total node count.
// If this check fails, node conditions are used to determine machine counts. The MCN counts calculated in this block
// seem incorrect most of the time, so the controller almost always defaults to using the node condition based counts.
// On occasion, the MCN counts cause a false positive in aformentioned check, resulting in invalid values for the MCP
// statuses. Commenting out this block will force the controller to always use node condition based counts to feed the
// MCP status.


if cond.Status == metav1.ConditionUnknown {
// This switch case will cause a node to be double counted, maybe use a hash for node count
switch mcfgalphav1.StateProgress(cond.Type) {
case mcfgalphav1.MachineConfigNodeUpdatePrepared:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
case mcfgalphav1.MachineConfigNodeUpdateExecuted:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdatePostActionComplete:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateComplete:
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeResumed:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
readyMachines = append(readyMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateCompatible:
updatingMachines = append(updatedMachines, ourNode) //nolint:gocritic
case mcfgalphav1.MachineConfigNodeUpdateDrained:
unavailableMachines = append(unavailableMachines, ourNode)
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdateCordoned:
unavailableMachines = append(unavailableMachines, ourNode)
updatingMachines = append(updatingMachines, ourNode)
case mcfgalphav1.MachineConfigNodeUpdated:
updatedMachines = append(updatedMachines, ourNode)
readyMachines = append(readyMachines, ourNode)
}
}
}
*/
}
}
degradedMachineCount := int32(len(degradedMachines))
Expand Down Expand Up @@ -339,7 +356,7 @@ func isNodeManaged(node *corev1.Node) bool {
}

// isNodeDone returns true if the current == desired and the MCD has marked done.
func isNodeDone(node *corev1.Node) bool {
func isNodeDone(node *corev1.Node, layered bool) bool {
if node.Annotations == nil {
return false
}
Expand All @@ -352,12 +369,37 @@ func isNodeDone(node *corev1.Node) bool {
return false
}

if layered {
// The MachineConfig annotations are loaded on boot-up by the daemon which
// isn't currently done for the image annotations, so the comparisons here
// are a bit more nuanced.
cimage, cok := node.Annotations[daemonconsts.CurrentImageAnnotationKey]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remind me if any of these can exist but be "empty"? For example, if the pool is opted into layering, and then opted out of layering, do these get cleaned up?

Copy link

@sergiordlr sergiordlr Sep 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no way to opt a pool out of OCL yet, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not at the moment, no. There is a PR here for that piece: #4284

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When reverted, the desiredImage and currentImage annotations should be cleared.

dimage, dok := node.Annotations[daemonconsts.DesiredImageAnnotationKey]

// If desired image is not set, but the pool is layered, this node can
// be considered ready for an update. This is the very first time node
// is being opted into layering.
if !dok {
return true
}

// If we're here, we know that a desired image annotation exists.
// If the current image annotation does not exist, it means that the node is
// not "done", as it is doing its very first update as a layered node.
if !cok {
return false
}
// Current image annotation exists; compare with desired values to determine if the node is done
return cconfig == dconfig && cimage == dimage && isNodeMCDState(node, daemonconsts.MachineConfigDaemonStateDone)

}

return cconfig == dconfig && isNodeMCDState(node, daemonconsts.MachineConfigDaemonStateDone)
}

// isNodeDoneAt checks whether a node is fully updated to a targetConfig
func isNodeDoneAt(node *corev1.Node, pool *mcfgv1.MachineConfigPool) bool {
return isNodeDone(node) && node.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] == pool.Spec.Configuration.Name
func isNodeDoneAt(node *corev1.Node, pool *mcfgv1.MachineConfigPool, layered bool) bool {
return isNodeDone(node, layered) && node.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] == pool.Spec.Configuration.Name
}

// isNodeMCDState checks the MCD state against the state parameter
Expand Down Expand Up @@ -388,7 +430,8 @@ func getUpdatedMachines(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node, mo
lns := ctrlcommon.NewLayeredNodeState(node)
if mosb != nil && mosc != nil {
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
if layered && mosbState.IsBuildSuccess() && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name {
// It seems like pool image annotations are no longer being used, so node specific checks were required here
if layered && mosbState.IsBuildSuccess() && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name && isNodeDoneAt(node, pool, layered) && lns.IsCurrentImageEqualToBuild(mosc) {
updated = append(updated, node)
}
} else if lns.IsDoneAt(pool, layered) {
Expand Down Expand Up @@ -441,13 +484,13 @@ func isNodeReady(node *corev1.Node) bool {

// isNodeUnavailable is a helper function for getUnavailableMachines
// See the docs of getUnavailableMachines for more info
func isNodeUnavailable(node *corev1.Node) bool {
func isNodeUnavailable(node *corev1.Node, layered bool) bool {
// Unready nodes are unavailable
if !isNodeReady(node) {
return true
}
// Ready nodes are not unavailable
if isNodeDone(node) {
// If the node is working towards a new image/MC, it is not available
if isNodeDone(node, layered) {
return false
}
// Now we know the node isn't ready - the current config must not
Expand All @@ -471,17 +514,13 @@ func getUnavailableMachines(nodes []*corev1.Node, pool *mcfgv1.MachineConfigPool
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
// if node is unavail, desiredConfigs match, and the build is a success, then we are unavail.
// not sure on this one honestly
if layered && isNodeUnavailable(node) && mosb.Spec.DesiredConfig.Name == pool.Status.Configuration.Name && mosbState.IsBuildSuccess() {
unavail = append(unavail, node)
}
} else {
lns := ctrlcommon.NewLayeredNodeState(node)
if lns.IsUnavailable(pool, layered) {
if layered && isNodeUnavailable(node, layered) && mosb.Spec.DesiredConfig.Name == pool.Spec.Configuration.Name && mosbState.IsBuildSuccess() {
unavail = append(unavail, node)
}
} else if isNodeUnavailable(node, layered) {
unavail = append(unavail, node)
}
}

return unavail
}

Expand Down
13 changes: 8 additions & 5 deletions pkg/controller/node/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithNodeReady().Node(),
},
unavail: []string{"node-0", "node-2"},
unavail: []string{"node-0", "node-2", "node-3"},
layeredPoolWithImage: true,
}, {
name: "Mismatched unlayered node and layered pool with image unavailable",
Expand All @@ -502,9 +502,9 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-1").WithEqualConfigsAndImages(machineConfigV1, imageV1).Node(),
helpers.NewNodeBuilder("node-2").WithEqualConfigsAndImages(machineConfigV1, imageV1).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithNodeReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigsAndImages(machineConfigV0, imageV1).WithNodeReady().Node(),
},
unavail: []string{"node-3"},
unavail: []string{"node-0", "node-2", "node-3"},
layeredPool: true,
}, {
name: "Mismatched layered node and unlayered pool",
Expand All @@ -515,7 +515,7 @@ func TestGetUnavailableMachines(t *testing.T) {
helpers.NewNodeBuilder("node-3").WithEqualConfigs(machineConfigV0).WithEqualImages(imageV1).WithNodeNotReady().Node(),
helpers.NewNodeBuilder("node-4").WithEqualConfigs(machineConfigV0).WithEqualImages(imageV1).WithNodeReady().Node(),
},
unavail: []string{"node-0"},
unavail: []string{"node-0", "node-2", "node-3"},
}, {
// Targets https://issues.redhat.com/browse/OCPBUGS-24705.
name: "nodes working toward layered should not be considered available",
Expand Down Expand Up @@ -550,11 +550,13 @@ func TestGetUnavailableMachines(t *testing.T) {
// Need to set WithNodeReady() on all nodes to avoid short-circuiting.
helpers.NewNodeBuilder("node-0").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV0).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
helpers.NewNodeBuilder("node-1").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV0).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
Expand All @@ -566,11 +568,12 @@ func TestGetUnavailableMachines(t *testing.T) {
Node(),
helpers.NewNodeBuilder("node-3").
WithEqualConfigs(machineConfigV0).
WithDesiredImage(imageV1).WithCurrentImage("").
WithDesiredImage(imageV1).WithCurrentImage(imageV0).
WithMCDState(daemonconsts.MachineConfigDaemonStateDone).
WithNodeReady().
Node(),
},
layeredPool: true,
layeredPoolWithImage: true,
unavail: []string{"node-2", "node-3"},
},
Expand Down