@@ -47,7 +47,7 @@ type UpgradeJobReconciler struct {
47
47
ManagedUpstreamClusterVersionName string
48
48
}
49
49
50
- var ClusterVersionLockAnnotation = managedupgradev1beta1 .GroupVersion .Group + "/upgrade-job"
50
+ var JobLockAnnotation = managedupgradev1beta1 .GroupVersion .Group + "/upgrade-job"
51
51
52
52
const (
53
53
UpgradeJobHookJobTrackerFinalizer = "upgradejobs.managedupgrade.appuio.io/hook-job-tracker"
@@ -96,15 +96,21 @@ func (r *UpgradeJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
96
96
// Don't execute hooks created after the job was finished.
97
97
_ , eserr := r .executeHooks (ctx , & uj , managedupgradev1beta1 .EventSuccess , sc .Reason , sc .LastTransitionTime .Time )
98
98
_ , eferr := r .executeHooks (ctx , & uj , managedupgradev1beta1 .EventFinish , sc .Reason , sc .LastTransitionTime .Time )
99
- return ctrl.Result {}, multierr .Combine (eserr , eferr , r .cleanupLock (ctx , & uj ))
99
+ return ctrl.Result {}, multierr .Combine (
100
+ eserr ,
101
+ eferr ,
102
+ // Prevent pools that did not have any changes/updates from being paused indefinitely.
103
+ r .cleanupMachineConfigPools (ctx , uj ),
104
+ r .cleanupLock (ctx , uj ),
105
+ )
100
106
}
101
107
fc := apimeta .FindStatusCondition (uj .Status .Conditions , managedupgradev1beta1 .UpgradeJobConditionFailed )
102
108
if fc != nil && fc .Status == metav1 .ConditionTrue {
103
109
// Ignore hooks status, they can't influence the upgrade anymore.
104
110
// Don't execute hooks created after the job was finished.
105
111
_ , efaerr := r .executeHooks (ctx , & uj , managedupgradev1beta1 .EventFailure , fc .Reason , fc .LastTransitionTime .Time )
106
112
_ , efierr := r .executeHooks (ctx , & uj , managedupgradev1beta1 .EventFinish , fc .Reason , fc .LastTransitionTime .Time )
107
- return ctrl.Result {}, multierr .Combine (efaerr , efierr , r .cleanupLock (ctx , & uj ))
113
+ return ctrl.Result {}, multierr .Combine (efaerr , efierr , r .cleanupLock (ctx , uj ))
108
114
}
109
115
110
116
cont , err := r .executeHooks (ctx , & uj , managedupgradev1beta1 .EventCreate , "" , time.Time {})
@@ -195,7 +201,7 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
195
201
return ctrl.Result {}, fmt .Errorf ("failed to lock cluster version: %w" , err )
196
202
}
197
203
198
- if err := r .pauseUnpauseMachineConfigPools (ctx , uj , false ); err != nil {
204
+ if err := r .pauseUnpauseMachineConfigPools (ctx , uj ); err != nil {
199
205
return ctrl.Result {}, fmt .Errorf ("failed to pause machine config pools: %w" , err )
200
206
}
201
207
@@ -319,11 +325,6 @@ func (r *UpgradeJobReconciler) reconcileStartedJob(ctx context.Context, uj *mana
319
325
return ctrl.Result {}, nil
320
326
}
321
327
322
- // Ensure pools that were paused but did not need an upgrade are unpaused
323
- if err := r .pauseUnpauseMachineConfigPools (ctx , uj , true ); err != nil {
324
- return ctrl.Result {}, fmt .Errorf ("failed to ensure machine config pools are unpaused: %w" , err )
325
- }
326
-
327
328
// Set the upgrade as successful
328
329
r .setStatusCondition (& uj .Status .Conditions , metav1.Condition {
329
330
Type : managedupgradev1beta1 .UpgradeJobConditionSucceeded ,
@@ -373,7 +374,7 @@ func JobFromClusterVersionMapper(c client.Reader, cvName string) handler.MapFunc
373
374
// upgradeJobNameFromLockedClusterVersion returns the upgrade job name from the locked cluster version.
374
375
// If the cluster version is not locked, it returns false.
375
376
func upgradeJobNameFromLockedClusterVersion (cv configv1.ClusterVersion ) (ok bool , nn types.NamespacedName ) {
376
- job := cv .GetAnnotations ()[ClusterVersionLockAnnotation ]
377
+ job := cv .GetAnnotations ()[JobLockAnnotation ]
377
378
if job == "" {
378
379
return false , types.NamespacedName {}
379
380
}
@@ -441,17 +442,17 @@ func (r *UpgradeJobReconciler) runHealthCheck(
441
442
return true , r .Status ().Update (ctx , uj )
442
443
}
443
444
444
- func (r * UpgradeJobReconciler ) cleanupLock (ctx context.Context , uj * managedupgradev1beta1.UpgradeJob ) error {
445
+ func (r * UpgradeJobReconciler ) cleanupLock (ctx context.Context , uj managedupgradev1beta1.UpgradeJob ) error {
445
446
var version configv1.ClusterVersion
446
447
if err := r .Get (ctx , types.NamespacedName {
447
448
Name : r .ManagedUpstreamClusterVersionName ,
448
449
}, & version ); err != nil {
449
450
return fmt .Errorf ("failed to get cluster version: %w" , err )
450
451
}
451
452
452
- lockingJob , hasLockingJob := version .Annotations [ClusterVersionLockAnnotation ]
453
+ lockingJob , hasLockingJob := version .Annotations [JobLockAnnotation ]
453
454
if hasLockingJob && lockingJob == uj .Namespace + "/" + uj .Name {
454
- delete (version .Annotations , ClusterVersionLockAnnotation )
455
+ delete (version .Annotations , JobLockAnnotation )
455
456
if err := r .Update (ctx , & version ); err != nil {
456
457
return fmt .Errorf ("failed to unlock cluster version: %w" , err )
457
458
}
@@ -466,11 +467,11 @@ func (r *UpgradeJobReconciler) tryLockClusterVersion(ctx context.Context, versio
466
467
version .Annotations = map [string ]string {}
467
468
}
468
469
469
- lockingJob , hasLockingJob := version .Annotations [ClusterVersionLockAnnotation ]
470
+ lockingJob , hasLockingJob := version .Annotations [JobLockAnnotation ]
470
471
if hasLockingJob && lockingJob != lockVal {
471
472
return fmt .Errorf ("cluster version is locked by %s" , lockingJob )
472
473
} else if ! hasLockingJob {
473
- version .Annotations [ClusterVersionLockAnnotation ] = lockVal
474
+ version .Annotations [JobLockAnnotation ] = lockVal
474
475
// There is no race condition between the Get and Update calls because the server will reject the update with a Conflict error if the resource has been modified since the Get call.
475
476
if err := r .Client .Update (ctx , version ); err != nil {
476
477
return fmt .Errorf ("failed to lock cluster version: %w" , err )
@@ -873,10 +874,9 @@ func findTrackedHookJob(ujhookName, event string, uj managedupgradev1beta1.Upgra
873
874
874
875
// pauseUnpauseMachineConfigPools pauses or unpauses the machine config pools that match the given selectors in .Spec.MachineConfigPools and have a delay set.
875
876
// The decision to pause or unpause is based on `pool.DelayUpgrade.DelayMin` relative to the startAfter time of the upgrade job.
876
- // If ensureUnpause is true, it will unpause the pools even if the delay has not expired.
877
877
// It sets a timeout condition and returns an error if the delay is expired.
878
878
// It also returns an error if the machine config pools cannot be listed or updated.
879
- func (r * UpgradeJobReconciler ) pauseUnpauseMachineConfigPools (ctx context.Context , uj * managedupgradev1beta1.UpgradeJob , ensureUnpause bool ) error {
879
+ func (r * UpgradeJobReconciler ) pauseUnpauseMachineConfigPools (ctx context.Context , uj * managedupgradev1beta1.UpgradeJob ) error {
880
880
l := log .FromContext (ctx ).WithName ("UpgradeJobReconciler.pauseUnpauseMachineConfigPools" )
881
881
882
882
var controllerManagesPools bool
@@ -887,8 +887,8 @@ func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Contex
887
887
}
888
888
timeSinceStart := r .timeSinceStartAfter (uj )
889
889
beforeMinDelay := timeSinceStart < pool .DelayUpgrade .DelayMin .Duration
890
- shouldPause := ! ensureUnpause && beforeMinDelay
891
- l = l .WithValues ("poolconfig_matchLabels" , pool .MatchLabels , "shouldPause" , shouldPause , "beforeMinDelay" , beforeMinDelay , "ensureUnpause" , ensureUnpause , " timeSinceStart" , timeSinceStart )
890
+ shouldPause := beforeMinDelay
891
+ l = l .WithValues ("poolconfig_matchLabels" , pool .MatchLabels , "shouldPause" , shouldPause , "beforeMinDelay" , beforeMinDelay , "timeSinceStart" , timeSinceStart )
892
892
893
893
sel , err := metav1 .LabelSelectorAsSelector (pool .MatchLabels )
894
894
if err != nil {
@@ -919,6 +919,11 @@ func (r *UpgradeJobReconciler) pauseUnpauseMachineConfigPools(ctx context.Contex
919
919
}
920
920
if mcp .Spec .Paused != shouldPause {
921
921
l .Info ("Updating machine config pools pause field" , "from" , mcp .Spec .Paused , "to" , shouldPause )
922
+ if mcp .Annotations == nil {
923
+ mcp .Annotations = map [string ]string {}
924
+ }
925
+ // Mark the MCP as managed by the upgrade job for later cleanup
926
+ mcp .Annotations [JobLockAnnotation ] = uj .Namespace + "/" + uj .Name
922
927
mcp .Spec .Paused = shouldPause
923
928
if err := r .Update (ctx , & mcp ); err != nil {
924
929
return fmt .Errorf ("failed to pause/unpause machine config pool %q: %w" , mcp .Name , err )
@@ -1040,3 +1045,30 @@ func (r *UpgradeJobReconciler) checkAndMarkSkipped(ctx context.Context, uj manag
1040
1045
}
1041
1046
return false , nil
1042
1047
}
1048
+
1049
+ // cleanupMachineConfigPools removes the JobLockAnnotation from all machine config pools that have it set to the upgrade job and unpauses annotated pools if they are paused.
1050
+ func (r * UpgradeJobReconciler ) cleanupMachineConfigPools (ctx context.Context , uj managedupgradev1beta1.UpgradeJob ) error {
1051
+ l := log .FromContext (ctx ).WithName ("UpgradeJobReconciler.cleanupMachineConfigPools" )
1052
+
1053
+ var mcpl machineconfigurationv1.MachineConfigPoolList
1054
+ if err := r .List (ctx , & mcpl ); err != nil {
1055
+ return fmt .Errorf ("failed to list machine config pools: %w" , err )
1056
+ }
1057
+
1058
+ errs := make ([]error , 0 , len (mcpl .Items ))
1059
+ for _ , mcp := range mcpl .Items {
1060
+ if mcp .Annotations [JobLockAnnotation ] != uj .Namespace + "/" + uj .Name {
1061
+ continue
1062
+ }
1063
+ delete (mcp .Annotations , JobLockAnnotation )
1064
+ if mcp .Spec .Paused {
1065
+ l .Info ("unpausing machine config pool" , "pool" , mcp .Name )
1066
+ mcp .Spec .Paused = false
1067
+ }
1068
+ if err := r .Update (ctx , & mcp ); err != nil {
1069
+ errs = append (errs , fmt .Errorf ("failed to cleanup machine config pool %q: %w" , mcp .Name , err ))
1070
+ }
1071
+ }
1072
+
1073
+ return multierr .Combine (errs ... )
1074
+ }
0 commit comments