From a0b526bc43eb7c9dfd5b04b36bdf25a4b246acdc Mon Sep 17 00:00:00 2001 From: Hugo Shaka Date: Wed, 8 Jan 2025 16:51:04 -0500 Subject: [PATCH] autoupdate rollout: honour the maintenance window duration (#50745) * autoupdate rollout: honour the maintenance window duration * Update lib/autoupdate/rollout/reconciler.go Co-authored-by: Bartosz Leper * Address feedback * Update lib/autoupdate/rollout/strategy.go --------- Co-authored-by: Bartosz Leper --- lib/autoupdate/rollout/reconciler.go | 21 ++--- lib/autoupdate/rollout/reconciler_test.go | 2 +- lib/autoupdate/rollout/strategy.go | 19 +++-- .../rollout/strategy_haltonerror.go | 9 ++- .../rollout/strategy_haltonerror_test.go | 2 +- lib/autoupdate/rollout/strategy_test.go | 76 ++++++++++++++----- lib/autoupdate/rollout/strategy_timebased.go | 14 +++- .../rollout/strategy_timebased_test.go | 8 +- 8 files changed, 106 insertions(+), 45 deletions(-) diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go index 02f393a58ff8d..2a3282fa4670b 100644 --- a/lib/autoupdate/rollout/reconciler.go +++ b/lib/autoupdate/rollout/reconciler.go @@ -202,11 +202,12 @@ func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAge } return &autoupdate.AutoUpdateAgentRolloutSpec{ - StartVersion: version.GetStartVersion(), - TargetVersion: version.GetTargetVersion(), - Schedule: version.GetSchedule(), - AutoupdateMode: mode, - Strategy: strategy, + StartVersion: version.GetStartVersion(), + TargetVersion: version.GetTargetVersion(), + Schedule: version.GetSchedule(), + AutoupdateMode: mode, + Strategy: strategy, + MaintenanceWindowDuration: config.GetMaintenanceWindowDuration(), }, nil } @@ -318,7 +319,7 @@ func (r *reconciler) computeStatus( } status.Groups = groups - err = r.progressRollout(ctx, newSpec.GetStrategy(), status, now) + err = r.progressRollout(ctx, newSpec, status, now) // Failing to progress the update is not a hard failure. // We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user. if err != nil { @@ -334,13 +335,13 @@ func (r *reconciler) computeStatus( // groups are updated in place. // If an error is returned, the groups should still be upserted, depending on the strategy, // failing to update a group might not be fatal (other groups can still progress independently). -func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { +func (r *reconciler) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { for _, strategy := range r.rolloutStrategies { - if strategy.name() == strategyName { - return strategy.progressRollout(ctx, status, now) + if strategy.name() == spec.GetStrategy() { + return strategy.progressRollout(ctx, spec, status, now) } } - return trace.NotImplemented("rollout strategy %q not implemented", strategyName) + return trace.NotImplemented("rollout strategy %q not implemented", spec.GetStrategy()) } // makeGroupStatus creates the autoupdate_agent_rollout.status.groups based on the autoupdate_config. diff --git a/lib/autoupdate/rollout/reconciler_test.go b/lib/autoupdate/rollout/reconciler_test.go index c2739685b7f72..af14136a3d156 100644 --- a/lib/autoupdate/rollout/reconciler_test.go +++ b/lib/autoupdate/rollout/reconciler_test.go @@ -714,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string { return f.strategyName } -func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { +func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { f.calls++ return nil } diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go index d2f8c2da81f93..d5b8236ce8f90 100644 --- a/lib/autoupdate/rollout/strategy.go +++ b/lib/autoupdate/rollout/strategy.go @@ -40,10 +40,14 @@ const ( // This interface allows us to inject dummy strategies for simpler testing. type rolloutStrategy interface { name() string - progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error + // progressRollout takes the new rollout spec, existing rollout status and current time. + // It updates the status resource in-place to progress the rollout to the next step if possible/needed. + progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutSpec, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error } -func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) { +// inWindow checks if the time is in the group's maintenance window. +// The maintenance window is the semi-open interval: [windowStart, windowEnd). +func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, duration time.Duration) (bool, error) { dayOK, err := canUpdateToday(group.ConfigDays, now) if err != nil { return false, trace.Wrap(err, "checking the day of the week") @@ -51,17 +55,22 @@ func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time if !dayOK { return false, nil } - return int(group.ConfigStartHour) == now.Hour(), nil + + // We compute the theoretical window start and end + windowStart := now.Truncate(24 * time.Hour).Add(time.Duration(group.ConfigStartHour) * time.Hour) + windowEnd := windowStart.Add(duration) + + return !now.Before(windowStart) && now.Before(windowEnd), nil } // rolloutChangedInWindow checks if the rollout got created after the theoretical group start time -func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) { +func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time, duration time.Duration) (bool, error) { // If the rollout is older than 24h, we know it did not change during the window if now.Sub(rolloutStart) > 24*time.Hour { return false, nil } // Else we check if the rollout happened in the group window. - return inWindow(group, rolloutStart) + return inWindow(group, rolloutStart, duration) } func canUpdateToday(allowedDays []string, now time.Time) (bool, error) { diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go index 6ed1a4aae049d..fafc5d5ae30d3 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror.go +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -35,6 +35,7 @@ const ( updateReasonPreviousGroupsNotDone = "previous_groups_not_done" updateReasonUpdateComplete = "update_complete" updateReasonUpdateInProgress = "update_in_progress" + haltOnErrorWindowDuration = time.Hour ) type haltOnErrorStrategy struct { @@ -54,7 +55,7 @@ func newHaltOnErrorStrategy(log *slog.Logger) (rolloutStrategy, error) { }, nil } -func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { +func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, _ *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { // We process every group in order, all the previous groups must be in the DONE state // for the next group to become active. Even if some early groups are not DONE, // later groups might be ACTIVE and need to transition to DONE, so we cannot @@ -81,7 +82,7 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autou } // Check if the rollout got created after the theoretical group start time - rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime()) + rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), haltOnErrorWindowDuration) if err != nil { setGroupState(group, group.State, updateReasonReconcilerError, now) return err @@ -149,14 +150,14 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout } } - return inWindow(group, now) + return inWindow(group, now, haltOnErrorWindowDuration) } func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) { // Currently we don't implement status reporting from groups/agents. // So we just wait 60 minutes and consider the maintenance done. // This will change as we introduce agent status report and aggregated agent counts. - if group.StartTime.AsTime().Add(time.Hour).Before(now) { + if group.StartTime.AsTime().Add(haltOnErrorWindowDuration).Before(now) { return true, updateReasonUpdateComplete } return false, updateReasonUpdateInProgress diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go index ee3eb8e80ffca..2f59534ddd7db 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror_test.go +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -500,7 +500,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { State: 0, StartTime: tt.rolloutStartTime, } - err := strategy.progressRollout(ctx, status, clock.Now()) + err := strategy.progressRollout(ctx, nil, status, clock.Now()) require.NoError(t, err) // We use require.Equal instead of Elements match because group order matters. // It's not super important for time-based, but is crucial for halt-on-error. diff --git a/lib/autoupdate/rollout/strategy_test.go b/lib/autoupdate/rollout/strategy_test.go index 1348716ba6c1d..0711d4043ae9c 100644 --- a/lib/autoupdate/rollout/strategy_test.go +++ b/lib/autoupdate/rollout/strategy_test.go @@ -95,11 +95,12 @@ func Test_canUpdateToday(t *testing.T) { func Test_inWindow(t *testing.T) { tests := []struct { - name string - group *autoupdate.AutoUpdateAgentRolloutStatusGroup - now time.Time - want bool - wantErr require.ErrorAssertionFunc + name string + group *autoupdate.AutoUpdateAgentRolloutStatusGroup + now time.Time + duration time.Duration + want bool + wantErr require.ErrorAssertionFunc }{ { name: "out of window", @@ -107,9 +108,10 @@ func Test_inWindow(t *testing.T) { ConfigDays: everyWeekdayButSunday, ConfigStartHour: matchingStartHour, }, - now: testSunday, - want: false, - wantErr: require.NoError, + now: testSunday, + duration: time.Hour, + want: false, + wantErr: require.NoError, }, { name: "inside window, wrong hour", @@ -117,9 +119,10 @@ func Test_inWindow(t *testing.T) { ConfigDays: everyWeekday, ConfigStartHour: nonMatchingStartHour, }, - now: testSunday, - want: false, - wantErr: require.NoError, + now: testSunday, + duration: time.Hour, + want: false, + wantErr: require.NoError, }, { name: "inside window, correct hour", @@ -127,9 +130,10 @@ func Test_inWindow(t *testing.T) { ConfigDays: everyWeekday, ConfigStartHour: matchingStartHour, }, - now: testSunday, - want: true, - wantErr: require.NoError, + now: testSunday, + duration: time.Hour, + want: true, + wantErr: require.NoError, }, { name: "invalid weekdays", @@ -137,14 +141,48 @@ func Test_inWindow(t *testing.T) { ConfigDays: []string{"HelloThereGeneralKenobi"}, ConfigStartHour: matchingStartHour, }, - now: testSunday, - want: false, - wantErr: require.Error, + now: testSunday, + duration: time.Hour, + want: false, + wantErr: require.Error, + }, + { + name: "short window", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + ConfigDays: everyWeekday, + ConfigStartHour: matchingStartHour, + }, + now: testSunday, + duration: time.Second, + want: false, + wantErr: require.NoError, + }, + { + name: "window start time is included", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + ConfigDays: everyWeekday, + ConfigStartHour: matchingStartHour, + }, + now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour) * time.Hour), + duration: time.Hour, + want: true, + wantErr: require.NoError, + }, + { + name: "window end time is not included", + group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{ + ConfigDays: everyWeekday, + ConfigStartHour: matchingStartHour, + }, + now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour+1) * time.Hour), + duration: time.Hour, + want: false, + wantErr: require.NoError, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := inWindow(tt.group, tt.now) + got, err := inWindow(tt.group, tt.now, tt.duration) tt.wantErr(t, err) require.Equal(t, tt.want, got) }) @@ -205,7 +243,7 @@ func Test_rolloutChangedInWindow(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Test execution. - result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart) + result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart, time.Hour) require.NoError(t, err) require.Equal(t, tt.want, result) }) diff --git a/lib/autoupdate/rollout/strategy_timebased.go b/lib/autoupdate/rollout/strategy_timebased.go index 5d06adf0e5ace..e4df5c6e23789 100644 --- a/lib/autoupdate/rollout/strategy_timebased.go +++ b/lib/autoupdate/rollout/strategy_timebased.go @@ -51,7 +51,13 @@ func newTimeBasedStrategy(log *slog.Logger) (rolloutStrategy, error) { }, nil } -func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { +func (h *timeBasedStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error { + windowDuration := spec.GetMaintenanceWindowDuration().AsDuration() + // Backward compatibility for resources previously created without duration. + if windowDuration == 0 { + windowDuration = haltOnErrorWindowDuration + } + // We always process every group regardless of the order. var errs []error for _, group := range status.Groups { @@ -61,7 +67,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd // We start any group unstarted group in window. // Done groups can transition back to active if they enter their maintenance window again. // Some agents might have missed the previous windows and might expected to try again. - shouldBeActive, err := inWindow(group, now) + shouldBeActive, err := inWindow(group, now, windowDuration) if err != nil { // In time-based rollouts, groups are not dependent. // Failing to transition a group should affect other groups. @@ -72,7 +78,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd } // Check if the rollout got created after the theoretical group start time - rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime()) + rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), windowDuration) if err != nil { setGroupState(group, group.State, updateReasonReconcilerError, now) errs = append(errs, err) @@ -93,7 +99,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE: // The group is currently being updated. We check if the maintenance // is over and if we should transition it to the done state - shouldBeActive, err := inWindow(group, now) + shouldBeActive, err := inWindow(group, now, windowDuration) if err != nil { // In time-based rollouts, groups are not dependent. // Failing to transition a group should affect other groups. diff --git a/lib/autoupdate/rollout/strategy_timebased_test.go b/lib/autoupdate/rollout/strategy_timebased_test.go index 84367f9927c04..6fa6245598a15 100644 --- a/lib/autoupdate/rollout/strategy_timebased_test.go +++ b/lib/autoupdate/rollout/strategy_timebased_test.go @@ -25,6 +25,7 @@ import ( "github.com/jonboulle/clockwork" "github.com/stretchr/testify/require" + "google.golang.org/protobuf/types/known/durationpb" "google.golang.org/protobuf/types/known/timestamppb" "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" @@ -325,6 +326,11 @@ func Test_progressGroupsTimeBased(t *testing.T) { }, }, } + + spec := &autoupdate.AutoUpdateAgentRolloutSpec{ + MaintenanceWindowDuration: durationpb.New(time.Hour), + } + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { status := &autoupdate.AutoUpdateAgentRolloutStatus{ @@ -332,7 +338,7 @@ func Test_progressGroupsTimeBased(t *testing.T) { State: 0, StartTime: tt.rolloutStartTime, } - err := strategy.progressRollout(ctx, status, clock.Now()) + err := strategy.progressRollout(ctx, spec, status, clock.Now()) require.NoError(t, err) // We use require.Equal instead of Elements match because group order matters. // It's not super important for time-based, but is crucial for halt-on-error.