From a0b526bc43eb7c9dfd5b04b36bdf25a4b246acdc Mon Sep 17 00:00:00 2001
From: Hugo Shaka <hugo.hervieux@goteleport.com>
Date: Wed, 8 Jan 2025 16:51:04 -0500
Subject: [PATCH] autoupdate rollout: honour the maintenance window duration
 (#50745)

* autoupdate rollout: honour the maintenance window duration

* Update lib/autoupdate/rollout/reconciler.go

Co-authored-by: Bartosz Leper <bartosz.leper@goteleport.com>

* Address feedback

* Update lib/autoupdate/rollout/strategy.go

---------

Co-authored-by: Bartosz Leper <bartosz.leper@goteleport.com>
---
 lib/autoupdate/rollout/reconciler.go          | 21 ++---
 lib/autoupdate/rollout/reconciler_test.go     |  2 +-
 lib/autoupdate/rollout/strategy.go            | 19 +++--
 .../rollout/strategy_haltonerror.go           |  9 ++-
 .../rollout/strategy_haltonerror_test.go      |  2 +-
 lib/autoupdate/rollout/strategy_test.go       | 76 ++++++++++++++-----
 lib/autoupdate/rollout/strategy_timebased.go  | 14 +++-
 .../rollout/strategy_timebased_test.go        |  8 +-
 8 files changed, 106 insertions(+), 45 deletions(-)

diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go
index 02f393a58ff8d..2a3282fa4670b 100644
--- a/lib/autoupdate/rollout/reconciler.go
+++ b/lib/autoupdate/rollout/reconciler.go
@@ -202,11 +202,12 @@ func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAge
 	}
 
 	return &autoupdate.AutoUpdateAgentRolloutSpec{
-		StartVersion:   version.GetStartVersion(),
-		TargetVersion:  version.GetTargetVersion(),
-		Schedule:       version.GetSchedule(),
-		AutoupdateMode: mode,
-		Strategy:       strategy,
+		StartVersion:              version.GetStartVersion(),
+		TargetVersion:             version.GetTargetVersion(),
+		Schedule:                  version.GetSchedule(),
+		AutoupdateMode:            mode,
+		Strategy:                  strategy,
+		MaintenanceWindowDuration: config.GetMaintenanceWindowDuration(),
 	}, nil
 
 }
@@ -318,7 +319,7 @@ func (r *reconciler) computeStatus(
 	}
 	status.Groups = groups
 
-	err = r.progressRollout(ctx, newSpec.GetStrategy(), status, now)
+	err = r.progressRollout(ctx, newSpec, status, now)
 	// Failing to progress the update is not a hard failure.
 	// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
 	if err != nil {
@@ -334,13 +335,13 @@ func (r *reconciler) computeStatus(
 // groups are updated in place.
 // If an error is returned, the groups should still be upserted, depending on the strategy,
 // failing to update a group might not be fatal (other groups can still progress independently).
-func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
+func (r *reconciler) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
 	for _, strategy := range r.rolloutStrategies {
-		if strategy.name() == strategyName {
-			return strategy.progressRollout(ctx, status, now)
+		if strategy.name() == spec.GetStrategy() {
+			return strategy.progressRollout(ctx, spec, status, now)
 		}
 	}
-	return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
+	return trace.NotImplemented("rollout strategy %q not implemented", spec.GetStrategy())
 }
 
 // makeGroupStatus creates the autoupdate_agent_rollout.status.groups based on the autoupdate_config.
diff --git a/lib/autoupdate/rollout/reconciler_test.go b/lib/autoupdate/rollout/reconciler_test.go
index c2739685b7f72..af14136a3d156 100644
--- a/lib/autoupdate/rollout/reconciler_test.go
+++ b/lib/autoupdate/rollout/reconciler_test.go
@@ -714,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
 	return f.strategyName
 }
 
-func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
+func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
 	f.calls++
 	return nil
 }
diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go
index d2f8c2da81f93..d5b8236ce8f90 100644
--- a/lib/autoupdate/rollout/strategy.go
+++ b/lib/autoupdate/rollout/strategy.go
@@ -40,10 +40,14 @@ const (
 // This interface allows us to inject dummy strategies for simpler testing.
 type rolloutStrategy interface {
 	name() string
-	progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
+	// progressRollout takes the new rollout spec, existing rollout status and current time.
+	// It updates the status resource in-place to progress the rollout to the next step if possible/needed.
+	progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutSpec, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
 }
 
-func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
+// inWindow checks if the time is in the group's maintenance window.
+// The maintenance window is the semi-open interval: [windowStart, windowEnd).
+func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, duration time.Duration) (bool, error) {
 	dayOK, err := canUpdateToday(group.ConfigDays, now)
 	if err != nil {
 		return false, trace.Wrap(err, "checking the day of the week")
@@ -51,17 +55,22 @@ func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time
 	if !dayOK {
 		return false, nil
 	}
-	return int(group.ConfigStartHour) == now.Hour(), nil
+
+	// We compute the theoretical window start and end
+	windowStart := now.Truncate(24 * time.Hour).Add(time.Duration(group.ConfigStartHour) * time.Hour)
+	windowEnd := windowStart.Add(duration)
+
+	return !now.Before(windowStart) && now.Before(windowEnd), nil
 }
 
 // rolloutChangedInWindow checks if the rollout got created after the theoretical group start time
-func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) {
+func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time, duration time.Duration) (bool, error) {
 	// If the rollout is older than 24h, we know it did not change during the window
 	if now.Sub(rolloutStart) > 24*time.Hour {
 		return false, nil
 	}
 	// Else we check if the rollout happened in the group window.
-	return inWindow(group, rolloutStart)
+	return inWindow(group, rolloutStart, duration)
 }
 
 func canUpdateToday(allowedDays []string, now time.Time) (bool, error) {
diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go
index 6ed1a4aae049d..fafc5d5ae30d3 100644
--- a/lib/autoupdate/rollout/strategy_haltonerror.go
+++ b/lib/autoupdate/rollout/strategy_haltonerror.go
@@ -35,6 +35,7 @@ const (
 	updateReasonPreviousGroupsNotDone = "previous_groups_not_done"
 	updateReasonUpdateComplete        = "update_complete"
 	updateReasonUpdateInProgress      = "update_in_progress"
+	haltOnErrorWindowDuration         = time.Hour
 )
 
 type haltOnErrorStrategy struct {
@@ -54,7 +55,7 @@ func newHaltOnErrorStrategy(log *slog.Logger) (rolloutStrategy, error) {
 	}, nil
 }
 
-func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
+func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, _ *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
 	// We process every group in order, all the previous groups must be in the DONE state
 	// for the next group to become active. Even if some early groups are not DONE,
 	// later groups might be ACTIVE and need to transition to DONE, so we cannot
@@ -81,7 +82,7 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autou
 			}
 
 			// Check if the rollout got created after the theoretical group start time
-			rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
+			rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), haltOnErrorWindowDuration)
 			if err != nil {
 				setGroupState(group, group.State, updateReasonReconcilerError, now)
 				return err
@@ -149,14 +150,14 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout
 		}
 	}
 
-	return inWindow(group, now)
+	return inWindow(group, now, haltOnErrorWindowDuration)
 }
 
 func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) {
 	// Currently we don't implement status reporting from groups/agents.
 	// So we just wait 60 minutes and consider the maintenance done.
 	// This will change as we introduce agent status report and aggregated agent counts.
-	if group.StartTime.AsTime().Add(time.Hour).Before(now) {
+	if group.StartTime.AsTime().Add(haltOnErrorWindowDuration).Before(now) {
 		return true, updateReasonUpdateComplete
 	}
 	return false, updateReasonUpdateInProgress
diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go
index ee3eb8e80ffca..2f59534ddd7db 100644
--- a/lib/autoupdate/rollout/strategy_haltonerror_test.go
+++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go
@@ -500,7 +500,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
 				State:     0,
 				StartTime: tt.rolloutStartTime,
 			}
-			err := strategy.progressRollout(ctx, status, clock.Now())
+			err := strategy.progressRollout(ctx, nil, status, clock.Now())
 			require.NoError(t, err)
 			// We use require.Equal instead of Elements match because group order matters.
 			// It's not super important for time-based, but is crucial for halt-on-error.
diff --git a/lib/autoupdate/rollout/strategy_test.go b/lib/autoupdate/rollout/strategy_test.go
index 1348716ba6c1d..0711d4043ae9c 100644
--- a/lib/autoupdate/rollout/strategy_test.go
+++ b/lib/autoupdate/rollout/strategy_test.go
@@ -95,11 +95,12 @@ func Test_canUpdateToday(t *testing.T) {
 
 func Test_inWindow(t *testing.T) {
 	tests := []struct {
-		name    string
-		group   *autoupdate.AutoUpdateAgentRolloutStatusGroup
-		now     time.Time
-		want    bool
-		wantErr require.ErrorAssertionFunc
+		name     string
+		group    *autoupdate.AutoUpdateAgentRolloutStatusGroup
+		now      time.Time
+		duration time.Duration
+		want     bool
+		wantErr  require.ErrorAssertionFunc
 	}{
 		{
 			name: "out of window",
@@ -107,9 +108,10 @@ func Test_inWindow(t *testing.T) {
 				ConfigDays:      everyWeekdayButSunday,
 				ConfigStartHour: matchingStartHour,
 			},
-			now:     testSunday,
-			want:    false,
-			wantErr: require.NoError,
+			now:      testSunday,
+			duration: time.Hour,
+			want:     false,
+			wantErr:  require.NoError,
 		},
 		{
 			name: "inside window, wrong hour",
@@ -117,9 +119,10 @@ func Test_inWindow(t *testing.T) {
 				ConfigDays:      everyWeekday,
 				ConfigStartHour: nonMatchingStartHour,
 			},
-			now:     testSunday,
-			want:    false,
-			wantErr: require.NoError,
+			now:      testSunday,
+			duration: time.Hour,
+			want:     false,
+			wantErr:  require.NoError,
 		},
 		{
 			name: "inside window, correct hour",
@@ -127,9 +130,10 @@ func Test_inWindow(t *testing.T) {
 				ConfigDays:      everyWeekday,
 				ConfigStartHour: matchingStartHour,
 			},
-			now:     testSunday,
-			want:    true,
-			wantErr: require.NoError,
+			now:      testSunday,
+			duration: time.Hour,
+			want:     true,
+			wantErr:  require.NoError,
 		},
 		{
 			name: "invalid weekdays",
@@ -137,14 +141,48 @@ func Test_inWindow(t *testing.T) {
 				ConfigDays:      []string{"HelloThereGeneralKenobi"},
 				ConfigStartHour: matchingStartHour,
 			},
-			now:     testSunday,
-			want:    false,
-			wantErr: require.Error,
+			now:      testSunday,
+			duration: time.Hour,
+			want:     false,
+			wantErr:  require.Error,
+		},
+		{
+			name: "short window",
+			group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
+				ConfigDays:      everyWeekday,
+				ConfigStartHour: matchingStartHour,
+			},
+			now:      testSunday,
+			duration: time.Second,
+			want:     false,
+			wantErr:  require.NoError,
+		},
+		{
+			name: "window start time is included",
+			group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
+				ConfigDays:      everyWeekday,
+				ConfigStartHour: matchingStartHour,
+			},
+			now:      testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour) * time.Hour),
+			duration: time.Hour,
+			want:     true,
+			wantErr:  require.NoError,
+		},
+		{
+			name: "window end time is not included",
+			group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
+				ConfigDays:      everyWeekday,
+				ConfigStartHour: matchingStartHour,
+			},
+			now:      testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour+1) * time.Hour),
+			duration: time.Hour,
+			want:     false,
+			wantErr:  require.NoError,
 		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := inWindow(tt.group, tt.now)
+			got, err := inWindow(tt.group, tt.now, tt.duration)
 			tt.wantErr(t, err)
 			require.Equal(t, tt.want, got)
 		})
@@ -205,7 +243,7 @@ func Test_rolloutChangedInWindow(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			// Test execution.
-			result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart)
+			result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart, time.Hour)
 			require.NoError(t, err)
 			require.Equal(t, tt.want, result)
 		})
diff --git a/lib/autoupdate/rollout/strategy_timebased.go b/lib/autoupdate/rollout/strategy_timebased.go
index 5d06adf0e5ace..e4df5c6e23789 100644
--- a/lib/autoupdate/rollout/strategy_timebased.go
+++ b/lib/autoupdate/rollout/strategy_timebased.go
@@ -51,7 +51,13 @@ func newTimeBasedStrategy(log *slog.Logger) (rolloutStrategy, error) {
 	}, nil
 }
 
-func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
+func (h *timeBasedStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
+	windowDuration := spec.GetMaintenanceWindowDuration().AsDuration()
+	// Backward compatibility for resources previously created without duration.
+	if windowDuration == 0 {
+		windowDuration = haltOnErrorWindowDuration
+	}
+
 	// We always process every group regardless of the order.
 	var errs []error
 	for _, group := range status.Groups {
@@ -61,7 +67,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
 			// We start any group unstarted group in window.
 			// Done groups can transition back to active if they enter their maintenance window again.
 			// Some agents might have missed the previous windows and might expected to try again.
-			shouldBeActive, err := inWindow(group, now)
+			shouldBeActive, err := inWindow(group, now, windowDuration)
 			if err != nil {
 				// In time-based rollouts, groups are not dependent.
 				// Failing to transition a group should affect other groups.
@@ -72,7 +78,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
 			}
 
 			// Check if the rollout got created after the theoretical group start time
-			rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
+			rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), windowDuration)
 			if err != nil {
 				setGroupState(group, group.State, updateReasonReconcilerError, now)
 				errs = append(errs, err)
@@ -93,7 +99,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
 		case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE:
 			// The group is currently being updated. We check if the maintenance
 			// is over and if we should transition it to the done state
-			shouldBeActive, err := inWindow(group, now)
+			shouldBeActive, err := inWindow(group, now, windowDuration)
 			if err != nil {
 				// In time-based rollouts, groups are not dependent.
 				// Failing to transition a group should affect other groups.
diff --git a/lib/autoupdate/rollout/strategy_timebased_test.go b/lib/autoupdate/rollout/strategy_timebased_test.go
index 84367f9927c04..6fa6245598a15 100644
--- a/lib/autoupdate/rollout/strategy_timebased_test.go
+++ b/lib/autoupdate/rollout/strategy_timebased_test.go
@@ -25,6 +25,7 @@ import (
 
 	"github.com/jonboulle/clockwork"
 	"github.com/stretchr/testify/require"
+	"google.golang.org/protobuf/types/known/durationpb"
 	"google.golang.org/protobuf/types/known/timestamppb"
 
 	"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
@@ -325,6 +326,11 @@ func Test_progressGroupsTimeBased(t *testing.T) {
 			},
 		},
 	}
+
+	spec := &autoupdate.AutoUpdateAgentRolloutSpec{
+		MaintenanceWindowDuration: durationpb.New(time.Hour),
+	}
+
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			status := &autoupdate.AutoUpdateAgentRolloutStatus{
@@ -332,7 +338,7 @@ func Test_progressGroupsTimeBased(t *testing.T) {
 				State:     0,
 				StartTime: tt.rolloutStartTime,
 			}
-			err := strategy.progressRollout(ctx, status, clock.Now())
+			err := strategy.progressRollout(ctx, spec, status, clock.Now())
 			require.NoError(t, err)
 			// We use require.Equal instead of Elements match because group order matters.
 			// It's not super important for time-based, but is crucial for halt-on-error.