Skip to content

Commit

Permalink
autoupdate rollout: honour the maintenance window duration (#50745)
Browse files Browse the repository at this point in the history
* autoupdate rollout: honour the maintenance window duration

* Update lib/autoupdate/rollout/reconciler.go

Co-authored-by: Bartosz Leper <bartosz.leper@goteleport.com>

* Address feedback

* Update lib/autoupdate/rollout/strategy.go

---------

Co-authored-by: Bartosz Leper <bartosz.leper@goteleport.com>
  • Loading branch information
hugoShaka and bl-nero authored Jan 8, 2025
1 parent 486176a commit a0b526b
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 45 deletions.
21 changes: 11 additions & 10 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,12 @@ func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAge
}

return &autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: version.GetStartVersion(),
TargetVersion: version.GetTargetVersion(),
Schedule: version.GetSchedule(),
AutoupdateMode: mode,
Strategy: strategy,
StartVersion: version.GetStartVersion(),
TargetVersion: version.GetTargetVersion(),
Schedule: version.GetSchedule(),
AutoupdateMode: mode,
Strategy: strategy,
MaintenanceWindowDuration: config.GetMaintenanceWindowDuration(),
}, nil

}
Expand Down Expand Up @@ -318,7 +319,7 @@ func (r *reconciler) computeStatus(
}
status.Groups = groups

err = r.progressRollout(ctx, newSpec.GetStrategy(), status, now)
err = r.progressRollout(ctx, newSpec, status, now)
// Failing to progress the update is not a hard failure.
// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
if err != nil {
Expand All @@ -334,13 +335,13 @@ func (r *reconciler) computeStatus(
// groups are updated in place.
// If an error is returned, the groups should still be upserted, depending on the strategy,
// failing to update a group might not be fatal (other groups can still progress independently).
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (r *reconciler) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
for _, strategy := range r.rolloutStrategies {
if strategy.name() == strategyName {
return strategy.progressRollout(ctx, status, now)
if strategy.name() == spec.GetStrategy() {
return strategy.progressRollout(ctx, spec, status, now)
}
}
return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
return trace.NotImplemented("rollout strategy %q not implemented", spec.GetStrategy())
}

// makeGroupStatus creates the autoupdate_agent_rollout.status.groups based on the autoupdate_config.
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
return f.strategyName
}

func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
f.calls++
return nil
}
Expand Down
19 changes: 14 additions & 5 deletions lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,28 +40,37 @@ const (
// This interface allows us to inject dummy strategies for simpler testing.
type rolloutStrategy interface {
name() string
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
// progressRollout takes the new rollout spec, existing rollout status and current time.
// It updates the status resource in-place to progress the rollout to the next step if possible/needed.
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutSpec, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
}

func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
// inWindow checks if the time is in the group's maintenance window.
// The maintenance window is the semi-open interval: [windowStart, windowEnd).
func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, duration time.Duration) (bool, error) {
dayOK, err := canUpdateToday(group.ConfigDays, now)
if err != nil {
return false, trace.Wrap(err, "checking the day of the week")
}
if !dayOK {
return false, nil
}
return int(group.ConfigStartHour) == now.Hour(), nil

// We compute the theoretical window start and end
windowStart := now.Truncate(24 * time.Hour).Add(time.Duration(group.ConfigStartHour) * time.Hour)
windowEnd := windowStart.Add(duration)

return !now.Before(windowStart) && now.Before(windowEnd), nil
}

// rolloutChangedInWindow checks if the rollout got created after the theoretical group start time
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) {
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time, duration time.Duration) (bool, error) {
// If the rollout is older than 24h, we know it did not change during the window
if now.Sub(rolloutStart) > 24*time.Hour {
return false, nil
}
// Else we check if the rollout happened in the group window.
return inWindow(group, rolloutStart)
return inWindow(group, rolloutStart, duration)
}

func canUpdateToday(allowedDays []string, now time.Time) (bool, error) {
Expand Down
9 changes: 5 additions & 4 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const (
updateReasonPreviousGroupsNotDone = "previous_groups_not_done"
updateReasonUpdateComplete = "update_complete"
updateReasonUpdateInProgress = "update_in_progress"
haltOnErrorWindowDuration = time.Hour
)

type haltOnErrorStrategy struct {
Expand All @@ -54,7 +55,7 @@ func newHaltOnErrorStrategy(log *slog.Logger) (rolloutStrategy, error) {
}, nil
}

func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, _ *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
// We process every group in order, all the previous groups must be in the DONE state
// for the next group to become active. Even if some early groups are not DONE,
// later groups might be ACTIVE and need to transition to DONE, so we cannot
Expand All @@ -81,7 +82,7 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autou
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), haltOnErrorWindowDuration)
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
Expand Down Expand Up @@ -149,14 +150,14 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout
}
}

return inWindow(group, now)
return inWindow(group, now, haltOnErrorWindowDuration)
}

func isDoneHaltOnError(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, string) {
// Currently we don't implement status reporting from groups/agents.
// So we just wait 60 minutes and consider the maintenance done.
// This will change as we introduce agent status report and aggregated agent counts.
if group.StartTime.AsTime().Add(time.Hour).Before(now) {
if group.StartTime.AsTime().Add(haltOnErrorWindowDuration).Before(now) {
return true, updateReasonUpdateComplete
}
return false, updateReasonUpdateInProgress
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/strategy_haltonerror_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status, clock.Now())
err := strategy.progressRollout(ctx, nil, status, clock.Now())
require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down
76 changes: 57 additions & 19 deletions lib/autoupdate/rollout/strategy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,56 +95,94 @@ func Test_canUpdateToday(t *testing.T) {

func Test_inWindow(t *testing.T) {
tests := []struct {
name string
group *autoupdate.AutoUpdateAgentRolloutStatusGroup
now time.Time
want bool
wantErr require.ErrorAssertionFunc
name string
group *autoupdate.AutoUpdateAgentRolloutStatusGroup
now time.Time
duration time.Duration
want bool
wantErr require.ErrorAssertionFunc
}{
{
name: "out of window",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekdayButSunday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
{
name: "inside window, wrong hour",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: nonMatchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
{
name: "inside window, correct hour",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: true,
wantErr: require.NoError,
now: testSunday,
duration: time.Hour,
want: true,
wantErr: require.NoError,
},
{
name: "invalid weekdays",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: []string{"HelloThereGeneralKenobi"},
ConfigStartHour: matchingStartHour,
},
now: testSunday,
want: false,
wantErr: require.Error,
now: testSunday,
duration: time.Hour,
want: false,
wantErr: require.Error,
},
{
name: "short window",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday,
duration: time.Second,
want: false,
wantErr: require.NoError,
},
{
name: "window start time is included",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour) * time.Hour),
duration: time.Hour,
want: true,
wantErr: require.NoError,
},
{
name: "window end time is not included",
group: &autoupdate.AutoUpdateAgentRolloutStatusGroup{
ConfigDays: everyWeekday,
ConfigStartHour: matchingStartHour,
},
now: testSunday.Truncate(24 * time.Hour).Add(time.Duration(matchingStartHour+1) * time.Hour),
duration: time.Hour,
want: false,
wantErr: require.NoError,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := inWindow(tt.group, tt.now)
got, err := inWindow(tt.group, tt.now, tt.duration)
tt.wantErr(t, err)
require.Equal(t, tt.want, got)
})
Expand Down Expand Up @@ -205,7 +243,7 @@ func Test_rolloutChangedInWindow(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Test execution.
result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart)
result, err := rolloutChangedInWindow(group, tt.now, tt.rolloutStart, time.Hour)
require.NoError(t, err)
require.Equal(t, tt.want, result)
})
Expand Down
14 changes: 10 additions & 4 deletions lib/autoupdate/rollout/strategy_timebased.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,13 @@ func newTimeBasedStrategy(log *slog.Logger) (rolloutStrategy, error) {
}, nil
}

func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
func (h *timeBasedStrategy) progressRollout(ctx context.Context, spec *autoupdate.AutoUpdateAgentRolloutSpec, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
windowDuration := spec.GetMaintenanceWindowDuration().AsDuration()
// Backward compatibility for resources previously created without duration.
if windowDuration == 0 {
windowDuration = haltOnErrorWindowDuration
}

// We always process every group regardless of the order.
var errs []error
for _, group := range status.Groups {
Expand All @@ -61,7 +67,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
// We start any group unstarted group in window.
// Done groups can transition back to active if they enter their maintenance window again.
// Some agents might have missed the previous windows and might expected to try again.
shouldBeActive, err := inWindow(group, now)
shouldBeActive, err := inWindow(group, now, windowDuration)
if err != nil {
// In time-based rollouts, groups are not dependent.
// Failing to transition a group should affect other groups.
Expand All @@ -72,7 +78,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime(), windowDuration)
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
errs = append(errs, err)
Expand All @@ -93,7 +99,7 @@ func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupd
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE:
// The group is currently being updated. We check if the maintenance
// is over and if we should transition it to the done state
shouldBeActive, err := inWindow(group, now)
shouldBeActive, err := inWindow(group, now, windowDuration)
if err != nil {
// In time-based rollouts, groups are not dependent.
// Failing to transition a group should affect other groups.
Expand Down
8 changes: 7 additions & 1 deletion lib/autoupdate/rollout/strategy_timebased_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/jonboulle/clockwork"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
Expand Down Expand Up @@ -325,14 +326,19 @@ func Test_progressGroupsTimeBased(t *testing.T) {
},
},
}

spec := &autoupdate.AutoUpdateAgentRolloutSpec{
MaintenanceWindowDuration: durationpb.New(time.Hour),
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
status := &autoupdate.AutoUpdateAgentRolloutStatus{
Groups: tt.initialState,
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status, clock.Now())
err := strategy.progressRollout(ctx, spec, status, clock.Now())
require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down

0 comments on commit a0b526b

Please sign in to comment.