Skip to content

Commit

Permalink
Reduce clock usage + add time and period override in rollout controll…
Browse files Browse the repository at this point in the history
…er (#50634)
  • Loading branch information
hugoShaka authored Dec 31, 2024
1 parent 2467aeb commit afae487
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 105 deletions.
165 changes: 91 additions & 74 deletions api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions api/proto/teleport/autoupdate/v1/autoupdate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ message AutoUpdateAgentRolloutSpec {
}

// AutoUpdateAgentRolloutStatus tracks the current agent rollout status.
// The status is reset if any spec field changes except the mode.
message AutoUpdateAgentRolloutStatus {
repeated AutoUpdateAgentRolloutStatusGroup groups = 1;
AutoUpdateAgentRolloutState state = 2;
Expand All @@ -182,6 +183,11 @@ message AutoUpdateAgentRolloutStatus {
// before the rollout start time and the maintenance window belongs to the previous rollout.
// When the timestamp is nil, the controller will ignore the start time and check and allow groups to activate.
google.protobuf.Timestamp start_time = 3;

// Time override is an optional timestamp making the autoupdate_agent_rollout controller use a specific time instead
// of the system clock when evaluating time-based criteria. This field is used for testing and troubleshooting
// purposes.
google.protobuf.Timestamp time_override = 4;
}

// AutoUpdateAgentRolloutStatusGroup tracks the current agent rollout status of a specific group.
Expand Down
1 change: 1 addition & 0 deletions lib/autoupdate/rollout/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ func (c *Controller) Run(ctx context.Context) error {
ticker := interval.New(config)
defer ticker.Stop()

c.log.InfoContext(ctx, "Starting autoupdate_agent_rollout controller", "period", c.period)
for {
select {
case <-ctx.Done():
Expand Down
18 changes: 15 additions & 3 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ func (r *reconciler) tryReconcile(ctx context.Context) error {

// if there are no existing rollout, we create a new one and set the status
if !rolloutExists {
r.log.DebugContext(ctx, "creating rollout")
rollout, err := update.NewAutoUpdateAgentRollout(newSpec)
rollout.Status = newStatus
if err != nil {
Expand All @@ -175,6 +176,7 @@ func (r *reconciler) tryReconcile(ctx context.Context) error {
return trace.Wrap(err, "creating rollout")
}

r.log.DebugContext(ctx, "updating rollout")
// If there was a previous rollout, we update its spec and status and do an update.
// We don't create a new resource to keep the metadata containing the revision ID.
existingRollout.Spec = newSpec
Expand Down Expand Up @@ -295,6 +297,16 @@ func (r *reconciler) computeStatus(
// compute the group state changes
now := r.clock.Now()

// If timeOverride is set to a non-zero value (we have two potential zeros, go time's zero and timestamppb's zero)
// we use this instead of the clock's time.
if timeOverride := status.GetTimeOverride().AsTime(); !(timeOverride.IsZero() || timeOverride.Unix() == 0) {
r.log.DebugContext(ctx, "reconciling with synthetic time instead of real time",
"time_override", timeOverride,
"real_time", now,
)
now = timeOverride
}

// If this is a new rollout or the rollout has been reset, we create groups from the config
groups := status.GetGroups()
var err error
Expand All @@ -306,7 +318,7 @@ func (r *reconciler) computeStatus(
}
status.Groups = groups

err = r.progressRollout(ctx, newSpec.GetStrategy(), status)
err = r.progressRollout(ctx, newSpec.GetStrategy(), status, now)
// Failing to progress the update is not a hard failure.
// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
if err != nil {
Expand All @@ -322,10 +334,10 @@ func (r *reconciler) computeStatus(
// groups are updated in place.
// If an error is returned, the groups should still be upserted, depending on the strategy,
// failing to update a group might not be fatal (other groups can still progress independently).
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
for _, strategy := range r.rolloutStrategies {
if strategy.name() == strategyName {
return strategy.progressRollout(ctx, status)
return strategy.progressRollout(ctx, status, now)
}
}
return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
return f.strategyName
}

func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
f.calls++
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ const (
// This interface allows us to inject dummy strategies for simpler testing.
type rolloutStrategy interface {
name() string
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus) error
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus, time.Time) error
}

func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
Expand Down
15 changes: 4 additions & 11 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"time"

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"

"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
update "github.com/gravitational/teleport/api/types/autoupdate"
Expand All @@ -39,29 +38,23 @@ const (
)

type haltOnErrorStrategy struct {
log *slog.Logger
clock clockwork.Clock
log *slog.Logger
}

func (h *haltOnErrorStrategy) name() string {
return update.AgentsStrategyHaltOnError
}

func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStrategy, error) {
func newHaltOnErrorStrategy(log *slog.Logger) (rolloutStrategy, error) {
if log == nil {
return nil, trace.BadParameter("missing log")
}
if clock == nil {
return nil, trace.BadParameter("missing clock")
}
return &haltOnErrorStrategy{
log: log.With("strategy", update.AgentsStrategyHaltOnError),
clock: clock,
log: log.With("strategy", update.AgentsStrategyHaltOnError),
}, nil
}

func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
now := h.clock.Now()
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
// We process every group in order, all the previous groups must be in the DONE state
// for the next group to become active. Even if some early groups are not DONE,
// later groups might be ACTIVE and need to transition to DONE, so we cannot
Expand Down
4 changes: 2 additions & 2 deletions lib/autoupdate/rollout/strategy_haltonerror_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ func Test_canStartHaltOnError(t *testing.T) {
func Test_progressGroupsHaltOnError(t *testing.T) {
clock := clockwork.NewFakeClockAt(testSunday)
log := utils.NewSlogLoggerForTests()
strategy, err := newHaltOnErrorStrategy(log, clock)
strategy, err := newHaltOnErrorStrategy(log)
require.NoError(t, err)

fewMinutesAgo := clock.Now().Add(-5 * time.Minute)
Expand Down Expand Up @@ -500,7 +500,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status)
err := strategy.progressRollout(ctx, status, clock.Now())
require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down
16 changes: 5 additions & 11 deletions lib/autoupdate/rollout/strategy_timebased.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ package rollout
import (
"context"
"log/slog"
"time"

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"

"github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1"
update "github.com/gravitational/teleport/api/types/autoupdate"
Expand All @@ -35,29 +35,23 @@ const (
)

type timeBasedStrategy struct {
log *slog.Logger
clock clockwork.Clock
log *slog.Logger
}

func (h *timeBasedStrategy) name() string {
return update.AgentsStrategyTimeBased
}

func newTimeBasedStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStrategy, error) {
func newTimeBasedStrategy(log *slog.Logger) (rolloutStrategy, error) {
if log == nil {
return nil, trace.BadParameter("missing log")
}
if clock == nil {
return nil, trace.BadParameter("missing clock")
}
return &timeBasedStrategy{
log: log.With("strategy", update.AgentsStrategyTimeBased),
clock: clock,
log: log.With("strategy", update.AgentsStrategyTimeBased),
}, nil
}

func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
now := h.clock.Now()
func (h *timeBasedStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus, now time.Time) error {
// We always process every group regardless of the order.
var errs []error
for _, group := range status.Groups {
Expand Down
4 changes: 2 additions & 2 deletions lib/autoupdate/rollout/strategy_timebased_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import (
func Test_progressGroupsTimeBased(t *testing.T) {
clock := clockwork.NewFakeClockAt(testSunday)
log := utils.NewSlogLoggerForTests()
strategy, err := newTimeBasedStrategy(log, clock)
strategy, err := newTimeBasedStrategy(log)
require.NoError(t, err)

groupName := "test-group"
Expand Down Expand Up @@ -332,7 +332,7 @@ func Test_progressGroupsTimeBased(t *testing.T) {
State: 0,
StartTime: tt.rolloutStartTime,
}
err := strategy.progressRollout(ctx, status)
err := strategy.progressRollout(ctx, status, clock.Now())
require.NoError(t, err)
// We use require.Equal instead of Elements match because group order matters.
// It's not super important for time-based, but is crucial for halt-on-error.
Expand Down
8 changes: 8 additions & 0 deletions lib/config/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -2753,6 +2753,14 @@ func Configure(clf *CommandLineFlags, cfg *servicecfg.Config, legacyAppFlags boo
cfg.Proxy.QUICProxyPeering = true
}

if rawPeriod := os.Getenv("TELEPORT_UNSTABLE_AGENT_ROLLOUT_SYNC_PERIOD"); rawPeriod != "" {
period, err := time.ParseDuration(rawPeriod)
if err != nil {
return trace.Wrap(err, "invalid agent rollout period %q", rawPeriod)
}
cfg.Auth.AgentRolloutControllerSyncPeriod = period
}

return nil
}

Expand Down

0 comments on commit afae487

Please sign in to comment.