diff --git a/lib/autoupdate/rolloutcontroller/client.go b/lib/autoupdate/rollout/client.go similarity index 98% rename from lib/autoupdate/rolloutcontroller/client.go rename to lib/autoupdate/rollout/client.go index 4dead0f9dee19..16b9f5197fb2b 100644 --- a/lib/autoupdate/rolloutcontroller/client.go +++ b/lib/autoupdate/rollout/client.go @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -package rolloutcontroller +package rollout import ( "context" diff --git a/lib/autoupdate/rolloutcontroller/client_test.go b/lib/autoupdate/rollout/client_test.go similarity index 98% rename from lib/autoupdate/rolloutcontroller/client_test.go rename to lib/autoupdate/rollout/client_test.go index ba204ffb77db3..fc0e49bd04763 100644 --- a/lib/autoupdate/rolloutcontroller/client_test.go +++ b/lib/autoupdate/rollout/client_test.go @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -package rolloutcontroller +package rollout import ( "context" @@ -29,7 +29,7 @@ import ( ) // mockClient is a mock implementation if the Client interface for testing purposes. -// This is used to precisely check which calls are made by the Reconciler during tests. +// This is used to precisely check which calls are made by the reconciler during tests. // Use newMockClient to create one from stubs. Once the test is over, you must call // mockClient.checkIfEmpty to validate all expected calls were made. type mockClient struct { diff --git a/lib/autoupdate/rollout/controller.go b/lib/autoupdate/rollout/controller.go new file mode 100644 index 0000000000000..53a3741f8050a --- /dev/null +++ b/lib/autoupdate/rollout/controller.go @@ -0,0 +1,106 @@ +/* + * Teleport + * Copyright (C) 2024 Gravitational, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package rollout + +import ( + "context" + "log/slog" + "time" + + "github.com/gravitational/trace" + "github.com/jonboulle/clockwork" + + "github.com/gravitational/teleport/api/utils/retryutils" + "github.com/gravitational/teleport/lib/utils/interval" +) + +const ( + reconcilerPeriod = time.Minute +) + +// Controller wakes up every minute to reconcile the autoupdate_agent_rollout resource. +// See the reconciler godoc for more details about the reconciliation process. +// We currently wake up every minute, in the future we might decide to also watch for events +// (from autoupdate_config and autoupdate_version changefeed) to react faster. +type Controller struct { + // TODO(hugoShaka) add prometheus metrics describing the reconciliation status + reconciler reconciler + clock clockwork.Clock + log *slog.Logger +} + +// NewController creates a new Controller for the autoupdate_agent_rollout kind. +func NewController(client Client, log *slog.Logger, clock clockwork.Clock) (*Controller, error) { + if client == nil { + return nil, trace.BadParameter("missing client") + } + if log == nil { + return nil, trace.BadParameter("missing log") + } + if clock == nil { + return nil, trace.BadParameter("missing clock") + } + return &Controller{ + clock: clock, + log: log, + reconciler: reconciler{ + clt: client, + log: log, + }, + }, nil +} + +// Run the autoupdate_agent_rollout controller. This function returns only when its context is canceled. +func (c *Controller) Run(ctx context.Context) error { + config := interval.Config{ + Duration: reconcilerPeriod, + FirstDuration: reconcilerPeriod, + Jitter: retryutils.SeventhJitter, + Clock: c.clock, + } + ticker := interval.New(config) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + c.log.InfoContext(ctx, "Stopping autoupdate_agent_rollout controller", "reason", ctx.Err()) + return ctx.Err() + case <-ticker.Next(): + c.log.DebugContext(ctx, "Reconciling autoupdate_agent_rollout") + if err := c.tryAndCatch(ctx); err != nil { + c.log.ErrorContext(ctx, "Failed to reconcile autoudpate_agent_controller", "error", err) + } + } + } +} + +// tryAndCatch tries to run the controller reconciliation logic and recovers from potential panic by converting them +// into errors. This ensures that a critical bug in the reconciler cannot bring down the whole Teleport cluster. +func (c *Controller) tryAndCatch(ctx context.Context) (err error) { + // If something terribly bad happens during the reconciliation, we recover and return an error + defer func() { + if r := recover(); r != nil { + c.log.ErrorContext(ctx, "Recovered from panic in the autoupdate_agent_rollout controller", "panic", r) + err = trace.NewAggregate(err, trace.Errorf("Panic recovered during reconciliation: %v", r)) + } + }() + err = trace.Wrap(c.reconciler.reconcile(ctx)) + return +} diff --git a/lib/autoupdate/rolloutcontroller/reconciler.go b/lib/autoupdate/rollout/reconciler.go similarity index 95% rename from lib/autoupdate/rolloutcontroller/reconciler.go rename to lib/autoupdate/rollout/reconciler.go index 78989c4ec4a6b..2fc04634c72f9 100644 --- a/lib/autoupdate/rolloutcontroller/reconciler.go +++ b/lib/autoupdate/rollout/reconciler.go @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -package rolloutcontroller +package rollout import ( "context" @@ -37,11 +37,11 @@ const ( maxConflictRetry = 3 ) -// Reconciler reconciles the AutoUpdateAgentRollout singleton based on the content of the AutoUpdateVersion and +// reconciler reconciles the AutoUpdateAgentRollout singleton based on the content of the AutoUpdateVersion and // AutoUpdateConfig singletons. This reconciler is not based on the services.GenericReconciler because: // - we reconcile 2 resources with one // - both input and output are singletons, we don't need the multi resource logic nor stream/paginated APIs -type Reconciler struct { +type reconciler struct { clt Client log *slog.Logger @@ -49,9 +49,9 @@ type Reconciler struct { mutex sync.Mutex } -// Reconcile the AutoUpdateAgentRollout singleton. The reconciliation can fail because of a conflict (multiple auths +// reconcile the AutoUpdateAgentRollout singleton. The reconciliation can fail because of a conflict (multiple auths // are racing), in this case we retry the reconciliation immediately. -func (r *Reconciler) Reconcile(ctx context.Context) error { +func (r *reconciler) reconcile(ctx context.Context) error { r.mutex.Lock() defer r.mutex.Unlock() @@ -88,7 +88,7 @@ func (r *Reconciler) Reconcile(ctx context.Context) error { // The creation/update/deletion can fail with a trace.CompareFailedError or trace.NotFoundError // if the resource change while we were computing it. // The caller must handle those error and retry the reconciliation. -func (r *Reconciler) tryReconcile(ctx context.Context) error { +func (r *reconciler) tryReconcile(ctx context.Context) error { // get autoupdate_config var config *autoupdate.AutoUpdateConfig if c, err := r.clt.GetAutoUpdateConfig(ctx); err == nil { @@ -171,7 +171,7 @@ func (r *Reconciler) tryReconcile(ctx context.Context) error { return trace.Wrap(err, "updating rollout") } -func (r *Reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAgents, version *autoupdate.AutoUpdateVersionSpecAgents) (*autoupdate.AutoUpdateAgentRolloutSpec, error) { +func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAgents, version *autoupdate.AutoUpdateVersionSpecAgents) (*autoupdate.AutoUpdateAgentRolloutSpec, error) { // reconcile mode mode, err := getMode(config.GetMode(), version.GetMode()) if err != nil { diff --git a/lib/autoupdate/rolloutcontroller/reconciler_test.go b/lib/autoupdate/rollout/reconciler_test.go similarity index 96% rename from lib/autoupdate/rolloutcontroller/reconciler_test.go rename to lib/autoupdate/rollout/reconciler_test.go index 340451d8da46d..4d24563f7b32f 100644 --- a/lib/autoupdate/rolloutcontroller/reconciler_test.go +++ b/lib/autoupdate/rollout/reconciler_test.go @@ -16,7 +16,7 @@ * along with this program. If not, see . */ -package rolloutcontroller +package rollout import ( "context" @@ -307,7 +307,7 @@ func TestTryReconcile(t *testing.T) { // Test execution: Running the reconciliation - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } @@ -375,13 +375,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.NoError(t, reconciler.Reconcile(ctx)) + require.NoError(t, reconciler.reconcile(ctx)) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -397,13 +397,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.NoError(t, reconciler.Reconcile(ctx)) + require.NoError(t, reconciler.reconcile(ctx)) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -421,13 +421,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.NoError(t, reconciler.Reconcile(ctx)) + require.NoError(t, reconciler.reconcile(ctx)) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -461,13 +461,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.NoError(t, reconciler.Reconcile(ctx)) + require.NoError(t, reconciler.reconcile(ctx)) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -499,13 +499,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.NoError(t, reconciler.Reconcile(ctx)) + require.NoError(t, reconciler.reconcile(ctx)) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -523,13 +523,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.ErrorContains(t, reconciler.Reconcile(ctx), "the DB fell on the floor") + require.ErrorContains(t, reconciler.reconcile(ctx), "the DB fell on the floor") // Test validation: check that all the expected calls were received client.checkIfEmpty(t) @@ -553,13 +553,13 @@ func TestReconciler_Reconcile(t *testing.T) { } client := newMockClient(t, stubs) - reconciler := &Reconciler{ + reconciler := &reconciler{ clt: client, log: log, } // Test execution: run the reconciliation loop - require.ErrorContains(t, reconciler.Reconcile(cancelableCtx), "canceled") + require.ErrorIs(t, reconciler.reconcile(cancelableCtx), context.Canceled) // Test validation: check that all the expected calls were received client.checkIfEmpty(t) diff --git a/lib/service/service.go b/lib/service/service.go index 3167fb2bb9af3..ef30805a0fa7d 100644 --- a/lib/service/service.go +++ b/lib/service/service.go @@ -98,6 +98,7 @@ import ( "github.com/gravitational/teleport/lib/auth/storage" "github.com/gravitational/teleport/lib/authz" "github.com/gravitational/teleport/lib/automaticupgrades" + "github.com/gravitational/teleport/lib/autoupdate/rollout" "github.com/gravitational/teleport/lib/backend" "github.com/gravitational/teleport/lib/backend/dynamo" _ "github.com/gravitational/teleport/lib/backend/etcdbk" @@ -2430,6 +2431,14 @@ func (process *TeleportProcess) initAuthService() error { return trace.Wrap(spiffeFedSyncer.Run(process.GracefulExitContext()), "running SPIFFEFederation Syncer") }) + agentRolloutController, err := rollout.NewController(authServer, logger, process.Clock) + if err != nil { + return trace.Wrap(err, "creating the rollout controller") + } + process.RegisterFunc("auth.autoupdate_agent_rollout_controller", func() error { + return trace.Wrap(agentRolloutController.Run(process.GracefulExitContext()), "running autoupdate_agent_rollout controller") + }) + process.RegisterFunc("auth.server_info", func() error { return trace.Wrap(auth.ReconcileServerInfos(process.GracefulExitContext(), authServer)) }) diff --git a/lib/service/service_test.go b/lib/service/service_test.go index a9308e0503a4d..0ff48c3055395 100644 --- a/lib/service/service_test.go +++ b/lib/service/service_test.go @@ -47,7 +47,9 @@ import ( "github.com/gravitational/teleport" "github.com/gravitational/teleport/api/breaker" + autoupdatepb "github.com/gravitational/teleport/api/gen/proto/go/teleport/autoupdate/v1" "github.com/gravitational/teleport/api/types" + autoupdate "github.com/gravitational/teleport/api/types/autoupdate" apiutils "github.com/gravitational/teleport/api/utils" "github.com/gravitational/teleport/entitlements" "github.com/gravitational/teleport/lib" @@ -1845,3 +1847,76 @@ func TestInitDatabaseService(t *testing.T) { }) } } + +// TestAgentRolloutController validates that the agent rollout controller is started +// when we run the Auth Service. It does so by creating a dummy autoupdate_version resource +// and checking that the corresponding autoupdate_agent_rollout resource is created by the auth. +// If you want to test the reconciliation logic, add tests to the rolloutcontroller package instead. +func TestAgentRolloutController(t *testing.T) { + t.Parallel() + + // Test setup: create a Teleport Auth config + fakeClock := clockwork.NewFakeClock() + + dataDir := t.TempDir() + + cfg := servicecfg.MakeDefaultConfig() + cfg.Clock = fakeClock + cfg.DataDir = dataDir + cfg.SetAuthServerAddress(utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"}) + cfg.Auth.Enabled = true + cfg.Proxy.Enabled = false + cfg.SSH.Enabled = false + cfg.DebugService.Enabled = false + cfg.Auth.StorageConfig.Params["path"] = dataDir + cfg.Auth.ListenAddr = utils.NetAddr{AddrNetwork: "tcp", Addr: "127.0.0.1:0"} + cfg.CircuitBreakerConfig = breaker.NoopBreakerConfig() + + process, err := NewTeleport(cfg) + require.NoError(t, err) + + // Test setup: start the Teleport auth and wait for it to beocme ready + require.NoError(t, process.Start()) + + // Test setup: wait for every service to start + ctx, cancel := context.WithTimeout(process.ExitContext(), 30*time.Second) + defer cancel() + for _, eventName := range []string{AuthTLSReady, InstanceReady} { + _, err := process.WaitForEvent(ctx, eventName) + require.NoError(t, err) + } + + // Test cleanup: close the Teleport process and wait for every service to exist before returning. + // This ensures that a service will not make the test fail by writing a file to the temporary directory while it's + // being removed. + t.Cleanup(func() { + require.NoError(t, process.Close()) + require.NoError(t, process.Wait()) + }) + + // Test execution: create the autoupdate_version resource + authServer := process.GetAuthServer() + version, err := autoupdate.NewAutoUpdateVersion(&autoupdatepb.AutoUpdateVersionSpec{ + Agents: &autoupdatepb.AutoUpdateVersionSpecAgents{ + StartVersion: "1.2.3", + TargetVersion: "1.2.4", + Schedule: autoupdate.AgentsScheduleImmediate, + Mode: autoupdate.AgentsUpdateModeEnabled, + }, + }) + require.NoError(t, err) + version, err = authServer.CreateAutoUpdateVersion(ctx, version) + require.NoError(t, err) + + // Test execution: advance clock to trigger a reconciliation + fakeClock.Advance(2 * time.Minute) + + // Test validation: check that a new autoupdate_agent_rollout config was created + require.Eventually(t, func() bool { + rollout, err := authServer.GetAutoUpdateAgentRollout(ctx) + if err != nil { + return false + } + return rollout.Spec.GetTargetVersion() == version.Spec.GetAgents().GetTargetVersion() + }, time.Second, 10*time.Millisecond) +}