From 0e6830f39377ddc8acf76e76c4a57e3e6cb37c46 Mon Sep 17 00:00:00 2001 From: Jernej Kos Date: Tue, 23 Apr 2024 15:00:13 +0200 Subject: [PATCH 1/2] go/upgrade: Support upgrades without stopping the node --- .changelog/5659.feature.md | 1 + go/oasis-test-runner/scenario/e2e/upgrade.go | 60 ++++++++++++-------- go/upgrade/migrations/consensus_240.go | 5 ++ go/upgrade/migrations/dummy.go | 4 ++ go/upgrade/migrations/empty.go | 4 ++ go/upgrade/migrations/migrations.go | 3 + go/upgrade/upgrade.go | 26 ++++++++- 7 files changed, 76 insertions(+), 27 deletions(-) create mode 100644 .changelog/5659.feature.md diff --git a/.changelog/5659.feature.md b/.changelog/5659.feature.md new file mode 100644 index 00000000000..3bfadbacdfa --- /dev/null +++ b/.changelog/5659.feature.md @@ -0,0 +1 @@ +go/upgrade: Support upgrades without stopping the node diff --git a/go/oasis-test-runner/scenario/e2e/upgrade.go b/go/oasis-test-runner/scenario/e2e/upgrade.go index 892cb0c1c77..adb586ec969 100644 --- a/go/oasis-test-runner/scenario/e2e/upgrade.go +++ b/go/oasis-test-runner/scenario/e2e/upgrade.go @@ -169,11 +169,11 @@ func (c *upgrade240Checker) PostUpgradeFn(ctx context.Context, ctrl *oasis.Contr var ( // NodeUpgradeDummy is the node upgrade dummy scenario. - NodeUpgradeDummy scenario.Scenario = newNodeUpgradeImpl(migrations.DummyUpgradeHandler, &dummyUpgradeChecker{}) + NodeUpgradeDummy scenario.Scenario = newNodeUpgradeImpl(migrations.DummyUpgradeHandler, &dummyUpgradeChecker{}, true) // NodeUpgradeEmpty is the empty node upgrade scenario. - NodeUpgradeEmpty scenario.Scenario = newNodeUpgradeImpl(migrations.EmptyHandler, &noOpUpgradeChecker{}) + NodeUpgradeEmpty scenario.Scenario = newNodeUpgradeImpl(migrations.EmptyHandler, &noOpUpgradeChecker{}, false) // NodeUpgradeConsensus240 is the node upgrade scenario for migrating to consensus 24.0. 
- NodeUpgradeConsensus240 scenario.Scenario = newNodeUpgradeImpl(migrations.Consensus240, &upgrade240Checker{}) + NodeUpgradeConsensus240 scenario.Scenario = newNodeUpgradeImpl(migrations.Consensus240, &upgrade240Checker{}, false) malformedDescriptor = []byte(`{ "v": 1, @@ -202,6 +202,7 @@ type nodeUpgradeImpl struct { handlerName upgrade.HandlerName upgradeChecker upgradeChecker + needsRestart bool } func (sc *nodeUpgradeImpl) writeDescriptor(name string, content []byte) (string, error) { @@ -253,11 +254,12 @@ func (sc *nodeUpgradeImpl) restart(ctx context.Context, wait bool) error { } } -func newNodeUpgradeImpl(handlerName upgrade.HandlerName, upgradeChecker upgradeChecker) scenario.Scenario { +func newNodeUpgradeImpl(handlerName upgrade.HandlerName, upgradeChecker upgradeChecker, needsRestart bool) scenario.Scenario { sc := &nodeUpgradeImpl{ Scenario: *NewScenario("node-upgrade-" + string(handlerName)), handlerName: handlerName, upgradeChecker: upgradeChecker, + needsRestart: needsRestart, } return sc } @@ -267,6 +269,7 @@ func (sc *nodeUpgradeImpl) Clone() scenario.Scenario { Scenario: *sc.Scenario.Clone().(*Scenario), handlerName: sc.handlerName, upgradeChecker: sc.upgradeChecker, + needsRestart: sc.needsRestart, } } @@ -280,7 +283,6 @@ func (sc *nodeUpgradeImpl) Fixture() (*oasis.NetworkFixture, error) { Network: oasis.NetworkCfg{ NodeBinary: f.Network.NodeBinary, DefaultLogWatcherHandlerFactories: []log.WatcherHandlerFactory{ - oasis.LogAssertUpgradeStartup(), oasis.LogAssertUpgradeConsensus(), }, }, @@ -297,6 +299,12 @@ func (sc *nodeUpgradeImpl) Fixture() (*oasis.NetworkFixture, error) { Seeds: []oasis.SeedFixture{{}}, } + if sc.needsRestart { + ff.Network.DefaultLogWatcherHandlerFactories = append(ff.Network.DefaultLogWatcherHandlerFactories, + oasis.LogAssertUpgradeStartup(), + ) + } + ff.Network.SetMockEpoch() ff.Network.SetInsecureBeacon() @@ -445,27 +453,29 @@ func (sc *nodeUpgradeImpl) Run(ctx context.Context, childEnv *env.Env) error { / return err 
} - sc.Logger.Info("restarting network") - errCh := make(chan error, len(sc.Net.Validators())) - var group sync.WaitGroup - for i, val := range sc.Net.Validators() { - group.Add(1) - go func(i int, val *oasis.Validator) { - defer group.Done() - sc.Logger.Debug("waiting for validator to exit", "num", i) - <-val.Exit() - sc.Logger.Debug("restarting validator", "num", i) - if restartError := val.Restart(ctx); err != nil { - errCh <- restartError - } - }(i, val) - } + if sc.needsRestart { + sc.Logger.Info("restarting network") + errCh := make(chan error, len(sc.Net.Validators())) + var group sync.WaitGroup + for i, val := range sc.Net.Validators() { + group.Add(1) + go func(i int, val *oasis.Validator) { + defer group.Done() + sc.Logger.Debug("waiting for validator to exit", "num", i) + <-val.Exit() + sc.Logger.Debug("restarting validator", "num", i) + if restartError := val.Restart(ctx); restartError != nil { + errCh <- restartError + } + }(i, val) + } - group.Wait() - select { - case err = <-errCh: - return fmt.Errorf("can't restart upgraded validator for upgrade test: %w", err) - default: + group.Wait() + select { + case err = <-errCh: + return fmt.Errorf("can't restart upgraded validator for upgrade test: %w", err) + default: + } } sc.Logger.Info("waiting for network to come back up") diff --git a/go/upgrade/migrations/consensus_240.go b/go/upgrade/migrations/consensus_240.go index 9234f8ae38b..facdfba6c4d 100644 --- a/go/upgrade/migrations/consensus_240.go +++ b/go/upgrade/migrations/consensus_240.go @@ -25,6 +25,11 @@ var _ Handler = (*Handler240)(nil) // from version 23.0.x to 24.0.0. type Handler240 struct{} +// HasStartupUpgrade implements Handler. +func (h *Handler240) HasStartupUpgrade() bool { + return false +} + // StartupUpgrade implements Handler. 
func (h *Handler240) StartupUpgrade() error { return nil diff --git a/go/upgrade/migrations/dummy.go b/go/upgrade/migrations/dummy.go index 90343797895..b57c7b0754c 100644 --- a/go/upgrade/migrations/dummy.go +++ b/go/upgrade/migrations/dummy.go @@ -38,6 +38,10 @@ func init() { type dummyMigrationHandler struct{} +func (th *dummyMigrationHandler) HasStartupUpgrade() bool { + return true +} + func (th *dummyMigrationHandler) StartupUpgrade() error { return nil } diff --git a/go/upgrade/migrations/empty.go b/go/upgrade/migrations/empty.go index f0c8f59f811..c1be4af41b4 100644 --- a/go/upgrade/migrations/empty.go +++ b/go/upgrade/migrations/empty.go @@ -10,6 +10,10 @@ var _ Handler = (*emptyHandler)(nil) type emptyHandler struct{} +func (th *emptyHandler) HasStartupUpgrade() bool { + return false +} + func (th *emptyHandler) StartupUpgrade() error { // Nothing to do. return nil diff --git a/go/upgrade/migrations/migrations.go b/go/upgrade/migrations/migrations.go index e788f98d2d6..e9a82d27125 100644 --- a/go/upgrade/migrations/migrations.go +++ b/go/upgrade/migrations/migrations.go @@ -23,6 +23,9 @@ var ( // Handler is the interface used by migration handlers. type Handler interface { + // HasStartupUpgrade returns true iff the handler requires a startup upgrade. + HasStartupUpgrade() bool + // StartupUpgrade is called by the upgrade manager to perform // the node startup portion of the upgrade. StartupUpgrade() error diff --git a/go/upgrade/upgrade.go b/go/upgrade/upgrade.go index f6e9883e74d..96756ac27fe 100644 --- a/go/upgrade/upgrade.go +++ b/go/upgrade/upgrade.go @@ -279,8 +279,30 @@ func (u *upgradeManager) ConsensusUpgrade(privateCtx interface{}, currentEpoch b if err := u.flushDescriptorLocked(); err != nil { return err } - u.shouldStop = true // Ensure we really stop before proceeding. - return api.ErrStopForUpgrade + // Check if we can proceed in place (e.g. without a restart). 
+ u.shouldStop = func() bool { + if err := pu.Descriptor.EnsureCompatible(); err != nil { + // Not compatible, we must stop. + return true + } + // Check if the migration handler has a startup stage. + handler, err := migrations.GetHandler(pu.Descriptor.Handler) + if err != nil { + // Handler not available, we must stop. + return true + } + if handler.HasStartupUpgrade() { + // Handler has a startup upgrade, we must stop. + return true + } + return false + }() + if u.shouldStop { + return api.ErrStopForUpgrade + } + // We can continue with the upgrade in place. + u.logger.Info("skipping node restart as no startup upgrade stage needed") + pu.PushStage(api.UpgradeStageStartup) } // If we're already past the upgrade height, then everything must be complete. From 3a04e49173a2a7fd0918e8263b1153ef3ba0801f Mon Sep 17 00:00:00 2001 From: Jernej Kos Date: Tue, 23 Apr 2024 18:30:59 +0200 Subject: [PATCH 2/2] ci: Bump parallelism for basic sgx1 E2E tests --- .buildkite/code.pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/code.pipeline.yml b/.buildkite/code.pipeline.yml index badc6c18d4d..ab600757241 100644 --- a/.buildkite/code.pipeline.yml +++ b/.buildkite/code.pipeline.yml @@ -238,7 +238,7 @@ steps: - "build-rust-runtime-loader" - "build-rust-runtimes" branches: "!master !stable/*" - parallelism: 6 + parallelism: 7 command: - trap 'buildkite-agent artifact upload "coverage-merged-e2e-*.txt;/tmp/e2e/**/*.log;/tmp/e2e/**/genesis.json;/tmp/e2e/**/runtime_genesis.json"' EXIT - .buildkite/scripts/download_e2e_test_artifacts.sh