Skip to content

Commit

Permalink
Merge pull request #5659 from oasisprotocol/kostko/feature/upgrade-no…
Browse files Browse the repository at this point in the history
…halt

go/upgrade: Support upgrades without stopping the node
  • Loading branch information
kostko authored Apr 24, 2024
2 parents f9c0325 + 3a04e49 commit e78d429
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 28 deletions.
2 changes: 1 addition & 1 deletion .buildkite/code.pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ steps:
- "build-rust-runtime-loader"
- "build-rust-runtimes"
branches: "!master !stable/*"
parallelism: 6
parallelism: 7
command:
- trap 'buildkite-agent artifact upload "coverage-merged-e2e-*.txt;/tmp/e2e/**/*.log;/tmp/e2e/**/genesis.json;/tmp/e2e/**/runtime_genesis.json"' EXIT
- .buildkite/scripts/download_e2e_test_artifacts.sh
Expand Down
1 change: 1 addition & 0 deletions .changelog/5659.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
go/upgrade: Support upgrades without stopping the node
60 changes: 35 additions & 25 deletions go/oasis-test-runner/scenario/e2e/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ func (c *upgrade240Checker) PostUpgradeFn(ctx context.Context, ctrl *oasis.Contr

var (
// NodeUpgradeDummy is the node upgrade dummy scenario.
NodeUpgradeDummy scenario.Scenario = newNodeUpgradeImpl(migrations.DummyUpgradeHandler, &dummyUpgradeChecker{})
NodeUpgradeDummy scenario.Scenario = newNodeUpgradeImpl(migrations.DummyUpgradeHandler, &dummyUpgradeChecker{}, true)
// NodeUpgradeEmpty is the empty node upgrade scenario.
NodeUpgradeEmpty scenario.Scenario = newNodeUpgradeImpl(migrations.EmptyHandler, &noOpUpgradeChecker{})
NodeUpgradeEmpty scenario.Scenario = newNodeUpgradeImpl(migrations.EmptyHandler, &noOpUpgradeChecker{}, false)
// NodeUpgradeConsensus240 is the node upgrade scenario for migrating to consensus 24.0.
NodeUpgradeConsensus240 scenario.Scenario = newNodeUpgradeImpl(migrations.Consensus240, &upgrade240Checker{})
NodeUpgradeConsensus240 scenario.Scenario = newNodeUpgradeImpl(migrations.Consensus240, &upgrade240Checker{}, false)

malformedDescriptor = []byte(`{
"v": 1,
Expand Down Expand Up @@ -202,6 +202,7 @@ type nodeUpgradeImpl struct {

handlerName upgrade.HandlerName
upgradeChecker upgradeChecker
needsRestart bool
}

func (sc *nodeUpgradeImpl) writeDescriptor(name string, content []byte) (string, error) {
Expand Down Expand Up @@ -253,11 +254,12 @@ func (sc *nodeUpgradeImpl) restart(ctx context.Context, wait bool) error {
}
}

func newNodeUpgradeImpl(handlerName upgrade.HandlerName, upgradeChecker upgradeChecker) scenario.Scenario {
// newNodeUpgradeImpl builds a node upgrade scenario named
// "node-upgrade-<handlerName>".
//
// handlerName selects the migration handler under test and upgradeChecker is
// stored alongside it for use by the scenario. needsRestart records whether
// the scenario expects validators to stop and be restarted during the
// upgrade; when false the scenario skips the restart step and does not
// register the startup-upgrade log assertion.
func newNodeUpgradeImpl(handlerName upgrade.HandlerName, upgradeChecker upgradeChecker, needsRestart bool) scenario.Scenario {
	scenarioName := "node-upgrade-" + string(handlerName)
	return &nodeUpgradeImpl{
		Scenario:       *NewScenario(scenarioName),
		handlerName:    handlerName,
		upgradeChecker: upgradeChecker,
		needsRestart:   needsRestart,
	}
}
Expand All @@ -267,6 +269,7 @@ func (sc *nodeUpgradeImpl) Clone() scenario.Scenario {
Scenario: *sc.Scenario.Clone().(*Scenario),
handlerName: sc.handlerName,
upgradeChecker: sc.upgradeChecker,
needsRestart: sc.needsRestart,
}
}

Expand All @@ -280,7 +283,6 @@ func (sc *nodeUpgradeImpl) Fixture() (*oasis.NetworkFixture, error) {
Network: oasis.NetworkCfg{
NodeBinary: f.Network.NodeBinary,
DefaultLogWatcherHandlerFactories: []log.WatcherHandlerFactory{
oasis.LogAssertUpgradeStartup(),
oasis.LogAssertUpgradeConsensus(),
},
},
Expand All @@ -297,6 +299,12 @@ func (sc *nodeUpgradeImpl) Fixture() (*oasis.NetworkFixture, error) {
Seeds: []oasis.SeedFixture{{}},
}

if sc.needsRestart {
ff.Network.DefaultLogWatcherHandlerFactories = append(ff.Network.DefaultLogWatcherHandlerFactories,
oasis.LogAssertUpgradeStartup(),
)
}

ff.Network.SetMockEpoch()
ff.Network.SetInsecureBeacon()

Expand Down Expand Up @@ -445,27 +453,29 @@ func (sc *nodeUpgradeImpl) Run(ctx context.Context, childEnv *env.Env) error { /
return err
}

sc.Logger.Info("restarting network")
errCh := make(chan error, len(sc.Net.Validators()))
var group sync.WaitGroup
for i, val := range sc.Net.Validators() {
group.Add(1)
go func(i int, val *oasis.Validator) {
defer group.Done()
sc.Logger.Debug("waiting for validator to exit", "num", i)
<-val.Exit()
sc.Logger.Debug("restarting validator", "num", i)
if restartError := val.Restart(ctx); err != nil {
errCh <- restartError
}
}(i, val)
}
if sc.needsRestart {
	// The upgrade under test requires a restart: wait for every validator
	// to exit on its own and then start it again. Validators are handled
	// concurrently, one goroutine per validator.
	sc.Logger.Info("restarting network")
	// Buffered to the number of validators so that no goroutine ever
	// blocks when reporting a restart failure.
	errCh := make(chan error, len(sc.Net.Validators()))
	var group sync.WaitGroup
	for i, val := range sc.Net.Validators() {
		group.Add(1)
		go func(i int, val *oasis.Validator) {
			defer group.Done()
			sc.Logger.Debug("waiting for validator to exit", "num", i)
			<-val.Exit()
			sc.Logger.Debug("restarting validator", "num", i)
			// Check the error returned by Restart itself. The previous
			// code tested the enclosing function's `err` here, so restart
			// failures were never reported on errCh.
			if restartError := val.Restart(ctx); restartError != nil {
				errCh <- restartError
			}
		}(i, val)
	}

	// Wait for all restart attempts to finish, then report the first
	// failure (if any) without blocking when there is none.
	group.Wait()
	select {
	case err = <-errCh:
		return fmt.Errorf("can't restart upgraded validator for upgrade test: %w", err)
	default:
	}
}

sc.Logger.Info("waiting for network to come back up")
Expand Down
5 changes: 5 additions & 0 deletions go/upgrade/migrations/consensus_240.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ var _ Handler = (*Handler240)(nil)
// from version 23.0.x to 24.0.0.
type Handler240 struct{}

// HasStartupUpgrade implements Handler.
//
// It always returns false: this handler reports that it has no startup
// upgrade stage.
func (h *Handler240) HasStartupUpgrade() bool {
return false
}

// StartupUpgrade implements Handler.
func (h *Handler240) StartupUpgrade() error {
return nil
Expand Down
4 changes: 4 additions & 0 deletions go/upgrade/migrations/dummy.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ func init() {

type dummyMigrationHandler struct{}

// HasStartupUpgrade implements Handler.
//
// It always returns true: the dummy handler reports that it has a startup
// upgrade stage.
func (th *dummyMigrationHandler) HasStartupUpgrade() bool {
return true
}

func (th *dummyMigrationHandler) StartupUpgrade() error {
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions go/upgrade/migrations/empty.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ var _ Handler = (*emptyHandler)(nil)

type emptyHandler struct{}

// HasStartupUpgrade implements Handler.
//
// It always returns false: the empty handler reports that it has no startup
// upgrade stage.
func (th *emptyHandler) HasStartupUpgrade() bool {
return false
}

func (th *emptyHandler) StartupUpgrade() error {
// Nothing to do.
return nil
Expand Down
3 changes: 3 additions & 0 deletions go/upgrade/migrations/migrations.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ var (

// Handler is the interface used by migration handlers.
type Handler interface {
// HasStartupUpgrade returns true iff the handler requires a startup upgrade.
HasStartupUpgrade() bool

// StartupUpgrade is called by the upgrade manager to perform
// the node startup portion of the upgrade.
StartupUpgrade() error
Expand Down
26 changes: 24 additions & 2 deletions go/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,30 @@ func (u *upgradeManager) ConsensusUpgrade(privateCtx interface{}, currentEpoch b
if err := u.flushDescriptorLocked(); err != nil {
return err
}
u.shouldStop = true // Ensure we really stop before proceeding.
return api.ErrStopForUpgrade
// Check if we can proceed in place (i.e. without a restart).
u.shouldStop = func() bool {
if err := pu.Descriptor.EnsureCompatible(); err != nil {
// Not compatible, we must stop.
return true
}
// Check if the migration handler has a startup stage.
handler, err := migrations.GetHandler(pu.Descriptor.Handler)
if err != nil {
// Handler not available, we must stop.
return true
}
if handler.HasStartupUpgrade() {
// Handler has a startup upgrade, we must stop.
return true
}
return false
}()
if u.shouldStop {
return api.ErrStopForUpgrade
}
// We can continue with the upgrade in place.
u.logger.Info("skipping node restart as no startup upgrade stage needed")
pu.PushStage(api.UpgradeStageStartup)
}

// If we're already past the upgrade height, then everything must be complete.
Expand Down

0 comments on commit e78d429

Please sign in to comment.