Skip to content

Commit

Permalink
Ensuers that when we failed to bring up Dqlite, and we are waiting for
Browse files Browse the repository at this point in the history
server detail messages, we do not bounce the dbaccessor worker for
messages indicating other cluster members.

Instead we try again to start Dqlite, potentially going through the same
workflow.
  • Loading branch information
manadart committed Aug 21, 2023
1 parent e5801f5 commit 405fc60
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 16 deletions.
6 changes: 4 additions & 2 deletions worker/dbaccessor/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,8 +616,10 @@ func (w *dbWorker) processAPIServerChange(apiDetails apiserver.Details) error {
}

// Otherwise there is no deterministic course of action.
// Play it safe and throw out.
return errors.Errorf("unable to reconcile current controller and Dqlite cluster status")
// We don't want to throw an error here, because it can result in churn
// when entering HA. Just try again to start.
log.Infof("unable to reconcile current controller and Dqlite cluster status; reattempting node start-up")
return errors.Trace(w.startExistingDqliteNode())
}

// Otherwise this is a node added by enabling HA,
Expand Down
36 changes: 22 additions & 14 deletions worker/dbaccessor/worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,37 +70,40 @@ func (s *workerSuite) TestStartupTimeoutSingleControllerReconfigure(c *gc.C) {
c.Assert(errors.Is(err, dependency.ErrBounce), jc.IsTrue)
}

func (s *workerSuite) TestStartupTimeoutMultipleControllerError(c *gc.C) {
func (s *workerSuite) TestStartupTimeoutMultipleControllerRetry(c *gc.C) {
defer s.setupMocks(c).Finish()

s.expectAnyLogs()
s.expectClock()
s.expectTrackedDBKill()

mgrExp := s.nodeManager.EXPECT()
mgrExp.EnsureDataDir().Return(c.MkDir(), nil)
mgrExp.EnsureDataDir().Return(c.MkDir(), nil).Times(2)
mgrExp.IsExistingNode().Return(true, nil).Times(2)
mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(3)
mgrExp.WithTLSOption().Return(nil, nil)
mgrExp.WithLogFuncOption().Return(nil)
mgrExp.WithTracingOption().Return(nil)
mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(4)

// App gets started, we time out waiting, then we close it.
// We expect 2 attempts to start.
mgrExp.WithTLSOption().Return(nil, nil).Times(2)
mgrExp.WithLogFuncOption().Return(nil).Times(2)
mgrExp.WithTracingOption().Return(nil).Times(2)

// App gets started, we time out waiting, then we close it both times.
appExp := s.dbApp.EXPECT()
appExp.Ready(gomock.Any()).Return(context.DeadlineExceeded)
appExp.Close().Return(nil)
appExp.Ready(gomock.Any()).Return(context.DeadlineExceeded).Times(2)
appExp.Close().Return(nil).Times(2)

// We expect to request API details.
s.hub.EXPECT().Subscribe(apiserver.DetailsTopic, gomock.Any()).Return(func() {}, nil)
s.hub.EXPECT().Publish(apiserver.DetailsRequestTopic, gomock.Any()).Return(func() {}, nil)
s.hub.EXPECT().Publish(apiserver.DetailsRequestTopic, gomock.Any()).Return(func() {}, nil).Times(2)

w := s.newWorker(c)
defer w.Kill()
defer workertest.CleanKill(c, w)
dbw := w.(*dbWorker)

// If there are multiple servers reported, we can't reason about our
// current state in a discrete fashion. The worker throws an error.
select {
case w.(*dbWorker).apiServerChanges <- apiserver.Details{
case dbw.apiServerChanges <- apiserver.Details{
Servers: map[string]apiserver.APIServer{
"0": {ID: "0", InternalAddress: "10.6.6.6:1234"},
"1": {ID: "1", InternalAddress: "10.6.6.7:1234"},
Expand All @@ -110,8 +113,13 @@ func (s *workerSuite) TestStartupTimeoutMultipleControllerError(c *gc.C) {
c.Fatal("timed out waiting for cluster change to be processed")
}

err := workertest.CheckKilled(c, w)
c.Assert(err, gc.ErrorMatches, "unable to reconcile current controller and Dqlite cluster status")
// At this point, the Dqlite node is not started.
// The worker is waiting for legitimate server detail messages.
select {
case <-dbw.dbReady:
c.Fatal("Dqlite node should not be started yet.")
case <-time.After(testing.ShortWait):
}
}

func (s *workerSuite) TestStartupNotExistingNodeThenCluster(c *gc.C) {
Expand Down

0 comments on commit 405fc60

Please sign in to comment.