Ensures that when we are in a potential backstop invocation scenario,
non-satisfaction of the conditions for cluster reconfiguration causes an
error instead of a nil return with continued waiting.

This is the safest option. It ensures we never get into a situation
where a worker that needs to cluster Dqlite is left waiting for API
server detail messages that may not arrive in a timely fashion.
manadart committed Jul 21, 2023
1 parent 4bfc98e commit fc3c2d4
Showing 2 changed files with 54 additions and 27 deletions.
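In short: after this change, the backstop path reconfigures the Dqlite cluster only when the API details name exactly one controller and that controller is this node; once the node is confirmed not running, any other topology is a hard error rather than a silent wait. Below is a minimal sketch of that decision flow with simplified, hypothetical names (decideBackstop and its parameters are illustrative; the real logic lives in processAPIServerChange in the diff that follows):

package backstop

import "errors"

// decideBackstop distils the post-change logic: servers holds the
// controller IDs reported in the API details, controllerID identifies
// this node, and running reports whether the local Dqlite app is up.
func decideBackstop(servers map[string]bool, controllerID string, running bool) (bool, error) {
	if running {
		// Existing, previously clustered node that is running: nothing to do.
		return false, nil
	}
	if servers[controllerID] && len(servers) == 1 {
		// HA-n collapsed to HA-1 and the sole member is us; it is safe
		// to reconfigure the cluster with this node as the only member.
		return true, nil
	}
	// The pre-change code returned nil here and kept waiting for API
	// server details; the post-change code fails fast instead.
	return false, errors.New("unable to reconcile current controller and Dqlite cluster status")
}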
31 changes: 15 additions & 16 deletions worker/dbaccessor/worker.go
@@ -567,31 +567,30 @@ func (w *dbWorker) processAPIServerChange(apiDetails apiserver.Details) error {
 			return dependency.ErrBounce
 		}
 
-		// If we are an existing, previously clustered node, but the node is
-		// not running, check if we're the only API server.
-		// If we are, then we have gone from HA-n to HA-1 and have a broken
-		// cluster. Attempt to reconfigure the cluster with just ourselves
-		// as a member.
+		// If we are an existing, previously clustered node,
+		// and the node is running, we have nothing to do.
 		w.mu.RLock()
 		running := w.dbApp != nil
 		w.mu.RUnlock()
 
-		if serverCount > 1 || running {
+		if running {
 			return nil
 		}
 
-		// Be super careful and check that the sole node is actually us.
-		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; !ok {
-			return errors.Errorf("unable to reconcile current controller and Dqlite cluster status")
-		}
+		// Make absolutely sure. We only reconfigure the cluster if the details
+		// indicate exactly one controller machine, and that machine is us.
+		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; ok && serverCount == 1 {
+			log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
+			if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
+				return errors.Annotatef(err, "reconfiguring Dqlite cluster")
+			}
 
-		log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
-		if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
-			return errors.Annotatef(err, "reconfiguring Dqlite cluster")
-		}
+			log.Infof("successfully reconfigured Dqlite; restarting worker")
+			return dependency.ErrBounce
+		}
 
-		log.Infof("successfully reconfigured Dqlite; restarting worker")
-		return dependency.ErrBounce
+		// Otherwise there is no deterministic course of action.
+		// Play it safe and throw out.
+		return errors.Errorf("unable to reconcile current controller and Dqlite cluster status")
 	}
 
 	// Otherwise this is a node added by enabling HA,
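A note on the two non-nil outcomes above (this describes juju's dependency engine, under which the dbaccessor worker runs; the stand-in below is illustrative, not the engine's API): a worker that dies with dependency.ErrBounce is restarted immediately, while any other error marks the worker failed. Roughly:

package backstop

import "errors"

// errBounce is a local stand-in for dependency.ErrBounce, which signals
// the dependency engine to restart the worker immediately.
var errBounce = errors.New("restart the worker")

// react models the effective contract after this commit: nil keeps the
// worker alive, a bounce restarts it so it comes back up against the
// freshly reconfigured cluster, and anything else fails the worker
// instead of leaving it waiting indefinitely.
func react(err error) string {
	switch {
	case err == nil:
		return "continue"
	case errors.Is(err, errBounce):
		return "restart"
	default:
		return "fail: " + err.Error()
	}
}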
50 changes: 39 additions & 11 deletions worker/dbaccessor/worker_test.go
@@ -29,7 +29,7 @@ type workerSuite struct {
 
 var _ = gc.Suite(&workerSuite{})
 
-func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
+func (s *workerSuite) TestStartupTimeoutSingleControllerReconfigure(c *gc.C) {
 	defer s.setupMocks(c).Finish()
 
 	s.expectAnyLogs()
@@ -38,8 +38,8 @@ func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
 
 	mgrExp := s.nodeManager.EXPECT()
 	mgrExp.EnsureDataDir().Return(c.MkDir(), nil)
-	mgrExp.IsExistingNode().Return(true, nil).Times(3)
-	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(4)
+	mgrExp.IsExistingNode().Return(true, nil).Times(2)
+	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(3)
 	mgrExp.WithTLSOption().Return(nil, nil)
 	mgrExp.WithLogFuncOption().Return(nil)
 	mgrExp.WithTracingOption().Return(nil)
@@ -57,33 +57,61 @@ func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
 	w := s.newWorker(c)
 	defer w.Kill()
 
-	// If there are multiple servers reported, we do nothing.
+	// Topology is just us. We should reconfigure the node and shut down.
 	select {
 	case w.(*dbWorker).apiServerChanges <- apiserver.Details{
-		Servers: map[string]apiserver.APIServer{
-			"0": {ID: "0", InternalAddress: "10.6.6.6:1234"},
-			"1": {ID: "1", InternalAddress: "10.6.6.7:1234"},
-		},
+		Servers: map[string]apiserver.APIServer{"0": {ID: "0", InternalAddress: "10.6.6.6:1234"}},
 	}:
 	case <-time.After(testing.LongWait):
 		c.Fatal("timed out waiting for cluster change to be processed")
 	}
 
-	workertest.CheckAlive(c, w)
+	err := workertest.CheckKilled(c, w)
+	c.Assert(errors.Is(err, dependency.ErrBounce), jc.IsTrue)
+}
+
+func (s *workerSuite) TestStartupTimeoutMultipleControllerError(c *gc.C) {
+	defer s.setupMocks(c).Finish()
+
+	s.expectAnyLogs()
+	s.expectClock()
+	s.expectTrackedDBKill()
+
+	mgrExp := s.nodeManager.EXPECT()
+	mgrExp.EnsureDataDir().Return(c.MkDir(), nil)
+	mgrExp.IsExistingNode().Return(true, nil).Times(2)
+	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(3)
+	mgrExp.WithTLSOption().Return(nil, nil)
+	mgrExp.WithLogFuncOption().Return(nil)
+	mgrExp.WithTracingOption().Return(nil)
+
+	// App gets started, we time out waiting, then we close it.
+	appExp := s.dbApp.EXPECT()
+	appExp.Ready(gomock.Any()).Return(context.DeadlineExceeded)
+	appExp.Close().Return(nil)
+
+	// We expect to request API details.
+	s.hub.EXPECT().Subscribe(apiserver.DetailsTopic, gomock.Any()).Return(func() {}, nil)
+	s.hub.EXPECT().Publish(apiserver.DetailsRequestTopic, gomock.Any()).Return(func() {}, nil)
 
-	// Now it's just us. We should reconfigure the node and shut down.
+	w := s.newWorker(c)
+	defer w.Kill()
+
+	// If there are multiple servers reported, we can't reason about our
+	// current state in a discrete fashion. The worker throws an error.
 	select {
 	case w.(*dbWorker).apiServerChanges <- apiserver.Details{
-		Servers: map[string]apiserver.APIServer{"0": {ID: "0", InternalAddress: "10.6.6.6:1234"}},
+		Servers: map[string]apiserver.APIServer{
+			"0": {ID: "0", InternalAddress: "10.6.6.6:1234"},
+			"1": {ID: "1", InternalAddress: "10.6.6.7:1234"},
+		},
 	}:
 	case <-time.After(testing.LongWait):
 		c.Fatal("timed out waiting for cluster change to be processed")
 	}
 
 	err := workertest.CheckKilled(c, w)
-	c.Assert(errors.Is(err, dependency.ErrBounce), jc.IsTrue)
+	c.Assert(err, gc.ErrorMatches, "unable to reconcile current controller and Dqlite cluster status")
 }
 
 func (s *workerSuite) TestStartupNotExistingNodeThenCluster(c *gc.C) {
