diff --git a/worker/dbaccessor/worker.go b/worker/dbaccessor/worker.go
index 1ca787a0b35..9261bfe07a2 100644
--- a/worker/dbaccessor/worker.go
+++ b/worker/dbaccessor/worker.go
@@ -567,31 +567,30 @@ func (w *dbWorker) processAPIServerChange(apiDetails apiserver.Details) error {
 			return dependency.ErrBounce
 		}
 
-		// If we are an existing, previously clustered node, but the node is
-		// not running, check if we're the only API server.
-		// If we are, then we have gone from HA-n to HA-1 and have a broken
-		// cluster. Attempt to reconfigure the cluster with just ourselves
-		// as a member.
+		// If we are an existing, previously clustered node,
+		// and the node is running, we have nothing to do.
 		w.mu.RLock()
 		running := w.dbApp != nil
 		w.mu.RUnlock()
-
-		if serverCount > 1 || running {
+		if running {
 			return nil
 		}
 
-		// Be super careful and check that the sole node is actually us.
-		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; !ok {
-			return errors.Errorf("unable to reconcile current controller and Dqlite cluster status")
-		}
+		// Make absolutely sure. We only reconfigure the cluster if the details
+		// indicate exactly one controller machine, and that machine is us.
+		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; ok && serverCount == 1 {
+			log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
+			if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
+				return errors.Annotatef(err, "reconfiguring Dqlite cluster")
+			}
 
-		log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
-		if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
-			return errors.Annotatef(err, "reconfiguring Dqlite cluster")
+			log.Infof("successfully reconfigured Dqlite; restarting worker")
+			return dependency.ErrBounce
 		}
 
-		log.Infof("successfully reconfigured Dqlite; restarting worker")
-		return dependency.ErrBounce
+		// Otherwise there is no deterministic course of action.
+		// Play it safe and throw out.
+		return errors.Errorf("unable to reconcile current controller and Dqlite cluster status")
 	}
 
 	// Otherwise this is a node added by enabling HA,
diff --git a/worker/dbaccessor/worker_test.go b/worker/dbaccessor/worker_test.go
index 8456f51411a..1c6b39fba0d 100644
--- a/worker/dbaccessor/worker_test.go
+++ b/worker/dbaccessor/worker_test.go
@@ -29,7 +29,7 @@ type workerSuite struct {
 
 var _ = gc.Suite(&workerSuite{})
 
-func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
+func (s *workerSuite) TestStartupTimeoutSingleControllerReconfigure(c *gc.C) {
 	defer s.setupMocks(c).Finish()
 
 	s.expectAnyLogs()
@@ -38,8 +38,8 @@ func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
 
 	mgrExp := s.nodeManager.EXPECT()
 	mgrExp.EnsureDataDir().Return(c.MkDir(), nil)
-	mgrExp.IsExistingNode().Return(true, nil).Times(3)
-	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(4)
+	mgrExp.IsExistingNode().Return(true, nil).Times(2)
+	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(3)
 	mgrExp.WithTLSOption().Return(nil, nil)
 	mgrExp.WithLogFuncOption().Return(nil)
 	mgrExp.WithTracingOption().Return(nil)
@@ -57,25 +57,53 @@ func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
 	w := s.newWorker(c)
 	defer w.Kill()
 
-	// If there are multiple servers reported, we do nothing.
+	// Topology is just us. We should reconfigure the node and shut down.
 	select {
 	case w.(*dbWorker).apiServerChanges <- apiserver.Details{
-		Servers: map[string]apiserver.APIServer{
-			"0": {ID: "0", InternalAddress: "10.6.6.6:1234"},
-			"1": {ID: "1", InternalAddress: "10.6.6.7:1234"},
-		},
+		Servers: map[string]apiserver.APIServer{"0": {ID: "0", InternalAddress: "10.6.6.6:1234"}},
 	}:
 	case <-time.After(testing.LongWait):
 		c.Fatal("timed out waiting for cluster change to be processed")
 	}
 
-	workertest.CheckAlive(c, w)
+	err := workertest.CheckKilled(c, w)
+	c.Assert(errors.Is(err, dependency.ErrBounce), jc.IsTrue)
+}
+
+func (s *workerSuite) TestStartupTimeoutMultipleControllerError(c *gc.C) {
+	defer s.setupMocks(c).Finish()
+
+	s.expectAnyLogs()
+	s.expectClock()
+	s.expectTrackedDBKill()
+
+	mgrExp := s.nodeManager.EXPECT()
+	mgrExp.EnsureDataDir().Return(c.MkDir(), nil)
+	mgrExp.IsExistingNode().Return(true, nil).Times(2)
+	mgrExp.IsBootstrappedNode(gomock.Any()).Return(false, nil).Times(3)
+	mgrExp.WithTLSOption().Return(nil, nil)
+	mgrExp.WithLogFuncOption().Return(nil)
+	mgrExp.WithTracingOption().Return(nil)
+
+	// App gets started, we time out waiting, then we close it.
+	appExp := s.dbApp.EXPECT()
+	appExp.Ready(gomock.Any()).Return(context.DeadlineExceeded)
+	appExp.Close().Return(nil)
+
+	// We expect to request API details.
+	s.hub.EXPECT().Subscribe(apiserver.DetailsTopic, gomock.Any()).Return(func() {}, nil)
+	s.hub.EXPECT().Publish(apiserver.DetailsRequestTopic, gomock.Any()).Return(func() {}, nil)
 
-	// Now it's just us. We should reconfigure the node and shut down.
+	w := s.newWorker(c)
+	defer w.Kill()
+
+	// If there are multiple servers reported, we can't reason about our
+	// current state in a discrete fashion. The worker throws an error.
 	select {
 	case w.(*dbWorker).apiServerChanges <- apiserver.Details{
 		Servers: map[string]apiserver.APIServer{
 			"0": {ID: "0", InternalAddress: "10.6.6.6:1234"},
+			"1": {ID: "1", InternalAddress: "10.6.6.7:1234"},
 		},
 	}:
 	case <-time.After(testing.LongWait):
@@ -83,7 +111,7 @@ func (s *workerSuite) TestStartupTimeoutSingleServerReconfigure(c *gc.C) {
 	}
 
 	err := workertest.CheckKilled(c, w)
-	c.Assert(errors.Is(err, dependency.ErrBounce), jc.IsTrue)
+	c.Assert(err, gc.ErrorMatches, "unable to reconcile current controller and Dqlite cluster status")
 }
 
 func (s *workerSuite) TestStartupNotExistingNodeThenCluster(c *gc.C) {
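For reviewers, a minimal, self-contained sketch of the decision the reworked processAPIServerChange makes once the local Dqlite node is known not to be running: reconfigure only when the reported API details contain exactly one controller and that controller is us, otherwise refuse to guess and return an error. The names reconcile, errBounce, setClusterToLocalNode and the plain string map are illustrative stand-ins, not the real worker, dependency.ErrBounce, NodeManager or apiserver.Details types.

// decision_sketch.go - an illustrative stand-in, not the Juju code itself.
package main

import (
	"errors"
	"fmt"
)

// errBounce stands in for dependency.ErrBounce: returning it tells the
// dependency engine to restart the worker after a successful reconfiguration.
var errBounce = errors.New("bounce")

// reconcile mirrors the new control flow: a running node short-circuits,
// a sole reported controller that is us triggers reconfiguration, and any
// other topology is treated as irreconcilable.
func reconcile(controllerID string, servers map[string]string, running bool, setClusterToLocalNode func() error) error {
	if running {
		return nil
	}
	if _, ok := servers[controllerID]; ok && len(servers) == 1 {
		if err := setClusterToLocalNode(); err != nil {
			return fmt.Errorf("reconfiguring Dqlite cluster: %w", err)
		}
		return errBounce
	}
	return errors.New("unable to reconcile current controller and Dqlite cluster status")
}

func main() {
	noop := func() error { return nil }

	// Single-controller topology that includes us: reconfigure, then bounce.
	err := reconcile("0", map[string]string{"0": "10.6.6.6:1234"}, false, noop)
	fmt.Println(errors.Is(err, errBounce)) // true

	// Multiple controllers while we are down: no safe action, so error out.
	err = reconcile("0", map[string]string{"0": "10.6.6.6:1234", "1": "10.6.6.7:1234"}, false, noop)
	fmt.Println(err) // unable to reconcile current controller and Dqlite cluster status
}

The two tests in the diff exercise exactly these two branches: TestStartupTimeoutSingleControllerReconfigure expects the worker to be killed with dependency.ErrBounce, while TestStartupTimeoutMultipleControllerError expects the reconciliation error.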