Fix failing scale down and then scale up integration test
shayancanonical committed Sep 5, 2024
1 parent f0daa96 commit ab63d53
Showing 3 changed files with 41 additions and 29 deletions.
8 changes: 6 additions & 2 deletions lib/charms/mysql/v0/mysql.py

@@ -624,7 +624,7 @@ def cluster_initialized(self) -> bool:
         return False
 
     @property
-    def only_single_cluster_node_exists(self) -> Optional[bool]:
+    def only_single_cluster_node_exists_unitialized(self) -> Optional[bool]:
         """Check if only a single cluster node exists across all units."""
         if not self.app_peer_data.get("cluster-name"):
             return None
@@ -633,7 +633,11 @@ def only_single_cluster_node_exists(self) -> Optional[bool]:
         for unit in self.app_units:
             total_cluster_nodes += self._mysql.get_cluster_node_count(from_instance=self.get_unit_address(unit))
 
-        return total_cluster_nodes == 1
+        total_online_cluster_nodes = 0
+        for unit in self.app_units:
+            total_online_cluster_nodes += self._mysql.get_cluster_node_count(from_instance=self.get_unit_address(unit), node_status=MySQLMemberState["ONLINE"])
+
+        return total_cluster_nodes == 1 and total_online_cluster_nodes == 0
 
     @property
     def cluster_fully_initialized(self) -> bool:
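Note on the change above: the renamed property now returns True only when exactly one cluster node exists across all units and none of those nodes is ONLINE, whereas before it only counted nodes. A minimal standalone sketch of that decision logic, using illustrative per-unit counts instead of the charm's real self._mysql.get_cluster_node_count() helper (the function and variable names below are placeholders, not part of the library):

from typing import List, Optional

def single_uninitialized_node_check(
    per_unit_total: List[int], per_unit_online: List[int], cluster_name: Optional[str]
) -> Optional[bool]:
    """Illustrative stand-in for only_single_cluster_node_exists_unitialized.

    per_unit_total / per_unit_online mimic what the charm gathers by calling
    self._mysql.get_cluster_node_count() once per unit; they are not real APIs.
    """
    if not cluster_name:
        return None  # mirrors the early exit when "cluster-name" is unset

    total_cluster_nodes = sum(per_unit_total)
    total_online_cluster_nodes = sum(per_unit_online)

    # The extra online-node condition is what this commit adds: a healthy
    # single-node cluster (one node, and it is ONLINE) no longer counts as
    # "only a single uninitialized node".
    return total_cluster_nodes == 1 and total_online_cluster_nodes == 0

# One node exists and none are online -> True
assert single_uninitialized_node_check([1, 0, 0], [0, 0, 0], "cluster-a") is True
# One node exists and it is online -> False
assert single_uninitialized_node_check([1, 0, 0], [1, 0, 0], "cluster-a") is False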
17 changes: 12 additions & 5 deletions src/charm.py

@@ -736,6 +736,15 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         if not self._mysql.is_mysqld_running():
             return True
 
+        only_single_unitialized_node_across_cluster = (
+            self.only_single_cluster_node_exists_unitialized
+        )
+
+        if (
+            not self.cluster_initialized and not only_single_unitialized_node_across_cluster
+        ) or not self.unit_peer_data.get("member-role"):
+            return True
+
         # retrieve and persist state for every unit
         try:
             state, role = self._mysql.get_member_state()
@@ -764,23 +773,21 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
             self.peers.data[unit].get("member-state", "unknown") for unit in self.peers.units
         }
 
-        only_single_node_across_cluster = self.only_single_cluster_node_exists
-
         # Add state 'offline' for this unit (self.peers.unit does not
         # include this unit)
         if (all_states | {"offline"} == {"offline"} and self.unit.is_leader()) or (
-            only_single_node_across_cluster and all_states == {"waiting"}
+            only_single_unitialized_node_across_cluster and all_states == {"waiting"}
         ):
             # All instance are off, reboot cluster from outage from the leader unit
 
             logger.info("Attempting reboot from complete outage.")
             try:
-                if self.unit.is_leader() or only_single_node_across_cluster:
+                if self.unit.is_leader() or only_single_unitialized_node_across_cluster:
                     self._mysql.reboot_from_complete_outage()
             except MySQLRebootFromCompleteOutageError:
                 logger.error("Failed to reboot cluster from complete outage.")
 
-            if only_single_node_across_cluster and all_states == {"waiting"}:
+            if only_single_unitialized_node_across_cluster and all_states == {"waiting"}:
                 self._mysql.drop_group_replication_metadata_schema()
                 self.create_cluster()
                 self.unit.status = ActiveStatus(self.active_status_message)
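For readers following the recovery path: the new early return in _handle_potential_cluster_crash_scenario means the handler only proceeds when the cluster is initialized or the single-uninitialized-node condition holds, and only if the unit has recorded a member-role. A simplified sketch of just that guard, with plain booleans standing in for the charm's properties and peer data (this is not the charm's actual code):

from typing import Optional

def crash_scenario_early_return(
    mysqld_running: bool,
    cluster_initialized: bool,
    only_single_unitialized_node: bool,
    member_role: Optional[str],
) -> bool:
    """True means the caller stops here, mirroring the early returns added in this commit."""
    if not mysqld_running:
        return True

    # Guard added by this commit: without an initialized cluster the handler can do
    # nothing useful, unless the deployment is in the single-uninitialized-node state
    # that the charm's reboot/re-create branch handles; a missing member-role also
    # means this unit has no state worth acting on yet.
    if (not cluster_initialized and not only_single_unitialized_node) or not member_role:
        return True

    return False  # fall through to state collection and reboot-from-outage logic

# Cluster not initialized and not the special single-node case -> bail out early
assert crash_scenario_early_return(True, False, False, "primary") is True
# Single uninitialized node with a recorded role -> continue handling
assert crash_scenario_early_return(True, False, True, "primary") is False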
45 changes: 23 additions & 22 deletions tests/integration/high_availability/test_crash_during_setup.py

@@ -74,27 +74,28 @@ async def test_crash_during_cluster_setup(ops_test) -> None:
     delete_pod(ops_test, leader_unit)
 
     logger.info("Waiting until pod rescheduled and cluster is set up again")
-    await ops_test.model.block_until(
-        lambda: leader_unit.workload_status == "active"
-        and leader_unit.workload_status_message == "Primary",
-        timeout=TIMEOUT,
-    )
+    async with ops_test.fast_forward("60s"):
+        await ops_test.model.block_until(
+            lambda: leader_unit.workload_status == "active"
+            and leader_unit.workload_status_message == "Primary",
+            timeout=TIMEOUT,
+        )
 
-    logger.info("Removing disabled flag from non-leader units")
-    for unit in non_leader_units:
-        unit_label = unit.name.replace("/", "-")
-        await delete_file_or_directory_in_unit(
-            ops_test,
-            unit.name,
-            f"/var/lib/juju/agents/unit-{unit_label}/charm/disable",
-            container_name="charm",
-        )
+        logger.info("Removing disabled flag from non-leader units")
+        for unit in non_leader_units:
+            unit_label = unit.name.replace("/", "-")
+            await delete_file_or_directory_in_unit(
+                ops_test,
+                unit.name,
+                f"/var/lib/juju/agents/unit-{unit_label}/charm/disable",
+                container_name="charm",
+            )
 
-    logger.info("Waiting until cluster is fully active")
-    await ops_test.model.wait_for_idle(
-        apps=[APP_NAME],
-        status="active",
-        raise_on_blocked=False,
-        timeout=TIMEOUT,
-        wait_for_exact_units=3,
-    )
+        logger.info("Waiting until cluster is fully active")
+        await ops_test.model.wait_for_idle(
+            apps=[APP_NAME],
+            status="active",
+            raise_on_blocked=False,
+            timeout=TIMEOUT,
+            wait_for_exact_units=3,
+        )
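On the test change: wrapping the waits in ops_test.fast_forward("60s") uses the pytest-operator context manager that temporarily shortens the model's update-status interval, so the charm's status and recovery logic re-run roughly every 60 seconds while the test blocks instead of at the model default. A minimal usage sketch of the pattern (the application name and timeout are placeholders, not values from this test):

import pytest

TIMEOUT = 15 * 60  # placeholder timeout in seconds

@pytest.mark.abort_on_fail
async def test_fast_forward_wait_pattern(ops_test) -> None:
    """Illustrative only: the fast_forward + wait pattern adopted by this commit."""
    # Inside the block, update-status hooks fire roughly every 60s, so status
    # changes (e.g. recovery after a crash) surface to the test sooner.
    async with ops_test.fast_forward("60s"):
        await ops_test.model.wait_for_idle(
            apps=["some-app"],  # placeholder application name
            status="active",
            timeout=TIMEOUT,
        )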
