Fix failing scale down and then scale up integration test
shayancanonical committed Sep 5, 2024
1 parent f0daa96 commit ab63d53
Showing 3 changed files with 41 additions and 29 deletions.
8 changes: 6 additions & 2 deletions lib/charms/mysql/v0/mysql.py

@@ -624,7 +624,7 @@ def cluster_initialized(self) -> bool:
         return False
 
     @property
-    def only_single_cluster_node_exists(self) -> Optional[bool]:
+    def only_single_cluster_node_exists_unitialized(self) -> Optional[bool]:
         """Check if only a single cluster node exists across all units."""
         if not self.app_peer_data.get("cluster-name"):
             return None
@@ -633,7 +633,11 @@ def only_single_cluster_node_exists(self) -> Optional[bool]:
         for unit in self.app_units:
             total_cluster_nodes += self._mysql.get_cluster_node_count(from_instance=self.get_unit_address(unit))
 
-        return total_cluster_nodes == 1
+        total_online_cluster_nodes = 0
+        for unit in self.app_units:
+            total_online_cluster_nodes += self._mysql.get_cluster_node_count(from_instance=self.get_unit_address(unit), node_status=MySQLMemberState["ONLINE"])
+
+        return total_cluster_nodes == 1 and total_online_cluster_nodes == 0
 
     @property
     def cluster_fully_initialized(self) -> bool:
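Note on the change above: the renamed property now returns True only when exactly one cluster node exists across all units and none of those nodes is ONLINE, whereas before it only counted nodes. A minimal standalone sketch of that decision logic, using illustrative per-unit counts instead of the charm's real self._mysql.get_cluster_node_count() helper (the function and variable names below are placeholders, not part of the library):

from typing import List, Optional

def single_uninitialized_node_check(
    per_unit_total: List[int], per_unit_online: List[int], cluster_name: Optional[str]
) -> Optional[bool]:
    """Illustrative stand-in for only_single_cluster_node_exists_unitialized.

    per_unit_total / per_unit_online mimic what the charm gathers by calling
    self._mysql.get_cluster_node_count() once per unit; they are not real APIs.
    """
    if not cluster_name:
        return None  # mirrors the early exit when "cluster-name" is unset

    total_cluster_nodes = sum(per_unit_total)
    total_online_cluster_nodes = sum(per_unit_online)

    # The extra online-node condition is what this commit adds: a healthy
    # single-node cluster (one node, and it is ONLINE) no longer counts as
    # "only a single uninitialized node".
    return total_cluster_nodes == 1 and total_online_cluster_nodes == 0

# One node exists and none are online -> True
assert single_uninitialized_node_check([1, 0, 0], [0, 0, 0], "cluster-a") is True
# One node exists and it is online -> False
assert single_uninitialized_node_check([1, 0, 0], [1, 0, 0], "cluster-a") is False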
17 changes: 12 additions & 5 deletions src/charm.py

@@ -736,6 +736,15 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
         if not self._mysql.is_mysqld_running():
             return True
 
+        only_single_unitialized_node_across_cluster = (
+            self.only_single_cluster_node_exists_unitialized
+        )
+
+        if (
+            not self.cluster_initialized and not only_single_unitialized_node_across_cluster
+        ) or not self.unit_peer_data.get("member-role"):
+            return True
+
         # retrieve and persist state for every unit
         try:
             state, role = self._mysql.get_member_state()
@@ -764,23 +773,21 @@ def _handle_potential_cluster_crash_scenario(self) -> bool:
             self.peers.data[unit].get("member-state", "unknown") for unit in self.peers.units
         }
 
-        only_single_node_across_cluster = self.only_single_cluster_node_exists
-
         # Add state 'offline' for this unit (self.peers.unit does not
         # include this unit)
         if (all_states | {"offline"} == {"offline"} and self.unit.is_leader()) or (
-            only_single_node_across_cluster and all_states == {"waiting"}
+            only_single_unitialized_node_across_cluster and all_states == {"waiting"}
         ):
             # All instance are off, reboot cluster from outage from the leader unit
 
             logger.info("Attempting reboot from complete outage.")
             try:
-                if self.unit.is_leader() or only_single_node_across_cluster:
+                if self.unit.is_leader() or only_single_unitialized_node_across_cluster:
                     self._mysql.reboot_from_complete_outage()
             except MySQLRebootFromCompleteOutageError:
                 logger.error("Failed to reboot cluster from complete outage.")
 
-            if only_single_node_across_cluster and all_states == {"waiting"}:
+            if only_single_unitialized_node_across_cluster and all_states == {"waiting"}:
                 self._mysql.drop_group_replication_metadata_schema()
                 self.create_cluster()
                 self.unit.status = ActiveStatus(self.active_status_message)
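For readers following the recovery path: the new early return in _handle_potential_cluster_crash_scenario means the handler only proceeds when the cluster is initialized or the single-uninitialized-node condition holds, and only if the unit has recorded a member-role. A simplified sketch of just that guard, with plain booleans standing in for the charm's properties and peer data (this is not the charm's actual code):

from typing import Optional

def crash_scenario_early_return(
    mysqld_running: bool,
    cluster_initialized: bool,
    only_single_unitialized_node: bool,
    member_role: Optional[str],
) -> bool:
    """True means the caller stops here, mirroring the early returns added in this commit."""
    if not mysqld_running:
        return True

    # Guard added by this commit: without an initialized cluster the handler can do
    # nothing useful, unless the deployment is in the single-uninitialized-node state
    # that the charm's reboot/re-create branch handles; a missing member-role also
    # means this unit has no state worth acting on yet.
    if (not cluster_initialized and not only_single_unitialized_node) or not member_role:
        return True

    return False  # fall through to state collection and reboot-from-outage logic

# Cluster not initialized and not the special single-node case -> bail out early
assert crash_scenario_early_return(True, False, False, "primary") is True
# Single uninitialized node with a recorded role -> continue handling
assert crash_scenario_early_return(True, False, True, "primary") is False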
45 changes: 23 additions & 22 deletions tests/integration/high_availability/test_crash_during_setup.py

@@ -74,27 +74,28 @@ async def test_crash_during_cluster_setup(ops_test) -> None:
     delete_pod(ops_test, leader_unit)
 
     logger.info("Waiting until pod rescheduled and cluster is set up again")
-    await ops_test.model.block_until(
-        lambda: leader_unit.workload_status == "active"
-        and leader_unit.workload_status_message == "Primary",
-        timeout=TIMEOUT,
-    )
+    async with ops_test.fast_forward("60s"):
+        await ops_test.model.block_until(
+            lambda: leader_unit.workload_status == "active"
+            and leader_unit.workload_status_message == "Primary",
+            timeout=TIMEOUT,
+        )
 
-    logger.info("Removing disabled flag from non-leader units")
-    for unit in non_leader_units:
-        unit_label = unit.name.replace("/", "-")
-        await delete_file_or_directory_in_unit(
-            ops_test,
-            unit.name,
-            f"/var/lib/juju/agents/unit-{unit_label}/charm/disable",
-            container_name="charm",
-        )
+        logger.info("Removing disabled flag from non-leader units")
+        for unit in non_leader_units:
+            unit_label = unit.name.replace("/", "-")
+            await delete_file_or_directory_in_unit(
+                ops_test,
+                unit.name,
+                f"/var/lib/juju/agents/unit-{unit_label}/charm/disable",
+                container_name="charm",
+            )
 
-    logger.info("Waiting until cluster is fully active")
-    await ops_test.model.wait_for_idle(
-        apps=[APP_NAME],
-        status="active",
-        raise_on_blocked=False,
-        timeout=TIMEOUT,
-        wait_for_exact_units=3,
-    )
+        logger.info("Waiting until cluster is fully active")
+        await ops_test.model.wait_for_idle(
+            apps=[APP_NAME],
+            status="active",
+            raise_on_blocked=False,
+            timeout=TIMEOUT,
+            wait_for_exact_units=3,
+        )
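On the test change: wrapping the waits in ops_test.fast_forward("60s") uses the pytest-operator context manager that temporarily shortens the model's update-status interval, so the charm's status and recovery logic re-run roughly every 60 seconds while the test blocks instead of at the model default. A minimal usage sketch of the pattern (the application name and timeout are placeholders, not values from this test):

import pytest

TIMEOUT = 15 * 60  # placeholder timeout in seconds

@pytest.mark.abort_on_fail
async def test_fast_forward_wait_pattern(ops_test) -> None:
    """Illustrative only: the fast_forward + wait pattern adopted by this commit."""
    # Inside the block, update-status hooks fire roughly every 60s, so status
    # changes (e.g. recovery after a crash) surface to the test sooner.
    async with ops_test.fast_forward("60s"):
        await ops_test.model.wait_for_idle(
            apps=["some-app"],  # placeholder application name
            status="active",
            timeout=TIMEOUT,
        )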
