From f50bb2c07c77d12b3f0436825dea22368874164e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 20 Nov 2024 17:51:33 +0200 Subject: [PATCH 01/22] Update patroni configuration --- src/patroni.py | 2 +- templates/patroni.yml.j2 | 2 +- tests/unit/test_patroni.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/patroni.py b/src/patroni.py index 8cbb122a4d..c2632c6095 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -489,7 +489,7 @@ def render_patroni_yml_file( restore_to_latest=restore_to_latest, stanza=stanza, restore_stanza=restore_stanza, - minority_count=self._members_count // 2, + synchronous_node_count=self._members_count - 1, version=self.rock_postgresql_version.split(".")[0], pg_parameters=parameters, primary_cluster_endpoint=self._charm.async_replication.get_primary_cluster_endpoint(), diff --git a/templates/patroni.yml.j2 b/templates/patroni.yml.j2 index 96854216b1..2bd837d7e7 100644 --- a/templates/patroni.yml.j2 +++ b/templates/patroni.yml.j2 @@ -1,7 +1,7 @@ bootstrap: dcs: synchronous_mode: true - synchronous_node_count: {{ minority_count }} + synchronous_node_count: {{ synchronous_node_count }} postgresql: use_pg_rewind: true remove_data_directory_on_rewind_failure: true diff --git a/tests/unit/test_patroni.py b/tests/unit/test_patroni.py index 211b84fafb..9fee68019d 100644 --- a/tests/unit/test_patroni.py +++ b/tests/unit/test_patroni.py @@ -216,7 +216,7 @@ def test_render_patroni_yml_file(harness, patroni): replication_password=patroni._replication_password, rewind_user=REWIND_USER, rewind_password=patroni._rewind_password, - minority_count=patroni._members_count // 2, + synchronous_node_count=patroni._members_count - 1, version="14", patroni_password=patroni._patroni_password, ) @@ -251,7 +251,7 @@ def test_render_patroni_yml_file(harness, patroni): replication_password=patroni._replication_password, rewind_user=REWIND_USER, rewind_password=patroni._rewind_password, - minority_count=patroni._members_count // 2, + synchronous_node_count=patroni._members_count - 1, version="14", patroni_password=patroni._patroni_password, ) From 235e19dc3458a92b4fc49536031eb0827e193973 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 20 Nov 2024 17:53:54 +0200 Subject: [PATCH 02/22] Update test assertion --- tests/integration/helpers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 9a0c4c5c0c..d4c6f9d131 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -769,7 +769,6 @@ async def switchover( ) assert response.status_code == 200, f"Switchover status code is {response.status_code}" app_name = current_primary.split("/")[0] - minority_count = len(ops_test.model.applications[app_name].units) // 2 for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(2), reraise=True): with attempt: response = requests.get(f"http://{primary_ip}:8008/cluster") @@ -777,7 +776,7 @@ async def switchover( standbys = len([ member for member in response.json()["members"] if member["role"] == "sync_standby" ]) - assert standbys >= minority_count + assert standbys == len(ops_test.model.applications[app_name].units) - 1 async def wait_for_idle_on_blocked( From 30a7f565c84d835f78a1284d57b32b4a06eae514 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 13:20:27 +0200 Subject: [PATCH 03/22] Copy update_synchronous_node_count from VM --- src/charm.py | 14 +++++++++++++- src/patroni.py | 22 ++++++++++++++++++++++ tests/unit/test_charm.py | 1 + 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index 89d73e360e..c3406e25b7 100755 --- a/src/charm.py +++ b/src/charm.py @@ -441,13 +441,24 @@ def get_unit_ip(self, unit: Unit) -> Optional[str]: else: return None + def updated_synchronous_node_count(self, num_units: int | None = None) -> bool: + """Tries to update synchronous_node_count configuration and reports the result.""" + try: + self._patroni.update_synchronous_node_count(num_units) + return True + except RetryError: + logger.debug("Unable to set synchronous_node_count") + return False + def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: """The leader removes the departing units from the list of cluster members.""" # Allow leader to update endpoints if it isn't leaving. if not self.unit.is_leader() or event.departing_unit == self.unit: return - if "cluster_initialised" not in self._peers.data[self.app]: + if "cluster_initialised" not in self._peers.data[ + self.app + ] or not self.updated_synchronous_node_count(self.app.planned_units()): logger.debug( "Deferring on_peer_relation_departed: Cluster must be initialized before members can leave" ) @@ -774,6 +785,7 @@ def _add_members(self, event) -> None: for member in self._hosts - self._patroni.cluster_members: logger.debug("Adding %s to cluster", member) self.add_cluster_member(member) + self._patroni.update_synchronous_node_count() except NotReadyError: logger.info("Deferring reconfigure: another member doing sync right now") event.defer() diff --git a/src/patroni.py b/src/patroni.py index c2632c6095..aa973ea84b 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -52,6 +52,10 @@ class SwitchoverFailedError(Exception): """Raised when a switchover failed for some reason.""" +class UpdateSyncNodeCountError(Exception): + """Raised when updating synchronous_node_count failed for some reason.""" + + class Patroni: """This class handles the communication with Patroni API and configuration files.""" @@ -125,6 +129,24 @@ def _get_alternative_patroni_url( url = self._patroni_url return url + def update_synchronous_node_count(self, units: int | None = None) -> None: + """Update synchronous_node_count to the minority of the planned cluster.""" + if units is None: + units = self.planned_units + # Try to update synchronous_node_count. + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + r = requests.patch( + f"{self._patroni_url}/config", + json={"synchronous_node_count": units // 2}, + verify=self.verify, + auth=self._patroni_auth, + ) + + # Check whether the update was unsuccessful. + if r.status_code != 200: + raise UpdateSyncNodeCountError(f"received {r.status_code}") + def get_primary(self, unit_name_pattern=False, alternative_endpoints: List[str] = None) -> str: """Get primary instance. diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index fbfbbc2e8d..9ca83781ef 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -668,6 +668,7 @@ def test_on_peer_relation_departed(harness): "charm.PostgresqlOperatorCharm._get_endpoints_to_remove", return_value=sentinel.units ) as _get_endpoints_to_remove, patch("charm.PostgresqlOperatorCharm._remove_from_endpoints") as _remove_from_endpoints, + patch("charm.PostgresqlOperatorCharm.updated_synchronous_node_count"), ): # Early exit if not leader event = Mock() From c6339ce19e0ef774a768fd4e9487f11674a48661 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 13:39:52 +0200 Subject: [PATCH 04/22] Add unit test --- src/patroni.py | 8 ++++---- tests/unit/test_patroni.py | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/patroni.py b/src/patroni.py index aa973ea84b..be3c320ae8 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -130,16 +130,16 @@ def _get_alternative_patroni_url( return url def update_synchronous_node_count(self, units: int | None = None) -> None: - """Update synchronous_node_count to the minority of the planned cluster.""" + """Update synchronous_node_count.""" if units is None: - units = self.planned_units + units = self._members_count # Try to update synchronous_node_count. for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: r = requests.patch( f"{self._patroni_url}/config", - json={"synchronous_node_count": units // 2}, - verify=self.verify, + json={"synchronous_node_count": units - 1}, + verify=self._verify, auth=self._patroni_auth, ) diff --git a/tests/unit/test_patroni.py b/tests/unit/test_patroni.py index 9fee68019d..8024ac5ee5 100644 --- a/tests/unit/test_patroni.py +++ b/tests/unit/test_patroni.py @@ -452,3 +452,28 @@ def test_last_postgresql_logs(harness, patroni): (root / "var" / "log" / "postgresql" / "postgresql.3.log").unlink() (root / "var" / "log" / "postgresql").rmdir() assert patroni.last_postgresql_logs() == "" + + +def test_update_synchronous_node_count(harness, patroni): + with ( + patch("patroni.stop_after_delay", return_value=stop_after_delay(0)) as _wait_fixed, + patch("patroni.wait_fixed", return_value=wait_fixed(0)) as _wait_fixed, + patch("requests.patch") as _patch, + ): + response = _patch.return_value + response.status_code = 200 + + patroni.update_synchronous_node_count() + + _patch.assert_called_once_with( + "http://postgresql-k8s-0:8008/config", + json={"synchronous_node_count": 2}, + verify=True, + auth=patroni._patroni_auth, + ) + + # Test when the request fails. + response.status_code = 500 + with pytest.raises(RetryError): + patroni.update_synchronous_node_count() + assert False From 9bb565ea25a9df37415e0787bb252c6d83f4701e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 16:42:08 +0200 Subject: [PATCH 05/22] Set sync node count during upgrade --- src/upgrade.py | 1 + tests/unit/test_upgrade.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/src/upgrade.py b/src/upgrade.py index 18c7e10fd1..9780a1ab13 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -152,6 +152,7 @@ def _on_upgrade_changed(self, event) -> None: return self.charm.update_config() + self.charm.updated_synchronous_node_count() def _on_upgrade_charm_check_legacy(self, event: UpgradeCharmEvent) -> None: if not self.peer_relation: diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py index 8035a6dd39..276dd0e4ca 100644 --- a/tests/unit/test_upgrade.py +++ b/tests/unit/test_upgrade.py @@ -158,6 +158,9 @@ def test_on_upgrade_changed(harness): with ( patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started, + patch( + "charm.PostgresqlOperatorCharm.updated_synchronous_node_count" + ) as _updated_synchronous_node_count, ): harness.set_can_connect(POSTGRESQL_CONTAINER, True) _member_started.return_value = False @@ -168,6 +171,7 @@ def test_on_upgrade_changed(harness): _member_started.return_value = True harness.charm.on.upgrade_relation_changed.emit(relation) _update_config.assert_called_once() + _updated_synchronous_node_count.assert_called_once_with() def test_pre_upgrade_check(harness): From 6e60993d50ee615121aa3f1bd9a12ec24b94db93 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 19:27:21 +0200 Subject: [PATCH 06/22] Fix tls test --- tests/integration/test_tls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 5ca5d9f373..eae009140e 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -105,7 +105,7 @@ async def test_tls(ops_test: OpsTest) -> None: patroni_password = await get_password(ops_test, "patroni") cluster_info = requests.get(f"https://{primary_address}:8008/cluster", verify=False) for member in cluster_info.json()["members"]: - if member["role"] == "replica": + if member["role"] != "leader": replica = "/".join(member["name"].rsplit("-", 1)) # Check if TLS enabled for replication From 025bfb28d6da895cba0908793d8b1222c154b325 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 20:18:31 +0200 Subject: [PATCH 07/22] Switchover primary --- .../ha_tests/test_async_replication.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py index 3c5ea5ea09..f74a0581ee 100644 --- a/tests/integration/ha_tests/test_async_replication.py +++ b/tests/integration/ha_tests/test_async_replication.py @@ -32,7 +32,6 @@ are_writes_increasing, check_writes, get_standby_leader, - get_sync_standby, start_continuous_writes, ) @@ -416,11 +415,11 @@ async def test_async_replication_failover_in_main_cluster( logger.info("checking whether writes are increasing") await are_writes_increasing(ops_test) - sync_standby = await get_sync_standby(first_model, DATABASE_APP_NAME) - logger.info(f"Sync-standby: {sync_standby}") - logger.info("deleting the sync-standby pod") + primary = await get_primary(first_model, DATABASE_APP_NAME) + logger.info(f"Primary: {primary}") + logger.info("deleting the primary pod") client = Client(namespace=first_model.info.name) - client.delete(Pod, name=sync_standby.replace("/", "-")) + client.delete(Pod, name=primary.replace("/", "-")) async with ops_test.fast_forward(FAST_INTERVAL), fast_forward(second_model, FAST_INTERVAL): await gather( @@ -433,9 +432,9 @@ async def test_async_replication_failover_in_main_cluster( ) # Check that the sync-standby unit is not the same as before. - new_sync_standby = await get_sync_standby(first_model, DATABASE_APP_NAME) - logger.info(f"New sync-standby: {new_sync_standby}") - assert new_sync_standby != sync_standby, "Sync-standby is the same as before" + new_primary = await get_primary(first_model, DATABASE_APP_NAME) + logger.info(f"New sync-standby: {new_primary}") + assert new_primary != primary, "Sync-standby is the same as before" logger.info("Ensure continuous_writes after the crashed unit") await are_writes_increasing(ops_test) From f88c95666cb721bc515cb65be4727202de9b9f0c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 21 Nov 2024 21:03:03 +0200 Subject: [PATCH 08/22] Add different helper to get leader --- tests/integration/ha_tests/helpers.py | 18 ++++++++++++++++++ .../ha_tests/test_async_replication.py | 5 +++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index ab8ea58abc..ab6b8acc70 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -510,6 +510,24 @@ async def get_postgresql_parameter(ops_test: OpsTest, parameter_name: str) -> Op return parameter_value +async def get_leader(model: Model, application_name: str) -> str: + """Get the standby leader name. + + Args: + model: the model instance. + application_name: the name of the application to get the value for. + + Returns: + the name of the standby leader. + """ + status = await model.get_status() + first_unit_ip = list(status["applications"][application_name]["units"].values())[0]["address"] + cluster = get_patroni_cluster(first_unit_ip) + for member in cluster["members"]: + if member["role"] == "leader": + return member["name"] + + async def get_standby_leader(model: Model, application_name: str) -> str: """Get the standby leader name. diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py index f74a0581ee..d944aa82ed 100644 --- a/tests/integration/ha_tests/test_async_replication.py +++ b/tests/integration/ha_tests/test_async_replication.py @@ -31,6 +31,7 @@ from .helpers import ( are_writes_increasing, check_writes, + get_leader, get_standby_leader, start_continuous_writes, ) @@ -415,7 +416,7 @@ async def test_async_replication_failover_in_main_cluster( logger.info("checking whether writes are increasing") await are_writes_increasing(ops_test) - primary = await get_primary(first_model, DATABASE_APP_NAME) + primary = await get_leader(first_model, DATABASE_APP_NAME) logger.info(f"Primary: {primary}") logger.info("deleting the primary pod") client = Client(namespace=first_model.info.name) @@ -432,7 +433,7 @@ async def test_async_replication_failover_in_main_cluster( ) # Check that the sync-standby unit is not the same as before. - new_primary = await get_primary(first_model, DATABASE_APP_NAME) + new_primary = await get_leader(first_model, DATABASE_APP_NAME) logger.info(f"New sync-standby: {new_primary}") assert new_primary != primary, "Sync-standby is the same as before" From 39817f6178d9e0370f22f3e99d38ff2580c7f39c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 Nov 2024 16:06:58 +0200 Subject: [PATCH 09/22] Add config boilerplate --- config.yaml | 5 +++++ src/config.py | 3 ++- tests/integration/test_config.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 6fb64366b2..9177e4f4f9 100644 --- a/config.yaml +++ b/config.yaml @@ -2,6 +2,11 @@ # See LICENSE file for licensing details. options: + durability_synchronous_node_count: + description: | + Sets the number of synchronous nodes to be maintained in the cluster. Should be + a positive value. + type: integer durability_synchronous_commit: description: | Sets the current transactions synchronization level. This charm allows only the diff --git a/src/config.py b/src/config.py index b5f41ec5b4..65f85b6ae9 100644 --- a/src/config.py +++ b/src/config.py @@ -8,7 +8,7 @@ from typing import Optional from charms.data_platform_libs.v0.data_models import BaseConfigModel -from pydantic import validator +from pydantic import PositiveInt, validator logger = logging.getLogger(__name__) @@ -16,6 +16,7 @@ class CharmConfig(BaseConfigModel): """Manager for the structured configuration.""" + durability_synchronous_node_count: Optional[PositiveInt] durability_synchronous_commit: Optional[str] instance_default_text_search_config: Optional[str] instance_password_encryption: Optional[str] diff --git a/tests/integration/test_config.py b/tests/integration/test_config.py index 9fe542b0cd..a70b3f68dc 100644 --- a/tests/integration/test_config.py +++ b/tests/integration/test_config.py @@ -30,6 +30,7 @@ async def test_config_parameters(ops_test: OpsTest) -> None: test_string = "abcXYZ123" configs = [ + {"durability_synchronous_node_count": [0, 1]}, # config option is greater than 0 { "durability_synchronous_commit": [test_string, "on"] }, # config option is one of `on`, `remote_apply` or `remote_write` From 43412d1b926c317436dd8611e760b7ac697168f2 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 Nov 2024 17:45:10 +0200 Subject: [PATCH 10/22] Use config value when setting sync node count --- lib/charms/postgresql_k8s/v0/postgresql.py | 4 ++-- src/charm.py | 18 +++++++++++++----- src/patroni.py | 20 +++++++++++++++----- tests/integration/test_config.py | 2 +- tests/unit/test_charm.py | 11 +++++++++++ 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 4d8d6dc30c..f8d347f2df 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 39 +LIBPATCH = 40 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -654,7 +654,7 @@ def build_postgresql_parameters( "request", "response", "vacuum", - )): + )) or config in ("durability_synchronous_node_count"): continue parameter = "_".join(config.split("_")[1:]) if parameter in ["date_style", "time_zone"]: diff --git a/src/charm.py b/src/charm.py index c384bec642..775122c324 100755 --- a/src/charm.py +++ b/src/charm.py @@ -441,10 +441,10 @@ def get_unit_ip(self, unit: Unit) -> Optional[str]: else: return None - def updated_synchronous_node_count(self, num_units: int | None = None) -> bool: + def updated_synchronous_node_count(self) -> bool: """Tries to update synchronous_node_count configuration and reports the result.""" try: - self._patroni.update_synchronous_node_count(num_units) + self._patroni.update_synchronous_node_count() return True except RetryError: logger.debug("Unable to set synchronous_node_count") @@ -456,9 +456,10 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: if not self.unit.is_leader() or event.departing_unit == self.unit: return - if "cluster_initialised" not in self._peers.data[ - self.app - ] or not self.updated_synchronous_node_count(self.app.planned_units()): + if ( + "cluster_initialised" not in self._peers.data[self.app] + or not self.updated_synchronous_node_count() + ): logger.debug( "Deferring on_peer_relation_departed: Cluster must be initialized before members can leave" ) @@ -662,6 +663,10 @@ def _on_config_changed(self, event) -> None: self.unit.status = BlockedStatus("Configuration Error. Please check the logs") logger.error("Invalid configuration: %s", str(e)) return + if not self.updated_synchronous_node_count(): + logger.debug("Defer on_config_changed: unable to set synchronous node count") + event.defer() + return if self.is_blocked and "Configuration Error" in self.unit.status.message: self._set_active_status() @@ -675,6 +680,9 @@ def _on_config_changed(self, event) -> None: # Enable and/or disable the extensions. self.enable_disable_extensions() + self._unblock_extensions() + + def _unblock_extensions(self) -> None: # Unblock the charm after extensions are enabled (only if it's blocked due to application # charms requesting extensions). if self.unit.status.message != EXTENSIONS_BLOCKING_MESSAGE: diff --git a/src/patroni.py b/src/patroni.py index be3c320ae8..c74c1a3ac0 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -129,16 +129,26 @@ def _get_alternative_patroni_url( url = self._patroni_url return url - def update_synchronous_node_count(self, units: int | None = None) -> None: + @property + def _synchronous_node_count(self) -> int: + pass + units = ( + self._charm.config.durability_synchronous_node_count + if self._charm.config.durability_synchronous_node_count + else self._members_count - 1 + ) + if units > self._members_count - 1: + units = self._members_count - 1 + return units + + def update_synchronous_node_count(self) -> None: """Update synchronous_node_count.""" - if units is None: - units = self._members_count # Try to update synchronous_node_count. for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: r = requests.patch( f"{self._patroni_url}/config", - json={"synchronous_node_count": units - 1}, + json={"synchronous_node_count": self._synchronous_node_count}, verify=self._verify, auth=self._patroni_auth, ) @@ -511,7 +521,7 @@ def render_patroni_yml_file( restore_to_latest=restore_to_latest, stanza=stanza, restore_stanza=restore_stanza, - synchronous_node_count=self._members_count - 1, + synchronous_node_count=self._synchronous_node_count, version=self.rock_postgresql_version.split(".")[0], pg_parameters=parameters, primary_cluster_endpoint=self._charm.async_replication.get_primary_cluster_endpoint(), diff --git a/tests/integration/test_config.py b/tests/integration/test_config.py index a70b3f68dc..ade8be1fd3 100644 --- a/tests/integration/test_config.py +++ b/tests/integration/test_config.py @@ -30,7 +30,7 @@ async def test_config_parameters(ops_test: OpsTest) -> None: test_string = "abcXYZ123" configs = [ - {"durability_synchronous_node_count": [0, 1]}, # config option is greater than 0 + {"durability_synchronous_node_count": ["0", "1"]}, # config option is greater than 0 { "durability_synchronous_commit": [test_string, "on"] }, # config option is one of `on`, `remote_apply` or `remote_write` diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 90f5b52da8..029eea5a0e 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -279,6 +279,9 @@ def test_on_config_changed(harness): "charm.PostgreSQLUpgrade.idle", return_value=False, new_callable=PropertyMock ) as _idle, patch("charm.PostgresqlOperatorCharm.update_config") as _update_config, + patch( + "charm.PostgresqlOperatorCharm.updated_synchronous_node_count", return_value=True + ) as _updated_synchronous_node_count, patch("charm.Patroni.member_started", return_value=True, new_callable=PropertyMock), patch("charm.Patroni.get_primary"), patch( @@ -321,6 +324,14 @@ def test_on_config_changed(harness): harness.charm._on_config_changed(mock_event) assert isinstance(harness.charm.unit.status, ActiveStatus) assert not _enable_disable_extensions.called + _updated_synchronous_node_count.assert_called_once_with() + + # Deferst on update sync nodes failure + _updated_synchronous_node_count.return_value = False + harness.charm._on_config_changed(mock_event) + mock_event.defer.assert_called_once_with() + mock_event.defer.reset_mock() + _updated_synchronous_node_count.return_value = True # Leader enables extensions with harness.hooks_disabled(): From 0b1c28998af458e4fdd10bd544b0ac82d440410f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 Nov 2024 17:48:34 +0200 Subject: [PATCH 11/22] Escape tuple --- lib/charms/postgresql_k8s/v0/postgresql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index f8d347f2df..00628047bd 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -654,7 +654,7 @@ def build_postgresql_parameters( "request", "response", "vacuum", - )) or config in ("durability_synchronous_node_count"): + )) or config in ("durability_synchronous_node_count",): continue parameter = "_".join(config.split("_")[1:]) if parameter in ["date_style", "time_zone"]: From 707a0140de97d2083906d742c0beed39ed85b8bd Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 2 Dec 2024 16:02:59 +0200 Subject: [PATCH 12/22] Add policy values --- config.yaml | 7 ++++--- lib/charms/postgresql_k8s/v0/postgresql.py | 4 ++-- src/config.py | 4 ++-- src/patroni.py | 16 +++++++++------- tests/integration/test_config.py | 5 ++++- 5 files changed, 21 insertions(+), 15 deletions(-) diff --git a/config.yaml b/config.yaml index 1f9a1cf1ad..b1aa9d6968 100644 --- a/config.yaml +++ b/config.yaml @@ -2,11 +2,12 @@ # See LICENSE file for licensing details. options: - durability_synchronous_node_count: + synchronous_node_count: description: | Sets the number of synchronous nodes to be maintained in the cluster. Should be - a positive value. - type: int + either "all", "minority", "majority" or a positive value. + type: string + default: "all" durability_synchronous_commit: description: | Sets the current transactions synchronization level. This charm allows only the diff --git a/lib/charms/postgresql_k8s/v0/postgresql.py b/lib/charms/postgresql_k8s/v0/postgresql.py index 00628047bd..4d8d6dc30c 100644 --- a/lib/charms/postgresql_k8s/v0/postgresql.py +++ b/lib/charms/postgresql_k8s/v0/postgresql.py @@ -36,7 +36,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 40 +LIBPATCH = 39 INVALID_EXTRA_USER_ROLE_BLOCKING_MESSAGE = "invalid role(s) for extra user roles" @@ -654,7 +654,7 @@ def build_postgresql_parameters( "request", "response", "vacuum", - )) or config in ("durability_synchronous_node_count",): + )): continue parameter = "_".join(config.split("_")[1:]) if parameter in ["date_style", "time_zone"]: diff --git a/src/config.py b/src/config.py index 65f85b6ae9..1c798ee7c9 100644 --- a/src/config.py +++ b/src/config.py @@ -5,7 +5,7 @@ """Structured configuration for the PostgreSQL charm.""" import logging -from typing import Optional +from typing import Literal, Optional from charms.data_platform_libs.v0.data_models import BaseConfigModel from pydantic import PositiveInt, validator @@ -16,7 +16,7 @@ class CharmConfig(BaseConfigModel): """Manager for the structured configuration.""" - durability_synchronous_node_count: Optional[PositiveInt] + synchronous_node_count: Literal["all", "minority", "majority"] | PositiveInt durability_synchronous_commit: Optional[str] instance_default_text_search_config: Optional[str] instance_password_encryption: Optional[str] diff --git a/src/patroni.py b/src/patroni.py index c74c1a3ac0..374976214c 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -131,15 +131,17 @@ def _get_alternative_patroni_url( @property def _synchronous_node_count(self) -> int: - pass - units = ( - self._charm.config.durability_synchronous_node_count - if self._charm.config.durability_synchronous_node_count + if self._charm.config.synchronous_node_count == "all": + return self._members_count - 1 + elif self._charm.config.synchronous_node_count == "minorty": + return self._members_count // 2 + elif self._charm.config.synchronous_node_count == "majority": + return self._members_count // 2 + 1 + return ( + self._charm.config.synchronous_node_count + if self._charm.config.synchronous_node_count < self._members_count - 1 else self._members_count - 1 ) - if units > self._members_count - 1: - units = self._members_count - 1 - return units def update_synchronous_node_count(self) -> None: """Update synchronous_node_count.""" diff --git a/tests/integration/test_config.py b/tests/integration/test_config.py index ade8be1fd3..89e2e92c9e 100644 --- a/tests/integration/test_config.py +++ b/tests/integration/test_config.py @@ -30,7 +30,10 @@ async def test_config_parameters(ops_test: OpsTest) -> None: test_string = "abcXYZ123" configs = [ - {"durability_synchronous_node_count": ["0", "1"]}, # config option is greater than 0 + {"synchronous_node_count": ["0", "1"]}, # config option is greater than 0 + { + "synchronous_node_count": [test_string, "all"] + }, # config option is one of `all`, `minority` or `majority` { "durability_synchronous_commit": [test_string, "on"] }, # config option is one of `on`, `remote_apply` or `remote_write` From 562dad5e414e8a72856a959c49008b27e1b5dd12 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 2 Dec 2024 18:52:25 +0200 Subject: [PATCH 13/22] Add integration test --- tests/integration/ha_tests/helpers.py | 21 +++++ .../ha_tests/test_synchronous_policy.py | 89 +++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 tests/integration/ha_tests/test_synchronous_policy.py diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index ab6b8acc70..56c4e15d2a 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -1174,3 +1174,24 @@ async def remove_unit_force(ops_test: OpsTest, num_units: int): timeout=1000, wait_for_exact_units=scale, ) + + +async def get_cluster_roles( + ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False +) -> dict[str, str | list[str] | None]: + """Returns whether the unit a replica in the cluster.""" + unit_ip = await get_unit_address(ops_test, unit_name) + members = {"replicas": [], "primaries": [], "sync_standbys": []} + member_list = get_patroni_cluster(unit_ip)["members"] + logger.info(f"Cluster members are: {member_list}") + for member in member_list: + role = member["role"] + name = "/".join(member["name"].rsplit("-", 1)) + if role == "leader": + members["primaries"].append(name) + elif role == "sync_standby": + members["sync_standbys"].append(name) + else: + members["replicas"].append(name) + + return members diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py new file mode 100644 index 0000000000..9dd61874c6 --- /dev/null +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. +import pytest +from pytest_operator.plugin import OpsTest + +from ..helpers import app_name, build_and_deploy +from .helpers import get_cluster_roles + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_build_and_deploy(ops_test: OpsTest) -> None: + """Build and deploy three unit of PostgreSQL.""" + wait_for_apps = False + # It is possible for users to provide their own cluster for HA testing. Hence, check if there + # is a pre-existing cluster. + if not await app_name(ops_test): + wait_for_apps = True + await build_and_deploy(ops_test, 3, wait_for_idle=False) + + if wait_for_apps: + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False) + + +@pytest.mark.group(1) +async def test_default_all(ops_test: OpsTest) -> None: + app = await app_name(ops_test) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(apps=[app], status="active") + + roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 2 + assert len(roles["replicas"]) == 0 + + +@pytest.mark.group(1) +async def test_minority(ops_test: OpsTest) -> None: + """Kill primary unit, check reelection.""" + app = await app_name(ops_test) + + await ops_test.model.applications[app].set_config({"synchronous_node_count": "minority"}) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(apps=[app], status="active") + + roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 1 + assert len(roles["replicas"]) == 1 + + +@pytest.mark.group(1) +async def test_majority(ops_test: OpsTest) -> None: + """Kill primary unit, check reelection.""" + app = await app_name(ops_test) + + await ops_test.model.applications[app].set_config({"synchronous_node_count": "majority"}) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(apps=[app], status="active") + + roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 2 + assert len(roles["replicas"]) == 0 + + +@pytest.mark.group(1) +async def test_constant(ops_test: OpsTest) -> None: + """Kill primary unit, check reelection.""" + app = await app_name(ops_test) + + await ops_test.model.applications[app].set_config({"synchronous_node_count": "1"}) + + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(apps=[app], status="active") + + roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 1 + assert len(roles["replicas"]) == 1 From 048e72036ed2d9ea400f5a4e45956fd19513818c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 2 Dec 2024 19:25:37 +0200 Subject: [PATCH 14/22] Fix casting --- src/patroni.py | 7 ++----- tests/integration/ha_tests/helpers.py | 2 +- tests/integration/ha_tests/test_synchronous_policy.py | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/patroni.py b/src/patroni.py index 374976214c..fe5326f149 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -137,11 +137,8 @@ def _synchronous_node_count(self) -> int: return self._members_count // 2 elif self._charm.config.synchronous_node_count == "majority": return self._members_count // 2 + 1 - return ( - self._charm.config.synchronous_node_count - if self._charm.config.synchronous_node_count < self._members_count - 1 - else self._members_count - 1 - ) + node_count = int(self._charm.config.synchronous_node_count) + return node_count if node_count < self._members_count - 1 else self._members_count - 1 def update_synchronous_node_count(self) -> None: """Update synchronous_node_count.""" diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 56c4e15d2a..d3beba2e6b 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -1177,7 +1177,7 @@ async def remove_unit_force(ops_test: OpsTest, num_units: int): async def get_cluster_roles( - ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False + ops_test: OpsTest, unit_name: str ) -> dict[str, str | list[str] | None]: """Returns whether the unit a replica in the cluster.""" unit_ip = await get_unit_address(ops_test, unit_name) diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 9dd61874c6..425d7e1d26 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -31,7 +31,7 @@ async def test_default_all(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 2 @@ -48,7 +48,7 @@ async def test_minority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 1 @@ -65,7 +65,7 @@ async def test_majority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 2 @@ -82,7 +82,7 @@ async def test_constant(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = get_cluster_roles(ops_test.model.applications[app].units[0].name) + roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 1 From 1f6518791a5342e23999a8b8f2a0ae77ce683fc4 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Mon, 2 Dec 2024 20:18:32 +0200 Subject: [PATCH 15/22] Fix test --- src/patroni.py | 9 ++++++--- .../ha_tests/test_synchronous_policy.py | 14 +++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/patroni.py b/src/patroni.py index fe5326f149..68c5dde693 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -133,12 +133,15 @@ def _get_alternative_patroni_url( def _synchronous_node_count(self) -> int: if self._charm.config.synchronous_node_count == "all": return self._members_count - 1 - elif self._charm.config.synchronous_node_count == "minorty": + elif self._charm.config.synchronous_node_count == "minority": return self._members_count // 2 elif self._charm.config.synchronous_node_count == "majority": return self._members_count // 2 + 1 - node_count = int(self._charm.config.synchronous_node_count) - return node_count if node_count < self._members_count - 1 else self._members_count - 1 + return ( + self._charm.config.synchronous_node_count + if self._charm.config.synchronous_node_count < self._members_count - 1 + else self._members_count - 1 + ) def update_synchronous_node_count(self) -> None: """Update synchronous_node_count.""" diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 425d7e1d26..bde481e7dc 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -29,9 +29,9 @@ async def test_default_all(ops_test: OpsTest) -> None: app = await app_name(ops_test) async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(apps=[app], status="active") + await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 2 @@ -46,9 +46,9 @@ async def test_minority(ops_test: OpsTest) -> None: await ops_test.model.applications[app].set_config({"synchronous_node_count": "minority"}) async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(apps=[app], status="active") + await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 1 @@ -65,7 +65,7 @@ async def test_majority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 2 @@ -80,9 +80,9 @@ async def test_constant(ops_test: OpsTest) -> None: await ops_test.model.applications[app].set_config({"synchronous_node_count": "1"}) async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(apps=[app], status="active") + await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - roles = get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) assert len(roles["primaries"]) == 1 assert len(roles["sync_standbys"]) == 1 From 88541f193f5177fb3984d2d01bf384f342a46612 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 3 Dec 2024 13:18:43 +0200 Subject: [PATCH 16/22] Update to spec --- config.yaml | 2 +- src/config.py | 2 +- src/patroni.py | 4 +- .../ha_tests/test_synchronous_policy.py | 57 +++++++++---------- 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/config.yaml b/config.yaml index b1aa9d6968..9481854b05 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,7 @@ options: synchronous_node_count: description: | Sets the number of synchronous nodes to be maintained in the cluster. Should be - either "all", "minority", "majority" or a positive value. + either "all", "majority" or a positive value. type: string default: "all" durability_synchronous_commit: diff --git a/src/config.py b/src/config.py index 1c798ee7c9..9d6d27c42f 100644 --- a/src/config.py +++ b/src/config.py @@ -16,7 +16,7 @@ class CharmConfig(BaseConfigModel): """Manager for the structured configuration.""" - synchronous_node_count: Literal["all", "minority", "majority"] | PositiveInt + synchronous_node_count: Literal["all", "majority"] | PositiveInt durability_synchronous_commit: Optional[str] instance_default_text_search_config: Optional[str] instance_password_encryption: Optional[str] diff --git a/src/patroni.py b/src/patroni.py index 68c5dde693..4d29ff69d3 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -133,10 +133,8 @@ def _get_alternative_patroni_url( def _synchronous_node_count(self) -> int: if self._charm.config.synchronous_node_count == "all": return self._members_count - 1 - elif self._charm.config.synchronous_node_count == "minority": - return self._members_count // 2 elif self._charm.config.synchronous_node_count == "majority": - return self._members_count // 2 + 1 + return self._members_count // 2 return ( self._charm.config.synchronous_node_count if self._charm.config.synchronous_node_count < self._members_count - 1 diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index bde481e7dc..5e2d6776d5 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -3,6 +3,7 @@ # See LICENSE file for licensing details. import pytest from pytest_operator.plugin import OpsTest +from tenacity import Retrying, stop_after_attempt, wait_fixed from ..helpers import app_name, build_and_deploy from .helpers import get_cluster_roles @@ -31,33 +32,19 @@ async def test_default_all(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + with attempt: + roles = await get_cluster_roles( + ops_test, ops_test.model.applications[app].units[0].name + ) - assert len(roles["primaries"]) == 1 - assert len(roles["sync_standbys"]) == 2 - assert len(roles["replicas"]) == 0 - - -@pytest.mark.group(1) -async def test_minority(ops_test: OpsTest) -> None: - """Kill primary unit, check reelection.""" - app = await app_name(ops_test) - - await ops_test.model.applications[app].set_config({"synchronous_node_count": "minority"}) - - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - - roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) - - assert len(roles["primaries"]) == 1 - assert len(roles["sync_standbys"]) == 1 - assert len(roles["replicas"]) == 1 + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 2 + assert len(roles["replicas"]) == 0 @pytest.mark.group(1) async def test_majority(ops_test: OpsTest) -> None: - """Kill primary unit, check reelection.""" app = await app_name(ops_test) await ops_test.model.applications[app].set_config({"synchronous_node_count": "majority"}) @@ -65,11 +52,15 @@ async def test_majority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + with attempt: + roles = await get_cluster_roles( + ops_test, ops_test.model.applications[app].units[0].name + ) - assert len(roles["primaries"]) == 1 - assert len(roles["sync_standbys"]) == 2 - assert len(roles["replicas"]) == 0 + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 1 + assert len(roles["replicas"]) == 1 @pytest.mark.group(1) @@ -77,13 +68,17 @@ async def test_constant(ops_test: OpsTest) -> None: """Kill primary unit, check reelection.""" app = await app_name(ops_test) - await ops_test.model.applications[app].set_config({"synchronous_node_count": "1"}) + await ops_test.model.applications[app].set_config({"synchronous_node_count": "2"}) async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - roles = await get_cluster_roles(ops_test, ops_test.model.applications[app].units[0].name) + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + with attempt: + roles = await get_cluster_roles( + ops_test, ops_test.model.applications[app].units[0].name + ) - assert len(roles["primaries"]) == 1 - assert len(roles["sync_standbys"]) == 1 - assert len(roles["replicas"]) == 1 + assert len(roles["primaries"]) == 1 + assert len(roles["sync_standbys"]) == 2 + assert len(roles["replicas"]) == 0 From 95c80deaa73461451e05fd520ca87d5bd3733663 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 3 Dec 2024 14:12:38 +0200 Subject: [PATCH 17/22] Bump retry timout --- tests/integration/ha_tests/test_synchronous_policy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 5e2d6776d5..62f04deb6b 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -32,7 +32,7 @@ async def test_default_all(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name @@ -52,7 +52,7 @@ async def test_majority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name @@ -73,7 +73,7 @@ async def test_constant(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name From a74eae164588a1518c336bccaf7a5d2d41d3a9c3 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Tue, 3 Dec 2024 15:20:35 +0200 Subject: [PATCH 18/22] Switch to planned units --- src/patroni.py | 7 ++++--- tests/integration/ha_tests/test_synchronous_policy.py | 6 +++--- tests/unit/test_patroni.py | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/patroni.py b/src/patroni.py index 4d29ff69d3..72311eb1a6 100644 --- a/src/patroni.py +++ b/src/patroni.py @@ -131,14 +131,15 @@ def _get_alternative_patroni_url( @property def _synchronous_node_count(self) -> int: + planned_units = self._charm.app.planned_units() if self._charm.config.synchronous_node_count == "all": - return self._members_count - 1 + return planned_units - 1 elif self._charm.config.synchronous_node_count == "majority": - return self._members_count // 2 + return planned_units // 2 return ( self._charm.config.synchronous_node_count if self._charm.config.synchronous_node_count < self._members_count - 1 - else self._members_count - 1 + else planned_units - 1 ) def update_synchronous_node_count(self) -> None: diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 62f04deb6b..5e2d6776d5 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -32,7 +32,7 @@ async def test_default_all(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name @@ -52,7 +52,7 @@ async def test_majority(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name @@ -73,7 +73,7 @@ async def test_constant(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active", timeout=300) - for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(10), reraise=True): + for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(5), reraise=True): with attempt: roles = await get_cluster_roles( ops_test, ops_test.model.applications[app].units[0].name diff --git a/tests/unit/test_patroni.py b/tests/unit/test_patroni.py index 8024ac5ee5..8cdd0973fc 100644 --- a/tests/unit/test_patroni.py +++ b/tests/unit/test_patroni.py @@ -216,7 +216,7 @@ def test_render_patroni_yml_file(harness, patroni): replication_password=patroni._replication_password, rewind_user=REWIND_USER, rewind_password=patroni._rewind_password, - synchronous_node_count=patroni._members_count - 1, + synchronous_node_count=0, version="14", patroni_password=patroni._patroni_password, ) @@ -251,7 +251,7 @@ def test_render_patroni_yml_file(harness, patroni): replication_password=patroni._replication_password, rewind_user=REWIND_USER, rewind_password=patroni._rewind_password, - synchronous_node_count=patroni._members_count - 1, + synchronous_node_count=0, version="14", patroni_password=patroni._patroni_password, ) @@ -467,7 +467,7 @@ def test_update_synchronous_node_count(harness, patroni): _patch.assert_called_once_with( "http://postgresql-k8s-0:8008/config", - json={"synchronous_node_count": 2}, + json={"synchronous_node_count": 0}, verify=True, auth=patroni._patroni_auth, ) From 0a5916b0473d6b3851f836895c0347a1772b8d83 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 19 Dec 2024 15:41:32 +0200 Subject: [PATCH 19/22] Fix generator --- tests/integration/ha_tests/helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 3cafd5ebc3..132b465f99 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -513,9 +513,9 @@ async def get_leader(model: Model, application_name: str) -> str: the name of the standby leader. """ status = await model.get_status() - first_unit_ip = next(list(status["applications"][application_name]["units"].values()))[ - "address" - ] + first_unit_ip = next( + unit for unit in status["applications"][application_name]["units"].values() + )["address"] cluster = get_patroni_cluster(first_unit_ip) for member in cluster["members"]: if member["role"] == "leader": From faee1242b73f46107415cbab6067889b0c7ae42d Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 14 Feb 2025 11:03:14 +0200 Subject: [PATCH 20/22] Update conf description --- config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 52c87c335a..e835ce66e3 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,7 @@ options: synchronous_node_count: description: | Sets the number of synchronous nodes to be maintained in the cluster. Should be - either "all", "majority" or a positive value. + either "all", "majority" or a positive integer value. type: string default: "all" durability_synchronous_commit: From ee6093f31193673f2b5e827a55457f32fea5db07 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 14 Feb 2025 11:37:07 +0200 Subject: [PATCH 21/22] Spread task --- tests/integration/ha_tests/test_synchronous_policy.py | 5 ----- tests/spread/test_synchronous_policy.py/task.yaml | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 tests/spread/test_synchronous_policy.py/task.yaml diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 5e2d6776d5..46e8f361b5 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -9,7 +9,6 @@ from .helpers import get_cluster_roles -@pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest) -> None: """Build and deploy three unit of PostgreSQL.""" @@ -25,7 +24,6 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False) -@pytest.mark.group(1) async def test_default_all(ops_test: OpsTest) -> None: app = await app_name(ops_test) @@ -43,7 +41,6 @@ async def test_default_all(ops_test: OpsTest) -> None: assert len(roles["replicas"]) == 0 -@pytest.mark.group(1) async def test_majority(ops_test: OpsTest) -> None: app = await app_name(ops_test) @@ -63,9 +60,7 @@ async def test_majority(ops_test: OpsTest) -> None: assert len(roles["replicas"]) == 1 -@pytest.mark.group(1) async def test_constant(ops_test: OpsTest) -> None: - """Kill primary unit, check reelection.""" app = await app_name(ops_test) await ops_test.model.applications[app].set_config({"synchronous_node_count": "2"}) diff --git a/tests/spread/test_synchronous_policy.py/task.yaml b/tests/spread/test_synchronous_policy.py/task.yaml new file mode 100644 index 0000000000..fada7cb4fb --- /dev/null +++ b/tests/spread/test_synchronous_policy.py/task.yaml @@ -0,0 +1,7 @@ +summary: test_synchronous_policy.py +environment: + TEST_MODULE: ha_tests/test_synchronous_policy.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From f628e2c23565977e8d2b50c460b2d02928a1e215 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 14 Feb 2025 13:06:06 +0200 Subject: [PATCH 22/22] Pass the charm --- tests/integration/ha_tests/test_synchronous_policy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha_tests/test_synchronous_policy.py b/tests/integration/ha_tests/test_synchronous_policy.py index 46e8f361b5..4214a4ae11 100644 --- a/tests/integration/ha_tests/test_synchronous_policy.py +++ b/tests/integration/ha_tests/test_synchronous_policy.py @@ -10,14 +10,14 @@ @pytest.mark.abort_on_fail -async def test_build_and_deploy(ops_test: OpsTest) -> None: +async def test_build_and_deploy(ops_test: OpsTest, charm) -> None: """Build and deploy three unit of PostgreSQL.""" wait_for_apps = False # It is possible for users to provide their own cluster for HA testing. Hence, check if there # is a pre-existing cluster. if not await app_name(ops_test): wait_for_apps = True - await build_and_deploy(ops_test, 3, wait_for_idle=False) + await build_and_deploy(ops_test, charm, 3, wait_for_idle=False) if wait_for_apps: async with ops_test.fast_forward():