diff --git a/kubetool/core/group.py b/kubetool/core/group.py
index 6961485c1..fc117b834 100755
--- a/kubetool/core/group.py
+++ b/kubetool/core/group.py
@@ -713,8 +713,10 @@ def get_last_member(self, provide_node_configs=False, apply_filter=None):
         return self.get_member(-1, provide_node_configs=provide_node_configs, apply_filter=apply_filter)
 
     def get_any_member(self, provide_node_configs=False, apply_filter=None):
-        return random.choice(self.get_ordered_members_list(provide_node_configs=provide_node_configs,
-                                                           apply_filter=apply_filter))
+        member = random.choice(self.get_ordered_members_list(provide_node_configs=provide_node_configs,
+                                                              apply_filter=apply_filter))
+        self.cluster.log.verbose(f'Selected node {str(member)}')
+        return member
 
     def get_member_by_name(self, name, provide_node_configs=False):
         return self.get_first_member(provide_node_configs=provide_node_configs, apply_filter={"name": name})
diff --git a/kubetool/etcd.py b/kubetool/etcd.py
index 3aeffda72..d4d31ce69 100644
--- a/kubetool/etcd.py
+++ b/kubetool/etcd.py
@@ -1,6 +1,10 @@
 import io
 import json
 import time
+
+import fabric.connection
+
+from kubetool.core.cluster import KubernetesCluster
 from kubetool.core.group import NodeGroup
 
 
@@ -37,10 +41,12 @@ def remove_members(group: NodeGroup):
         else:
             log.verbose(f"Skipping {node_name} as it is not among etcd members.")
 
 
-# the method checks etcd endpoints health until all endpoints are healthy or retries are exhausted
-# if all member are healthy the method checks the leader
-def wait_for_health(cluster, connection):
+def wait_for_health(cluster: KubernetesCluster, connection: fabric.connection.Connection) -> list[dict]:
+    """
+    Check etcd endpoints health until all endpoints are healthy or retries are exhausted.
+    If all members are healthy, also check that a leader is elected.
+    """
     log = cluster.log
     init_timeout = cluster.globals['etcd']['health']['init_timeout']
     timeout = cluster.globals['etcd']['health']['timeout']
@@ -50,8 +56,8 @@ def wait_for_health(cluster, connection):
     time.sleep(init_timeout)
     while retries > 0:
         start_time = time.time()
-        etcd_health_raw = connection.sudo('etcdctl endpoint health --cluster -w json'
-                                          , is_async=False, hide=True).get_simple_out()
+        etcd_health_raw = connection.sudo('etcdctl endpoint health --cluster -w json',
+                                          is_async=False, hide=True).get_simple_out()
         end_time = time.time()
         sudo_time = int(end_time - start_time)
         log.verbose(etcd_health_raw)
@@ -73,8 +79,8 @@ def wait_for_health(cluster, connection):
             retries -= 1
 
     if is_healthy:
-        etcd_status_raw = connection.sudo('etcdctl endpoint status --cluster -w json'
-                                          , is_async=False, hide=True).get_simple_out()
+        etcd_status_raw = connection.sudo('etcdctl endpoint status --cluster -w json',
+                                          is_async=False, hide=True).get_simple_out()
         log.verbose(etcd_status_raw)
         etcd_status_list = json.load(io.StringIO(etcd_status_raw.lower().strip()))
         elected_leader = None
@@ -91,3 +97,5 @@ def wait_for_health(cluster, connection):
         raise Exception('ETCD cluster is still not healthy!')
 
     log.verbose('ETCD cluster is healthy!')
+
+    return etcd_status_list
diff --git a/kubetool/procedures/restore.py b/kubetool/procedures/restore.py
index 890571973..da56a1ecd 100755
--- a/kubetool/procedures/restore.py
+++ b/kubetool/procedures/restore.py
@@ -211,8 +211,7 @@ def import_etcd(cluster: KubernetesCluster):
 
     # After restore check db size equal, cluster health and leader elected
     # Checks should be changed
-    master_conn = cluster.nodes['master'].get_first_member()
-    etcd.wait_for_health(cluster, cluster.nodes['master'])
+    cluster_status = etcd.wait_for_health(cluster, cluster.nodes['master'].get_any_member())
 
     # Check DB size is correct
     backup_source = cluster.context['backup_descriptor'].get('etcd', {}).get('source')