MANOPD-70594 Fix etcd health validation issue (#64)
iLeonidze authored Nov 24, 2021
1 parent 5731432 commit 771bae0
Showing 3 changed files with 20 additions and 11 deletions.
6 changes: 4 additions & 2 deletions kubetool/core/group.py
@@ -713,8 +713,10 @@ def get_last_member(self, provide_node_configs=False, apply_filter=None):
return self.get_member(-1, provide_node_configs=provide_node_configs, apply_filter=apply_filter)

def get_any_member(self, provide_node_configs=False, apply_filter=None):
return random.choice(self.get_ordered_members_list(provide_node_configs=provide_node_configs,
apply_filter=apply_filter))
member = random.choice(self.get_ordered_members_list(provide_node_configs=provide_node_configs,
apply_filter=apply_filter))
self.cluster.log.verbose(f'Selected node {str(member)}')
return member

def get_member_by_name(self, name, provide_node_configs=False):
return self.get_first_member(provide_node_configs=provide_node_configs, apply_filter={"name": name})
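For illustration, the same pattern outside of kubetool: pick a random member, but make the choice traceable in the logs so that a failure on that node can be tied back to this selection. This is a standalone sketch with hypothetical names, not the NodeGroup API.

import logging
import random

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("cluster")

def get_any_member(members):
    # Choose one member at random, but record which node was picked so that
    # any follow-up failure on that node can be traced through the logs.
    member = random.choice(members)
    log.debug("Selected node %s", member)
    return member

node = get_any_member(["master-1", "master-2", "master-3"])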
22 changes: 15 additions & 7 deletions kubetool/etcd.py
@@ -1,6 +1,10 @@
import io
import json
import time

import fabric.connection

from kubetool.core.cluster import KubernetesCluster
from kubetool.core.group import NodeGroup


@@ -37,10 +41,12 @@ def remove_members(group: NodeGroup):
else:
log.verbose(f"Skipping {node_name} as it is not among etcd members.")

# the method checks etcd endpoints health until all endpoints are healthy or retries are exhausted
# if all member are healthy the method checks the leader
def wait_for_health(cluster, connection):

def wait_for_health(cluster: KubernetesCluster, connection: fabric.connection.Connection) -> list[dict]:
"""
The method checks etcd endpoint health until all endpoints are healthy or retries are exhausted;
if all members are healthy, the method checks the leader.
"""
log = cluster.log
init_timeout = cluster.globals['etcd']['health']['init_timeout']
timeout = cluster.globals['etcd']['health']['timeout']
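The docstring above describes a bounded retry loop: poll endpoint health, compare the number of healthy answers against the expected member count, and give up after a fixed number of retries. A minimal sketch of that shape, assuming a generic check_health() callable instead of the real etcdctl invocation and made-up default timings:

import time

def wait_until_healthy(check_health, expected_members, retries=10, timeout=5, init_timeout=30):
    # Give etcd some time to settle before the first poll.
    time.sleep(init_timeout)
    while retries > 0:
        healthy_count = check_health()  # assumed to return the number of healthy endpoints
        if healthy_count == expected_members:
            return True
        time.sleep(timeout)
        retries -= 1
    raise Exception('ETCD cluster is still not healthy!')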
@@ -50,8 +56,8 @@ def wait_for_health(cluster, connection):
time.sleep(init_timeout)
while retries > 0:
start_time = time.time()
etcd_health_raw = connection.sudo('etcdctl endpoint health --cluster -w json'
, is_async=False, hide=True).get_simple_out()
etcd_health_raw = connection.sudo('etcdctl endpoint health --cluster -w json',
is_async=False, hide=True).get_simple_out()
end_time = time.time()
sudo_time = int(end_time - start_time)
log.verbose(etcd_health_raw)
@@ -73,8 +79,8 @@ def wait_for_health(cluster, connection):
retries -= 1

if is_healthy:
etcd_status_raw = connection.sudo('etcdctl endpoint status --cluster -w json'
, is_async=False, hide=True).get_simple_out()
etcd_status_raw = connection.sudo('etcdctl endpoint status --cluster -w json',
is_async=False, hide=True).get_simple_out()
log.verbose(etcd_status_raw)
etcd_status_list = json.load(io.StringIO(etcd_status_raw.lower().strip()))
elected_leader = None
@@ -91,3 +97,5 @@ def wait_for_health(cluster, connection):
raise Exception('ETCD cluster is still not healthy!')

log.verbose('ETCD cluster is healthy!')

return etcd_status_list
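Since wait_for_health now returns the parsed `etcdctl endpoint status` list, callers can inspect it further, for example to find the elected leader. A hedged sketch of that lookup, assuming the usual etcdctl JSON fields ("status"/"leader" once the raw output has been lowercased, as kubetool does before parsing):

import io
import json

def find_leader(etcd_status_raw):
    # etcdctl endpoint status -w json typically yields entries like
    # [{"Endpoint": "https://10.0.0.1:2379", "Status": {"leader": 123, "dbSize": 456, ...}}, ...]
    # The raw string is lowercased before parsing, so keys are matched in lower case.
    etcd_status_list = json.load(io.StringIO(etcd_status_raw.lower().strip()))
    for item in etcd_status_list:
        leader = item.get('status', {}).get('leader')
        if leader:
            return leader
    return None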
3 changes: 1 addition & 2 deletions kubetool/procedures/restore.py
@@ -211,8 +211,7 @@ def import_etcd(cluster: KubernetesCluster):

# After restore, check that the DB size matches, the cluster is healthy and a leader is elected
# These checks should be improved
master_conn = cluster.nodes['master'].get_first_member()
etcd.wait_for_health(cluster, cluster.nodes['master'])
cluster_status = etcd.wait_for_health(cluster, cluster.nodes['master'].get_any_member())

# Check DB size is correct
backup_source = cluster.context['backup_descriptor'].get('etcd', {}).get('source')
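With the status list now available in the restore procedure, the DB-size verification mentioned in the comment above can work directly on it. A rough sketch under the same lowercased-JSON assumption (check_db_size is a hypothetical helper, not the actual kubetool check):

def check_db_size(cluster_status, expected_size, tolerance=0.1):
    # Every member should report a dbsize reasonably close to the restored backup size.
    for item in cluster_status:
        db_size = item.get('status', {}).get('dbsize', 0)
        if abs(db_size - expected_size) > expected_size * tolerance:
            raise Exception(f"etcd DB size mismatch on {item.get('endpoint')}: {db_size}")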
