From 2a42a2dc3112a96aca9373425969363ea3330b21 Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Mon, 3 Feb 2025 12:50:22 -0500 Subject: [PATCH] adding node id to affected node --- .../node_actions/alibaba_node_scenarios.py | 4 ++++ .../node_actions/aws_node_scenarios.py | 4 ++++ .../node_actions/az_node_scenarios.py | 6 ++++-- .../node_actions/bm_node_scenarios.py | 20 +++++++++++++++---- .../node_actions/docker_node_scenarios.py | 2 ++ .../node_actions/gcp_node_scenarios.py | 6 ++++-- .../shut_down/shut_down_scenario_plugin.py | 17 +++++++++------- 7 files changed, 44 insertions(+), 15 deletions(-) diff --git a/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py index 8a546053..4d401316 100644 --- a/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py @@ -239,6 +239,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_start_scenario injection") vm_id = self.alibaba.get_instance_id(node) + affected_node.node_id = vm_id logging.info( "Starting the node %s with instance ID: %s " % (node, vm_id) ) @@ -263,6 +264,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_stop_scenario injection") vm_id = self.alibaba.get_instance_id(node) + affected_node.node_id = vm_id logging.info( "Stopping the node %s with instance ID: %s " % (node, vm_id) ) @@ -289,6 +291,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Starting node_termination_scenario injection by first stopping instance" ) vm_id = self.alibaba.get_instance_id(node) + affected_node.node_id = vm_id self.alibaba.stop_instances(vm_id) self.alibaba.wait_until_stopped(vm_id, timeout, affected_node) logging.info( @@ -316,6 +319,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_reboot_scenario injection") instance_id = self.alibaba.get_instance_id(node) + affected_node.node_id = instance_id logging.info("Rebooting the node with instance ID: %s " % (instance_id)) self.alibaba.reboot_instances(instance_id) nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node) diff --git a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py index 205869ba..1ac7c55e 100644 --- a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py @@ -272,6 +272,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_start_scenario injection") instance_id = self.aws.get_instance_id(node) + affected_node.node_id = instance_id logging.info( "Starting the node %s with instance ID: %s " % (node, instance_id) ) @@ -299,6 +300,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_stop_scenario injection") instance_id = self.aws.get_instance_id(node) + affected_node.node_id = instance_id logging.info( "Stopping the node %s with instance ID: %s " % (node, instance_id) ) @@ -325,6 +327,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_termination_scenario injection") instance_id = self.aws.get_instance_id(node) + affected_node.node_id = instance_id logging.info( "Terminating the node %s with instance ID: %s " % (node, instance_id) @@ -358,6 +361,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_reboot_scenario injection" + str(node)) instance_id = self.aws.get_instance_id(node) + affected_node.node_id = instance_id logging.info( "Rebooting the node %s with instance ID: %s " % (node, instance_id) ) diff --git a/krkn/scenario_plugins/node_actions/az_node_scenarios.py b/krkn/scenario_plugins/node_actions/az_node_scenarios.py index 156a4bb4..65344623 100644 --- a/krkn/scenario_plugins/node_actions/az_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/az_node_scenarios.py @@ -170,7 +170,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_start_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) - + affected_node.node_id = vm_name logging.info( "Starting the node %s with instance ID: %s " % (vm_name, resource_group) @@ -197,6 +197,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_stop_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) + affected_node.node_id = vm_name logging.info( "Stopping the node %s with instance ID: %s " % (vm_name, resource_group) @@ -221,8 +222,8 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): affected_node = AffectedNode(node) try: logging.info("Starting node_termination_scenario injection") - affected_node = AffectedNode(node) vm_name, resource_group = self.azure.get_instance_id(node) + affected_node.node_id = vm_name logging.info( "Terminating the node %s with instance ID: %s " % (vm_name, resource_group) @@ -257,6 +258,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_reboot_scenario injection") vm_name, resource_group = self.azure.get_instance_id(node) + affected_node.node_id = vm_name logging.info( "Rebooting the node %s with instance ID: %s " % (vm_name, resource_group) diff --git a/krkn/scenario_plugins/node_actions/bm_node_scenarios.py b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py index 4a9a4eb1..fa8ec24b 100644 --- a/krkn/scenario_plugins/node_actions/bm_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/bm_node_scenarios.py @@ -109,20 +109,28 @@ def reboot_instances(self, bmc_addr, node_name): self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle() # Wait until the node instance is running - def wait_until_running(self, bmc_addr, node_name): + def wait_until_running(self, bmc_addr, node_name, affected_node): + start_time = time.time() while ( not self.get_ipmi_connection(bmc_addr, node_name) .get_chassis_status() .power_on ): time.sleep(1) + end_time = time.time() + if affected_node: + affected_node.set_affected_node_status("running", end_time - start_time) # Wait until the node instance is stopped - def wait_until_stopped(self, bmc_addr, node_name): + def wait_until_stopped(self, bmc_addr, node_name, affected_node): + start_time = time.time() while ( self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on ): time.sleep(1) + end_time = time.time() + if affected_node: + affected_node.set_affected_node_status("stopped", end_time - start_time) # krkn_lib @@ -134,15 +142,17 @@ def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes,affected_nodes # Node scenario to start the node def node_start_scenario(self, instance_kill_count, node, timeout): for _ in range(instance_kill_count): + affected_node = AffectedNode(node) try: logging.info("Starting node_start_scenario injection") bmc_addr = self.bm.get_bmc_addr(node) + affected_node.node_id = bmc_addr logging.info( "Starting the node %s with bmc address: %s " % (node, bmc_addr) ) self.bm.start_instances(bmc_addr, node) - self.bm.wait_until_running(bmc_addr, node) - nodeaction.wait_for_ready_status(node, timeout, self.kubecli) + self.bm.wait_until_running(bmc_addr, node, affected_node) + nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node) logging.info( "Node with bmc address: %s is in running state" % (bmc_addr) ) @@ -155,6 +165,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): ) logging.error("node_start_scenario injection failed!") raise e + self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -163,6 +174,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_stop_scenario injection") bmc_addr = self.bm.get_bmc_addr(node) + affected_node.node_id = bmc_addr logging.info( "Stopping the node %s with bmc address: %s " % (node, bmc_addr) ) diff --git a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py index 2e050b6e..1730656d 100644 --- a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py @@ -49,6 +49,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_start_scenario injection") container_id = self.docker.get_container_id(node) + affected_node.node_id = container_id logging.info( "Starting the node %s with container ID: %s " % (node, container_id) ) @@ -74,6 +75,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): try: logging.info("Starting node_stop_scenario injection") container_id = self.docker.get_container_id(node) + affected_node.node_id = container_id logging.info( "Stopping the node %s with container ID: %s " % (node, container_id) ) diff --git a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py index ec39538d..540cd009 100644 --- a/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/gcp_node_scenarios.py @@ -234,6 +234,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout): logging.info("Starting node_start_scenario injection") instance = self.gcp.get_node_instance(node) instance_id = self.gcp.get_instance_name(instance) + affected_node.node_id = instance_id logging.info( "Starting the node %s with instance ID: %s " % (node, instance_id) ) @@ -252,7 +253,6 @@ def node_start_scenario(self, instance_kill_count, node, timeout): logging.error("node_start_scenario injection failed!") raise RuntimeError() - logging.info("started affected node" + str(affected_node.to_json())) self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to stop the node @@ -263,6 +263,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): logging.info("Starting node_stop_scenario injection") instance = self.gcp.get_node_instance(node) instance_id = self.gcp.get_instance_name(instance) + affected_node.node_id = instance_id logging.info( "Stopping the node %s with instance ID: %s " % (node, instance_id) ) @@ -280,7 +281,6 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): logging.error("node_stop_scenario injection failed!") raise RuntimeError() - logging.info("stopedd affected node" + str(affected_node.to_json())) self.affected_nodes_status.affected_nodes.append(affected_node) # Node scenario to terminate the node @@ -291,6 +291,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): logging.info("Starting node_termination_scenario injection") instance = self.gcp.get_node_instance(node) instance_id = self.gcp.get_instance_name(instance) + affected_node.node_id = instance_id logging.info( "Terminating the node %s with instance ID: %s " % (node, instance_id) @@ -325,6 +326,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): logging.info("Starting node_reboot_scenario injection") instance = self.gcp.get_node_instance(node) instance_id = self.gcp.get_instance_name(instance) + affected_node.node_id = instance_id logging.info( "Rebooting the node %s with instance ID: %s " % (node, instance_id) ) diff --git a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py index 8c7ec751..1ddd4242 100644 --- a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py +++ b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py @@ -15,6 +15,8 @@ from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud +import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction + from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode class ShutDownScenarioPlugin(AbstractScenarioPlugin): @@ -38,7 +40,7 @@ def run( shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status ) - scenario_telemetry.affected_nodes = affected_nodes_status + scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes end_time = int(time.time()) cerberus.publish_kraken_status(krkn_config, [], start_time, end_time) return 0 @@ -56,7 +58,6 @@ def multiprocess_nodes(self, cloud_object_function, nodes, processes=0): pool = ThreadPool(processes=len(nodes)) else: pool = ThreadPool(processes=processes) - logging.info("nodes type " + str(type(nodes[0]))) if type(nodes[0]) is tuple: node_id = [] node_info = [] @@ -105,9 +106,8 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_ node_id = [] for node in nodes: instance_id = cloud_object.get_instance_id(node) - affected_nodes_status.affected_nodes.append(AffectedNode(node)) + affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id)) node_id.append(instance_id) - logging.info("node id list " + str(node_id)) for _ in range(runs): logging.info("Starting cluster_shut_down scenario injection") stopping_nodes = set(node_id) @@ -117,8 +117,7 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_ while len(stopping_nodes) > 0: for node in stopping_nodes: affected_node = affected_nodes_status.get_affected_node_index(node) - # need to add in time that is passing while waiting for other nodes to be stopped - affected_node.set_cloud_stopping_time(time.time() - start_time) + if type(node) is tuple: node_status = cloud_object.wait_until_stopped( node[1], node[0], timeout, affected_node @@ -129,6 +128,8 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_ # Only want to remove node from stopping list # when fully stopped/no error if node_status: + # need to add in time that is passing while waiting for other nodes to be stopped + affected_node.set_cloud_stopping_time(time.time() - start_time) stopped_nodes.remove(node) stopping_nodes = stopped_nodes.copy() @@ -148,7 +149,7 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_ for node in not_running_nodes: affected_node = affected_nodes_status.get_affected_node_index(node) # need to add in time that is passing while waiting for other nodes to be running - affected_node.set_cloud_running_time(time.time() - start_time) + if type(node) is tuple: node_status = cloud_object.wait_until_running( node[1], node[0], timeout, affected_node @@ -156,8 +157,10 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_ else: node_status = cloud_object.wait_until_running(node, timeout, affected_node) if node_status: + affected_node.set_cloud_running_time(time.time() - start_time) restarted_nodes.remove(node) not_running_nodes = restarted_nodes.copy() + logging.info("Waiting for 150s to allow cluster component initialization") time.sleep(150)