Skip to content

Commit

Permalink
adding node id to affected node
Browse files (browse the repository at this point in the history)
  • Loading branch information
paigerube14 authored and chaitanyaenr committed Feb 4, 2025
1 parent 21ab8d4 commit 2a42a2d
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 15 deletions.
4 changes: 4 additions & 0 deletions krkn/scenario_plugins/node_actions/alibaba_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_start_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, vm_id)
)
Expand All @@ -263,6 +264,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_stop_scenario injection")
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, vm_id)
)
Expand All @@ -289,6 +291,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
"Starting node_termination_scenario injection by first stopping instance"
)
vm_id = self.alibaba.get_instance_id(node)
affected_node.node_id = vm_id
self.alibaba.stop_instances(vm_id)
self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
logging.info(
Expand Down Expand Up @@ -316,6 +319,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_reboot_scenario injection")
instance_id = self.alibaba.get_instance_id(node)
affected_node.node_id = instance_id
logging.info("Rebooting the node with instance ID: %s " % (instance_id))
self.alibaba.reboot_instances(instance_id)
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
Expand Down
4 changes: 4 additions & 0 deletions krkn/scenario_plugins/node_actions/aws_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_start_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, instance_id)
)
Expand Down Expand Up @@ -299,6 +300,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_stop_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, instance_id)
)
Expand All @@ -325,6 +327,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_termination_scenario injection")
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Terminating the node %s with instance ID: %s "
% (node, instance_id)
Expand Down Expand Up @@ -358,6 +361,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_reboot_scenario injection" + str(node))
instance_id = self.aws.get_instance_id(node)
affected_node.node_id = instance_id
logging.info(
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
)
Expand Down
6 changes: 4 additions & 2 deletions krkn/scenario_plugins/node_actions/az_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_start_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)

affected_node.node_id = vm_name
logging.info(
"Starting the node %s with instance ID: %s "
% (vm_name, resource_group)
Expand All @@ -197,6 +197,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_stop_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Stopping the node %s with instance ID: %s "
% (vm_name, resource_group)
Expand All @@ -221,8 +222,8 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_termination_scenario injection")
affected_node = AffectedNode(node)
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Terminating the node %s with instance ID: %s "
% (vm_name, resource_group)
Expand Down Expand Up @@ -257,6 +258,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_reboot_scenario injection")
vm_name, resource_group = self.azure.get_instance_id(node)
affected_node.node_id = vm_name
logging.info(
"Rebooting the node %s with instance ID: %s "
% (vm_name, resource_group)
Expand Down
20 changes: 16 additions & 4 deletions krkn/scenario_plugins/node_actions/bm_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,20 +109,28 @@ def reboot_instances(self, bmc_addr, node_name):
self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()

# Wait until the node instance is running
def wait_until_running(self, bmc_addr, node_name):
def wait_until_running(self, bmc_addr, node_name, affected_node):
start_time = time.time()
while (
not self.get_ipmi_connection(bmc_addr, node_name)
.get_chassis_status()
.power_on
):
time.sleep(1)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("running", end_time - start_time)

# Wait until the node instance is stopped
def wait_until_stopped(self, bmc_addr, node_name):
def wait_until_stopped(self, bmc_addr, node_name, affected_node):
start_time = time.time()
while (
self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on
):
time.sleep(1)
end_time = time.time()
if affected_node:
affected_node.set_affected_node_status("stopped", end_time - start_time)


# krkn_lib
Expand All @@ -134,15 +142,17 @@ def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes,affected_nodes
# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
affected_node = AffectedNode(node)
try:
logging.info("Starting node_start_scenario injection")
bmc_addr = self.bm.get_bmc_addr(node)
affected_node.node_id = bmc_addr
logging.info(
"Starting the node %s with bmc address: %s " % (node, bmc_addr)
)
self.bm.start_instances(bmc_addr, node)
self.bm.wait_until_running(bmc_addr, node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
self.bm.wait_until_running(bmc_addr, node, affected_node)
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
logging.info(
"Node with bmc address: %s is in running state" % (bmc_addr)
)
Expand All @@ -155,6 +165,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
)
logging.error("node_start_scenario injection failed!")
raise e
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to stop the node
def node_stop_scenario(self, instance_kill_count, node, timeout):
Expand All @@ -163,6 +174,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_stop_scenario injection")
bmc_addr = self.bm.get_bmc_addr(node)
affected_node.node_id = bmc_addr
logging.info(
"Stopping the node %s with bmc address: %s " % (node, bmc_addr)
)
Expand Down
2 changes: 2 additions & 0 deletions krkn/scenario_plugins/node_actions/docker_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_start_scenario injection")
container_id = self.docker.get_container_id(node)
affected_node.node_id = container_id
logging.info(
"Starting the node %s with container ID: %s " % (node, container_id)
)
Expand All @@ -74,6 +75,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
try:
logging.info("Starting node_stop_scenario injection")
container_id = self.docker.get_container_id(node)
affected_node.node_id = container_id
logging.info(
"Stopping the node %s with container ID: %s " % (node, container_id)
)
Expand Down
6 changes: 4 additions & 2 deletions krkn/scenario_plugins/node_actions/gcp_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
logging.info("Starting node_start_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Starting the node %s with instance ID: %s " % (node, instance_id)
)
Expand All @@ -252,7 +253,6 @@ def node_start_scenario(self, instance_kill_count, node, timeout):
logging.error("node_start_scenario injection failed!")

raise RuntimeError()
logging.info("started affected node" + str(affected_node.to_json()))
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to stop the node
Expand All @@ -263,6 +263,7 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
logging.info("Starting node_stop_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Stopping the node %s with instance ID: %s " % (node, instance_id)
)
Expand All @@ -280,7 +281,6 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
logging.error("node_stop_scenario injection failed!")

raise RuntimeError()
logging.info("stopedd affected node" + str(affected_node.to_json()))
self.affected_nodes_status.affected_nodes.append(affected_node)

# Node scenario to terminate the node
Expand All @@ -291,6 +291,7 @@ def node_termination_scenario(self, instance_kill_count, node, timeout):
logging.info("Starting node_termination_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Terminating the node %s with instance ID: %s "
% (node, instance_id)
Expand Down Expand Up @@ -325,6 +326,7 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
logging.info("Starting node_reboot_scenario injection")
instance = self.gcp.get_node_instance(node)
instance_id = self.gcp.get_instance_name(instance)
affected_node.node_id = instance_id
logging.info(
"Rebooting the node %s with instance ID: %s " % (node, instance_id)
)
Expand Down
17 changes: 10 additions & 7 deletions krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud

import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction

from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode

class ShutDownScenarioPlugin(AbstractScenarioPlugin):
Expand All @@ -38,7 +40,7 @@ def run(
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
)

scenario_telemetry.affected_nodes = affected_nodes_status
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
end_time = int(time.time())
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
return 0
Expand All @@ -56,7 +58,6 @@ def multiprocess_nodes(self, cloud_object_function, nodes, processes=0):
pool = ThreadPool(processes=len(nodes))
else:
pool = ThreadPool(processes=processes)
logging.info("nodes type " + str(type(nodes[0])))
if type(nodes[0]) is tuple:
node_id = []
node_info = []
Expand Down Expand Up @@ -105,9 +106,8 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_
node_id = []
for node in nodes:
instance_id = cloud_object.get_instance_id(node)
affected_nodes_status.affected_nodes.append(AffectedNode(node))
affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id))
node_id.append(instance_id)
logging.info("node id list " + str(node_id))
for _ in range(runs):
logging.info("Starting cluster_shut_down scenario injection")
stopping_nodes = set(node_id)
Expand All @@ -117,8 +117,7 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_
while len(stopping_nodes) > 0:
for node in stopping_nodes:
affected_node = affected_nodes_status.get_affected_node_index(node)
# need to add in time that is passing while waiting for other nodes to be stopped
affected_node.set_cloud_stopping_time(time.time() - start_time)

if type(node) is tuple:
node_status = cloud_object.wait_until_stopped(
node[1], node[0], timeout, affected_node
Expand All @@ -129,6 +128,8 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_
# Only want to remove node from stopping list
# when fully stopped/no error
if node_status:
# need to add in time that is passing while waiting for other nodes to be stopped
affected_node.set_cloud_stopping_time(time.time() - start_time)
stopped_nodes.remove(node)

stopping_nodes = stopped_nodes.copy()
Expand All @@ -148,16 +149,18 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_
for node in not_running_nodes:
affected_node = affected_nodes_status.get_affected_node_index(node)
# need to add in time that is passing while waiting for other nodes to be running
affected_node.set_cloud_running_time(time.time() - start_time)

if type(node) is tuple:
node_status = cloud_object.wait_until_running(
node[1], node[0], timeout, affected_node
)
else:
node_status = cloud_object.wait_until_running(node, timeout, affected_node)
if node_status:
affected_node.set_cloud_running_time(time.time() - start_time)
restarted_nodes.remove(node)
not_running_nodes = restarted_nodes.copy()

logging.info("Waiting for 150s to allow cluster component initialization")
time.sleep(150)

Expand Down

0 comments on commit 2a42a2d

Please sign in to comment.