Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(robot): add drain cases from manual test cases #2116

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Library ../libs/keywords/setting_keywords.py
Library ../libs/keywords/backupstore_keywords.py
Library ../libs/keywords/backup_keywords.py
Library ../libs/keywords/sharemanager_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Keywords ***
Set test environment
Expand All @@ -35,6 +36,7 @@ Set test environment
END

Cleanup test resources
uncordon_all_nodes
cleanup_control_plane_network_latency
reset_node_schedule
cleanup_node_exec
Expand Down
5 changes: 5 additions & 0 deletions e2e/keywords/host.resource
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@ Restart cluster
Power on off node
    [Documentation]    Power the previously powered-off node back on, continuing
    ...    on failure so the rest of cleanup still runs.
    Run keyword And Continue On Failure
    ...    power_on_node_by_name    ${powered_off_node}

Power off node ${node_id}
    [Documentation]    Power off the host backing node ${node_id} and record it
    ...    in ${powered_off_node} so it can be powered back on during cleanup.
    ${powered_off_node} =    get_node_by_index    ${node_id}
    power_off_node_by_name    ${powered_off_node}
    Set Test Variable    ${powered_off_node}
61 changes: 60 additions & 1 deletion e2e/keywords/k8s.resource
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ Library ../libs/keywords/k8s_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/host_keywords.py
Library ../libs/keywords/node_keywords.py

*** Variables ***

${DRAIN_TIMEOUT} 90

*** Keywords ***
Stop volume node kubelet of ${workload_kind} ${workload_id} for ${duration} seconds
Expand Down Expand Up @@ -51,5 +52,63 @@ Force drain volume of ${workload_kind} ${workload_id} replica node
Set Test Variable ${drained_node}
Set Test Variable ${last_volume_node}

Force drain node ${node_id}
    [Documentation]    Force-drain the node with the given index.
    ${node_name} =    get_node_by_index    ${node_id}
    force_drain_node    ${node_name}

Drain volume of ${workload_kind} ${workload_id} volume node
    [Documentation]    Drain the node currently hosting the workload's volume and
    ...    wait until all pods are evicted.  Exposes ${drained_node} and
    ...    ${last_volume_node} as test variables for follow-up keywords.
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    ${drained_node} =    get_volume_node    ${volume_name}
    ${last_volume_node} =    get_volume_node    ${volume_name}
    drain_node    ${drained_node}
    wait_for_all_pods_evicted    ${drained_node}
    Set Test Variable    ${drained_node}
    Set Test Variable    ${last_volume_node}

Uncordon the drained node
    [Documentation]    Re-enable scheduling on the node drained earlier in the test.
    uncordon_node    ${drained_node}

Cordon ${workload_kind} ${workload_id} volume node
    [Documentation]    Cordon the node hosting the workload's volume, then verify
    ...    Longhorn reports the node as not schedulable.
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    ${volume_node} =    get_volume_node    ${volume_name}
    cordon_node    ${volume_node}
    check_node_is_not_schedulable    ${volume_node}

Force drain all nodes
    [Documentation]    Force-drain worker nodes 0 through 2 in turn, waiting for
    ...    each node's pods to be evicted before moving on.
    FOR    ${node_id}    IN RANGE    0    3
        ${node_name} =    get_node_by_index    ${node_id}
        force_drain_node    ${node_name}
        wait_for_all_pods_evicted    ${node_name}
    END

Check node ${node_id} cordoned
    [Documentation]    Assert the node with the given index is cordoned.
    ${node_name} =    get_node_by_index    ${node_id}
    check_node_cordoned    ${node_name}

Force drain node ${node_id} and expect failure
    [Documentation]    Force-drain the node expecting the drain to fail (e.g.
    ...    blocked by an instance-manager PDB).  Records ${drained_node} and
    ...    ${instance_manager_name} for later verification.
    ${drained_node} =    get_node_by_index    ${node_id}
    ${instance_manager_name} =    get_instance_manager_on_node    ${drained_node}
    Run Keyword And Expect Error    *    force_drain_node    ${drained_node}
    Set Test Variable    ${instance_manager_name}
    Set Test Variable    ${drained_node}

Force drain node ${node_id} and expect success
    [Documentation]    Force-drain the node expecting it to complete.  Records
    ...    ${drained_node} and ${instance_manager_name} for later verification.
    ${drained_node} =    get_node_by_index    ${node_id}
    ${instance_manager_name} =    get_instance_manager_on_node    ${drained_node}
    force_drain_node    ${drained_node}
    Set Test Variable    ${instance_manager_name}
    Set Test Variable    ${drained_node}

The drain process not completed
    [Documentation]    Assert the background drain process is still running.
    check_drain_process_not_completed    ${drain_process}

The drain process completed
    [Documentation]    Wait for all pods to be evicted from the drained node,
    ...    then assert the background drain process finished.
    wait_for_all_pods_evicted    ${drained_node}
    check_drain_process_completed    ${drain_process}

Check PDB not exist
    [Documentation]    Assert no PodDisruptionBudget remains for the given
    ...    instance manager.
    [Arguments]    ${instance_manager}
    check_instance_manager_pdb_not_exist    ${instance_manager}

10 changes: 10 additions & 0 deletions e2e/keywords/longhorn.resource
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Longhorn Keywords

Library ../libs/keywords/instancemanager_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Variables ***
@{longhorn_workloads}
Expand Down Expand Up @@ -43,3 +44,12 @@ Check Longhorn workload pods ${condition} annotated with ${key}
Run Keyword IF '${condition}' == 'not' Should Not Be True ${is_annotated}
... ELSE IF '${condition}' == 'is' Should Be True ${is_annotated}
... ELSE Fail Invalid condition ${condition}

Check instance-manager pod is not running on drained node
    [Documentation]    Assert no instance-manager pod remains on ${drained_node}.
    ${pod} =    get_instance_manager_on_node    ${drained_node}
    Should Be Equal    ${pod}    ${None}

Check instance-manager pod is running on node ${node_id}
    [Documentation]    Assert an instance-manager pod exists on the given node.
    ${node_name} =    get_node_by_index    ${node_id}
    ${pod} =    get_instance_manager_on_node    ${node_name}
    Should Not Be Equal    ${pod}    ${None}
6 changes: 6 additions & 0 deletions e2e/keywords/volume.resource
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ Wait volume ${volume_id} replica on node ${node_id} stopped
${node_name} = get_node_by_index ${node_id}
wait_for_replica_stopped ${volume_name} ${node_name}

Check volume ${volume_id} replica on node ${node_id} exist
    [Documentation]    Assert the volume has a replica scheduled on the node.
    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
    ${node_name} =    get_node_by_index    ${node_id}
    ${replica_name} =    get_replica_name_on_node    ${volume_name}    ${node_name}
    Should Not Be Equal    ${replica_name}    ${None}

Check volume ${volume_id} data is intact
${volume_name} = generate_name_with_suffix volume ${volume_id}
check_data_checksum ${volume_name}
Expand Down
27 changes: 25 additions & 2 deletions e2e/libs/k8s/k8s.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time
import subprocess
import asyncio
import os
from kubernetes import client
from workload.pod import create_pod
from workload.pod import delete_pod
Expand All @@ -9,6 +10,8 @@
from utility.utility import subprocess_exec_cmd
from utility.utility import logging
from utility.utility import get_retry_count_and_interval
from utility.utility import subprocess_exec_cmd_with_timeout
from robot.libraries.BuiltIn import BuiltIn

async def restart_kubelet(node_name, downtime_in_sec=10):
manifest = new_pod_manifest(
Expand All @@ -32,9 +35,9 @@ def drain_node(node_name):
exec_cmd = ["kubectl", "drain", node_name, "--ignore-daemonsets", "--delete-emptydir-data"]
res = subprocess_exec_cmd(exec_cmd)

def force_drain_node(node_name, timeout):
    """Forcibly drain *node_name* via kubectl, bounded by *timeout* seconds.

    --force evicts pods not managed by a controller; daemonset pods are
    ignored and emptyDir data is deleted, matching the non-forced drain.
    Raises subprocess.TimeoutExpired if the drain hangs past *timeout*,
    which the "expect failure" test keywords rely on.
    """
    # The scraped diff left both the old and new versions of this function in
    # place (duplicate def, stale subprocess_exec_cmd call); this is the
    # post-change version only.  The unused `res` binding is dropped.
    exec_cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets", "--delete-emptydir-data"]
    subprocess_exec_cmd_with_timeout(exec_cmd, timeout)

def cordon_node(node_name):
exec_cmd = ["kubectl", "cordon", node_name]
Expand Down Expand Up @@ -71,3 +74,23 @@ def wait_all_pods_evicted(node_name):
time.sleep(retry_interval)

assert evicted, 'failed to evict pods'

def check_node_cordoned(node_name):
    """Assert the Kubernetes node *node_name* is marked unschedulable (cordoned)."""
    core_api = client.CoreV1Api()
    node_obj = core_api.read_node(node_name)
    assert node_obj.spec.unschedulable is True, f"node {node_name} is not cordoned."

def get_instance_manager_on_node(node_name):
    """Return the name of the instance-manager pod on *node_name*, or None.

    Matches pods by the longhorn.io/component label, restricted to the
    data engine selected via the Robot variable ${DATA_ENGINE}.
    """
    data_engine = BuiltIn().get_variable_value("${DATA_ENGINE}")
    for candidate in get_all_pods_on_node(node_name):
        pod_labels = candidate.metadata.labels
        is_im = pod_labels.get("longhorn.io/component") == "instance-manager"
        if is_im and pod_labels.get("longhorn.io/data-engine") == data_engine:
            return candidate.metadata.name
    return None

def check_instance_manager_pdb_not_exist(instance_manager):
    """Assert no PodDisruptionBudget named after *instance_manager* remains.

    NOTE(review): this is a substring match on the raw `kubectl get pdb`
    output; a PDB whose name merely contains *instance_manager* would also
    trip the assert.  Behavior kept as-is.
    """
    exec_cmd = ["kubectl", "get", "pdb", "-n", "longhorn-system"]
    pdb_listing = subprocess_exec_cmd(exec_cmd).decode('utf-8')
    assert instance_manager not in pdb_listing
3 changes: 3 additions & 0 deletions e2e/libs/keywords/host_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@ def power_off_volume_node(self, volume_name):

def power_on_node_by_name(self, node_name):
    """Power on the host *node_name* through the host provider backend."""
    self.host.power_on_node(node_name)

def power_off_node_by_name(self, node_name):
    """Power off the host *node_name* through the host provider backend."""
    self.host.power_off_node(node_name)
30 changes: 28 additions & 2 deletions e2e/libs/keywords/k8s_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
from k8s.k8s import drain_node, force_drain_node
from k8s.k8s import cordon_node, uncordon_node
from k8s.k8s import wait_all_pods_evicted
from k8s.k8s import get_all_pods_on_node
from k8s.k8s import check_node_cordoned
from k8s.k8s import get_instance_manager_on_node
from k8s.k8s import check_instance_manager_pdb_not_exist
from utility.utility import logging

from node import Node

class k8s_keywords:

Expand Down Expand Up @@ -45,10 +49,32 @@ def drain_node(self, node_name):
drain_node(node_name)

def force_drain_node(self, node_name):
    """Force-drain *node_name*, bounded by the Robot variable ${DRAIN_TIMEOUT}.

    Falls back to 90 seconds when the suite does not define the variable.
    """
    raw_timeout = BuiltIn().get_variable_value("${DRAIN_TIMEOUT}", default="90")
    force_drain_node(node_name, int(raw_timeout))

def uncordon_node(self, node_name):
    """Re-enable scheduling on the Kubernetes node *node_name*."""
    uncordon_node(node_name)

def cordon_node(self, node_name):
    """Mark the Kubernetes node *node_name* unschedulable."""
    cordon_node(node_name)

def wait_for_all_pods_evicted(self, node_name):
    """Block until every evictable pod has left *node_name*."""
    wait_all_pods_evicted(node_name)

def uncordon_all_nodes(self):
    """Uncordon every worker node so scheduling is restored after drain tests."""
    for worker_name in Node.list_node_names_by_role("worker"):
        uncordon_node(worker_name)

def get_all_pods_on_node(self, node_name):
    """Return the pods currently scheduled on *node_name*."""
    return get_all_pods_on_node(node_name)

def check_node_cordoned(self, node_name):
    """Assert that *node_name* is cordoned."""
    check_node_cordoned(node_name)

def get_instance_manager_on_node(self, node_name):
    """Return the instance-manager pod name on *node_name*, or None."""
    return get_instance_manager_on_node(node_name)

def check_instance_manager_pdb_not_exist(self, instance_manager):
    """Assert no PDB remains for *instance_manager* (underlying call asserts)."""
    return check_instance_manager_pdb_not_exist(instance_manager)
4 changes: 4 additions & 0 deletions e2e/libs/keywords/node_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,9 @@ def enable_node_scheduling(self, node_name):

def reset_node_schedule(self):
    """Re-enable Longhorn scheduling on every worker node (test cleanup)."""
    for worker_name in self.node.list_node_names_by_role("worker"):
        self.enable_node_scheduling(worker_name)

def check_node_is_not_schedulable(self, node_name):
    """Assert the Longhorn node reports its Schedulable condition as "False"."""
    self.node.check_node_schedulable(node_name, schedulable="False")
10 changes: 10 additions & 0 deletions e2e/libs/node/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from utility.utility import get_retry_count_and_interval
from utility.utility import logging

from k8s.k8s import uncordon_node

class Node:

DEFAULT_DISK_PATH = "/var/lib/longhorn/"
Expand Down Expand Up @@ -159,3 +161,11 @@ def set_default_disk_scheduling(self, node_name, allowScheduling):
if disk.path == self.DEFAULT_DISK_PATH:
disk.allowScheduling = allowScheduling
self.update_disks(node_name, node.disks)

def check_node_schedulable(self, node_name, schedulable):
    """Wait until the Longhorn node's Schedulable condition equals *schedulable*.

    Polls up to self.retry_count times, sleeping self.retry_interval between
    attempts, then asserts the final status.  *schedulable* is the expected
    status string, e.g. "False" for a cordoned node.

    Bug fix: the original fetched the node object ONCE before the loop, so
    the retry loop could never observe a status change — it just slept and
    re-checked the same stale object.  The node is now re-read each iteration.
    """
    for _ in range(self.retry_count):
        node = get_longhorn_client().by_id_node(node_name)
        if node["conditions"]["Schedulable"]["status"] == schedulable:
            break
        time.sleep(self.retry_interval)
    assert node["conditions"]["Schedulable"]["status"] == schedulable, \
        f"node {node_name} Schedulable status is not {schedulable}"
4 changes: 4 additions & 0 deletions e2e/libs/utility/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ def subprocess_exec_cmd(cmd):
logging(f"Executed command {cmd} with result {res}")
return res

def subprocess_exec_cmd_with_timeout(cmd, timeout):
    """Run *cmd* (argv list) and return its stdout as bytes.

    Raises subprocess.TimeoutExpired if the command runs past *timeout*
    seconds, and subprocess.CalledProcessError on a non-zero exit.
    """
    res = subprocess.check_output(cmd, timeout=timeout)
    logging(f"Executed command {cmd} with timeout {timeout}s, result {res}")
    return res

def wait_for_cluster_ready():
core_api = client.CoreV1Api()
Expand Down
Loading