test(robot): add drain cases from manual test cases
longhorn/longhorn-9292

Signed-off-by: Chris <chris.chien@suse.com>
chriscchien committed Sep 26, 2024
1 parent 22d28e6 commit 1aff627
Showing 12 changed files with 294 additions and 5 deletions.
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
@@ -19,6 +19,7 @@ Library ../libs/keywords/setting_keywords.py
Library ../libs/keywords/backupstore_keywords.py
Library ../libs/keywords/backup_keywords.py
Library ../libs/keywords/sharemanager_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Keywords ***
Set test environment
@@ -35,6 +36,7 @@ Set test environment
END

Cleanup test resources
uncordon_all_nodes
cleanup_control_plane_network_latency
reset_node_schedule
cleanup_node_exec
5 changes: 5 additions & 0 deletions e2e/keywords/host.resource
@@ -37,3 +37,8 @@ Restart cluster
Power on off node
Run keyword And Continue On Failure
... power_on_node_by_name ${powered_off_node}

Power off node ${node_id}
${powered_off_node} = get_node_by_index ${node_id}
power_off_node_by_name ${powered_off_node}
Set Test Variable ${powered_off_node}
61 changes: 60 additions & 1 deletion e2e/keywords/k8s.resource
@@ -5,9 +5,10 @@ Library ../libs/keywords/k8s_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/host_keywords.py
Library ../libs/keywords/node_keywords.py

*** Variables ***

${DRAIN_TIMEOUT} 90

*** Keywords ***
Stop volume node kubelet of ${workload_kind} ${workload_id} for ${duration} seconds
@@ -51,5 +52,63 @@ Force drain volume of ${workload_kind} ${workload_id} replica node
Set Test Variable ${drained_node}
Set Test Variable ${last_volume_node}

Force drain node ${node_id}
${node_name} = get_node_by_index ${node_id}
force_drain_node ${node_name}

Drain volume of ${workload_kind} ${workload_id} volume node
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${drained_node} = get_volume_node ${volume_name}
${last_volume_node} = get_volume_node ${volume_name}
drain_node ${drained_node}
wait_for_all_pods_evicted ${drained_node}
Set Test Variable ${drained_node}
Set Test Variable ${last_volume_node}

Uncordon the drained node
uncordon_node ${drained_node}

Cordon ${workload_kind} ${workload_id} volume node
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${volume_node} = get_volume_node ${volume_name}
cordon_node ${volume_node}
check_node_is_not_schedulable ${volume_node}

Force drain all nodes
FOR ${node_id} IN RANGE 0 3
${node_name} = get_node_by_index ${node_id}
force_drain_node ${node_name}
wait_for_all_pods_evicted ${node_name}
END

Check node ${node_id} cordoned
${node_name} = get_node_by_index ${node_id}
check_node_cordoned ${node_name}

Force drain node ${node_id} and expect failure
${drained_node} = get_node_by_index ${node_id}
${instance_manager_name} = get_instance_manager_on_node ${drained_node}
Run Keyword And Expect Error * force_drain_node ${drained_node}
Set Test Variable ${instance_manager_name}
Set Test Variable ${drained_node}

Force drain node ${node_id} and expect success
${drained_node} = get_node_by_index ${node_id}
${instance_manager_name} = get_instance_manager_on_node ${drained_node}
force_drain_node ${drained_node}
Set Test Variable ${instance_manager_name}
Set Test Variable ${drained_node}

The drain process not completed
check_drain_process_not_completed ${drain_process}

The drain process completed
wait_for_all_pods_evicted ${drained_node}
check_drain_process_completed ${drain_process}

Check PDB not exist
[Arguments] ${instance_manager}
check_instance_manager_pdb_not_exist ${instance_manager}

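A hypothetical alternative, not in this commit: kubectl drain also has its own --timeout flag, so the bounded drain these keywords rely on could be expressed on the kubectl side instead of via the subprocess-level timeout added below in e2e/libs/k8s/k8s.py. The helper name and node name here are placeholders; on timeout kubectl exits non-zero, so check_output raises CalledProcessError rather than TimeoutExpired.

import subprocess

def drain_with_kubectl_timeout(node_name, timeout_seconds=90):
    # Let kubectl enforce the deadline itself; a non-zero exit on timeout
    # surfaces as subprocess.CalledProcessError to the caller.
    cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets",
           "--delete-emptydir-data", f"--timeout={timeout_seconds}s"]
    return subprocess.check_output(cmd)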
10 changes: 10 additions & 0 deletions e2e/keywords/longhorn.resource
@@ -3,6 +3,7 @@ Documentation Longhorn Keywords
Library ../libs/keywords/instancemanager_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Variables ***
@{longhorn_workloads}
@@ -43,3 +44,12 @@ Check Longhorn workload pods ${condition} annotated with ${key}
Run Keyword IF '${condition}' == 'not' Should Not Be True ${is_annotated}
... ELSE IF '${condition}' == 'is' Should Be True ${is_annotated}
... ELSE Fail Invalid condition ${condition}

Check instance-manager pod is not running on drained node
${pod} = get_instance_manager_on_node ${drained_node}
Should Be Equal ${pod} ${None}

Check instance-manager pod is running on node ${node_id}
${node_name} = get_node_by_index ${node_id}
${pod} = get_instance_manager_on_node ${node_name}
Should Not Be Equal ${pod} ${None}
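These keywords depend on get_instance_manager_on_node (added in e2e/libs/k8s/k8s.py further down), which lists every pod on the node and filters client-side. A hedged alternative sketch: the same lookup can be pushed server-side with label and field selectors. The function name, the "v1" default for the data engine, and the kubeconfig loading are assumptions, not part of the commit.

from kubernetes import client, config

def find_instance_manager(node_name, data_engine="v1", namespace="longhorn-system"):
    config.load_kube_config()  # assumes a reachable kubeconfig
    core_api = client.CoreV1Api()
    # Filter by Longhorn's instance-manager labels and by the node the pod runs on.
    pods = core_api.list_namespaced_pod(
        namespace,
        label_selector=(
            "longhorn.io/component=instance-manager,"
            f"longhorn.io/data-engine={data_engine}"
        ),
        field_selector=f"spec.nodeName={node_name}",
    )
    return pods.items[0].metadata.name if pods.items else None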
6 changes: 6 additions & 0 deletions e2e/keywords/volume.resource
@@ -206,6 +206,12 @@ Wait volume ${volume_id} replica on node ${node_id} stopped
${node_name} = get_node_by_index ${node_id}
wait_for_replica_stopped ${volume_name} ${node_name}

Check volume ${volume_id} replica on node ${node_id} exist
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
${replica_name} = get_replica_name_on_node ${volume_name} ${node_name}
Should Not Be Equal ${replica_name} ${None}

Check volume ${volume_id} data is intact
${volume_name} = generate_name_with_suffix volume ${volume_id}
check_data_checksum ${volume_name}
27 changes: 25 additions & 2 deletions e2e/libs/k8s/k8s.py
@@ -1,6 +1,7 @@
import time
import subprocess
import asyncio
import os
from kubernetes import client
from workload.pod import create_pod
from workload.pod import delete_pod
@@ -9,6 +10,8 @@
from utility.utility import subprocess_exec_cmd
from utility.utility import logging
from utility.utility import get_retry_count_and_interval
from utility.utility import subprocess_exec_cmd_with_timeout
from robot.libraries.BuiltIn import BuiltIn

async def restart_kubelet(node_name, downtime_in_sec=10):
manifest = new_pod_manifest(
@@ -32,9 +35,9 @@ def drain_node(node_name):
exec_cmd = ["kubectl", "drain", node_name, "--ignore-daemonsets", "--delete-emptydir-data"]
res = subprocess_exec_cmd(exec_cmd)

def force_drain_node(node_name):
def force_drain_node(node_name, timeout):
exec_cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets", "--delete-emptydir-data"]
res = subprocess_exec_cmd(exec_cmd)
res = subprocess_exec_cmd_with_timeout(exec_cmd, timeout)

def cordon_node(node_name):
exec_cmd = ["kubectl", "cordon", node_name]
@@ -71,3 +74,23 @@ def wait_all_pods_evicted(node_name):
time.sleep(retry_interval)

assert evicted, 'failed to evict pods'

def check_node_cordoned(node_name):
api = client.CoreV1Api()
node = api.read_node(node_name)
assert node.spec.unschedulable is True, f"node {node_name} is not cordoned."

def get_instance_manager_on_node(node_name):
data_engine = BuiltIn().get_variable_value("${DATA_ENGINE}")
pods = get_all_pods_on_node(node_name)
for pod in pods:
labels = pod.metadata.labels
if labels.get("longhorn.io/data-engine") == data_engine and \
labels.get("longhorn.io/component") == "instance-manager":
return pod.metadata.name
return None

def check_instance_manager_pdb_not_exist(instance_manager):
exec_cmd = ["kubectl", "get", "pdb", "-n", "longhorn-system"]
res = subprocess_exec_cmd(exec_cmd)
assert instance_manager not in res.decode('utf-8')
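A hypothetical alternative, not part of this commit: check_instance_manager_pdb_not_exist asserts that the instance-manager name is absent from the raw text of `kubectl get pdb`, which is a substring match. The same check could list PodDisruptionBudgets through the Kubernetes policy API and compare names exactly. The function name and kubeconfig loading are assumptions.

from kubernetes import client, config

def instance_manager_pdb_exists(instance_manager, namespace="longhorn-system"):
    config.load_kube_config()  # assumes a reachable kubeconfig
    policy_api = client.PolicyV1Api()
    # Exact name comparison instead of matching against kubectl's table output.
    pdbs = policy_api.list_namespaced_pod_disruption_budget(namespace)
    return any(pdb.metadata.name == instance_manager for pdb in pdbs.items)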
3 changes: 3 additions & 0 deletions e2e/libs/keywords/host_keywords.py
@@ -52,3 +52,6 @@ def power_off_volume_node(self, volume_name):

def power_on_node_by_name(self, node_name):
self.host.power_on_node(node_name)

def power_off_node_by_name(self, node_name):
self.host.power_off_node(node_name)
30 changes: 28 additions & 2 deletions e2e/libs/keywords/k8s_keywords.py
@@ -5,8 +5,12 @@
from k8s.k8s import drain_node, force_drain_node
from k8s.k8s import cordon_node, uncordon_node
from k8s.k8s import wait_all_pods_evicted
from k8s.k8s import get_all_pods_on_node
from k8s.k8s import check_node_cordoned
from k8s.k8s import get_instance_manager_on_node
from k8s.k8s import check_instance_manager_pdb_not_exist
from utility.utility import logging

from node import Node

class k8s_keywords:

@@ -45,10 +49,32 @@ def drain_node(self, node_name):
drain_node(node_name)

def force_drain_node(self, node_name):
force_drain_node(node_name)
timeout = int(BuiltIn().get_variable_value("${DRAIN_TIMEOUT}", default="90"))
force_drain_node(node_name, timeout)

def uncordon_node(self, node_name):
uncordon_node(node_name)

def cordon_node(self, node_name):
cordon_node(node_name)

def wait_for_all_pods_evicted(self, node_name):
wait_all_pods_evicted(node_name)

def uncordon_all_nodes(self):
nodes = Node.list_node_names_by_role("worker")

for node_name in nodes:
uncordon_node(node_name)

def get_all_pods_on_node(self, node_name):
return get_all_pods_on_node(node_name)

def check_node_cordoned(self, node_name):
check_node_cordoned(node_name)

def get_instance_manager_on_node(self, node_name):
return get_instance_manager_on_node(node_name)

def check_instance_manager_pdb_not_exist(self, instance_manager):
return check_instance_manager_pdb_not_exist(instance_manager)
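A minimal sketch of the timeout fallback used in force_drain_node above, assuming it runs inside an active Robot Framework execution (get_variable_value only works while a suite is running) and that robot.libraries.BuiltIn is imported at the top of k8s_keywords.py, which is outside the visible hunk. The helper name is hypothetical.

from robot.libraries.BuiltIn import BuiltIn

def resolve_drain_timeout(default_seconds=90):
    # ${DRAIN_TIMEOUT} is defined in e2e/keywords/k8s.resource; the string default
    # keeps the int() conversion valid when the variable is not set.
    return int(BuiltIn().get_variable_value("${DRAIN_TIMEOUT}", default=str(default_seconds)))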
4 changes: 4 additions & 0 deletions e2e/libs/keywords/node_keywords.py
@@ -40,5 +40,9 @@ def enable_node_scheduling(self, node_name):

def reset_node_schedule(self):
nodes = self.node.list_node_names_by_role("worker")

for node_name in nodes:
self.enable_node_scheduling(node_name)

def check_node_is_not_schedulable(self, node_name):
self.node.check_node_schedulable(node_name, schedulable="False")
10 changes: 10 additions & 0 deletions e2e/libs/node/node.py
@@ -10,6 +10,8 @@
from utility.utility import get_retry_count_and_interval
from utility.utility import logging

from k8s.k8s import uncordon_node

class Node:

DEFAULT_DISK_PATH = "/var/lib/longhorn/"
@@ -159,3 +161,11 @@ def set_default_disk_scheduling(self, node_name, allowScheduling):
if disk.path == self.DEFAULT_DISK_PATH:
disk.allowScheduling = allowScheduling
self.update_disks(node_name, node.disks)

def check_node_schedulable(self, node_name, schedulable):
    # Re-read the node on every retry; a single read before the loop would never
    # observe a status change.
    for _ in range(self.retry_count):
        node = get_longhorn_client().by_id_node(node_name)
        if node["conditions"]["Schedulable"]["status"] == schedulable:
            break
        time.sleep(self.retry_interval)
    assert node["conditions"]["Schedulable"]["status"] == schedulable, f"node {node_name} Schedulable status is not {schedulable}"
4 changes: 4 additions & 0 deletions e2e/libs/utility/utility.py
@@ -86,6 +86,10 @@ def subprocess_exec_cmd(cmd):
logging(f"Executed command {cmd} with result {res}")
return res

def subprocess_exec_cmd_with_timeout(cmd, timeout):
res = subprocess.check_output(cmd, timeout=timeout)
logging(f"Executed command {cmd} with timeout {timeout}s, result {res}")
return res

def wait_for_cluster_ready():
core_api = client.CoreV1Api()
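A hedged usage sketch, not part of the commit: subprocess.check_output with a timeout raises subprocess.TimeoutExpired when the command does not finish in time, which is how a blocked drain surfaces as a keyword error in the "Force drain node ... and expect failure" cases. The node name below is a placeholder.

import subprocess

try:
    subprocess.check_output(
        ["kubectl", "drain", "worker-node-1", "--force",
         "--ignore-daemonsets", "--delete-emptydir-data"],
        timeout=90,
    )
except subprocess.TimeoutExpired as err:
    # err carries the original command and the timeout that was exceeded.
    print(f"drain did not complete within {err.timeout}s: {err.cmd}")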