test(robot): add drain cases from manual test cases
longhorn/longhorn-9292

Signed-off-by: Chris <chris.chien@suse.com>
chriscchien committed Sep 25, 2024
1 parent 22d28e6 commit e0353ac
Showing 13 changed files with 354 additions and 2 deletions.
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
@@ -19,6 +19,7 @@ Library ../libs/keywords/setting_keywords.py
Library ../libs/keywords/backupstore_keywords.py
Library ../libs/keywords/backup_keywords.py
Library ../libs/keywords/sharemanager_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Keywords ***
Set test environment
@@ -35,6 +36,7 @@ Set test environment
END

Cleanup test resources
uncordon_all_nodes
cleanup_control_plane_network_latency
reset_node_schedule
cleanup_node_exec
5 changes: 5 additions & 0 deletions e2e/keywords/host.resource
@@ -37,3 +37,8 @@ Restart cluster
Power on off node
Run keyword And Continue On Failure
... power_on_node_by_name ${powered_off_node}

Power off node ${node_id}
${powered_off_node} = get_node_by_index ${node_id}
power_off_node_by_name ${powered_off_node}
Set Test Variable ${powered_off_node}
59 changes: 59 additions & 0 deletions e2e/keywords/k8s.resource
@@ -5,6 +5,7 @@ Library ../libs/keywords/k8s_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/host_keywords.py
Library ../libs/keywords/node_keywords.py

*** Variables ***

@@ -51,5 +52,63 @@ Force drain volume of ${workload_kind} ${workload_id} replica node
Set Test Variable ${drained_node}
Set Test Variable ${last_volume_node}

Force drain node ${node_id}
${node_name} = get_node_by_index ${node_id}
force_drain_node ${node_name}

Drain volume of ${workload_kind} ${workload_id} volume node
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${drained_node} = get_volume_node ${volume_name}
${last_volume_node} = get_volume_node ${volume_name}
drain_node ${drained_node}
wait_for_all_pods_evicted ${drained_node}
Set Test Variable ${drained_node}
Set Test Variable ${last_volume_node}

Uncordon the drained node
uncordon_node ${drained_node}

Cordon ${workload_kind} ${workload_id} volume node
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${volume_node} = get_volume_node ${volume_name}
cordon_node ${volume_node}
check_node_is_not_schedulable ${volume_node}

Force drain all nodes
FOR ${node_id} IN RANGE 0 3
${node_name} = get_node_by_index ${node_id}
force_drain_node ${node_name}
wait_for_all_pods_evicted ${node_name}
END

Check node ${node_id} cordoned
${node_name} = get_node_by_index ${node_id}
check_node_cordoned ${node_name}

Force drain node ${node_id} and wait for ${duration} second
[Arguments] ${dataEngine}
${drained_node} = get_node_by_index ${node_id}
${instance_manager_name} = get_instance_manager_on_node ${drained_node} ${dataEngine}
${drain_process} = force_drain_node_and_wait ${drained_node} ${duration}
Set Test Variable ${drain_process}
Set Test Variable ${instance_manager_name}
Set Test Variable ${drained_node}
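    # ${drain_process}, ${instance_manager_name} and ${drained_node} are consumed by the drain/PDB keywords below.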

The drain process not completed
check_drain_process_not_completed ${drain_process}

The drain process completed
wait_for_all_pods_evicted ${drained_node}
check_drain_process_completed ${drain_process}

Drain logs should contain
[Arguments] ${log}
${drain_logs} = get_drain_process_error_log ${drain_process}
Should Contain ${drain_logs} ${log}

Check PDB not exist
[Arguments] ${instance_manager}
check_instance_manager_pdb_not_exist ${instance_manager}

12 changes: 12 additions & 0 deletions e2e/keywords/longhorn.resource
@@ -3,6 +3,7 @@ Documentation Longhorn Keywords
Library ../libs/keywords/instancemanager_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/k8s_keywords.py

*** Variables ***
@{longhorn_workloads}
@@ -43,3 +44,14 @@ Check Longhorn workload pods ${condition} annotated with ${key}
Run Keyword IF '${condition}' == 'not' Should Not Be True ${is_annotated}
... ELSE IF '${condition}' == 'is' Should Be True ${is_annotated}
... ELSE Fail Invalid condition ${condition}

Check instance-manager pod is not running on drained node
[Arguments] ${dataEngine}
${pod} = get_instance_manager_on_node ${drained_node} ${dataEngine}
Should Be Equal ${pod} ${None}

Check instance-manager pod is running on node ${node_id}
[Arguments] ${dataEngine}
${node_name} = get_node_by_index ${node_id}
${pod} = get_instance_manager_on_node ${node_name} ${dataEngine}
Should Not Be Equal ${pod} ${None}
6 changes: 6 additions & 0 deletions e2e/keywords/volume.resource
@@ -206,6 +206,12 @@ Wait volume ${volume_id} replica on node ${node_id} stopped
${node_name} = get_node_by_index ${node_id}
wait_for_replica_stopped ${volume_name} ${node_name}

Check volume ${volume_id} replica on node ${node_id} exist
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
${replica_name} = get_replica_name_on_node ${volume_name} ${node_name}
Should Not Be Equal ${replica_name} ${None}

Check volume ${volume_id} data is intact
${volume_name} = generate_name_with_suffix volume ${volume_id}
check_data_checksum ${volume_name}
45 changes: 45 additions & 0 deletions e2e/libs/k8s/k8s.py
@@ -1,6 +1,7 @@
import time
import subprocess
import asyncio
import os
from kubernetes import client
from workload.pod import create_pod
from workload.pod import delete_pod
@@ -9,6 +10,9 @@
from utility.utility import subprocess_exec_cmd
from utility.utility import logging
from utility.utility import get_retry_count_and_interval
from utility.utility import check_popen_process_not_completed
from utility.utility import check_popen_process_completed
from utility.utility import get_popen_process_error_log

async def restart_kubelet(node_name, downtime_in_sec=10):
manifest = new_pod_manifest(
@@ -71,3 +75,44 @@ def wait_all_pods_evicted(node_name):
time.sleep(retry_interval)

assert evicted, 'failed to evict pods'

def check_node_cordoned(node_name):
api = client.CoreV1Api()
node = api.read_node(node_name)
assert node.spec.unschedulable is True, f"node {node_name} is not cordoned."

def force_drain_node_and_wait(node_name, duration):
_, retry_interval = get_retry_count_and_interval()
exec_cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets", "--delete-emptydir-data"]
drain_process = subprocess.Popen(exec_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# The drain is expected to still be in progress here; fail fast if it finishes early.
for i in range(int(duration)):
    logging(f"Waiting on {exec_cmd}... count={i}")
    if drain_process.poll() is not None:
        raise AssertionError(f"Draining node {node_name} completed, but it was expected to still be in progress!")
    time.sleep(retry_interval)

return drain_process

def check_drain_process_not_completed(drain_process):
check_popen_process_not_completed(drain_process)

def check_drain_process_completed(drain_process):
check_popen_process_completed(drain_process)

def get_drain_process_error_log(drain_process):
return get_popen_process_error_log(drain_process)

def get_instance_manager_on_node(node_name, data_engine):
pods = get_all_pods_on_node(node_name)
for pod in pods:
labels = pod.metadata.labels
if labels.get("longhorn.io/data-engine") == data_engine and \
labels.get("longhorn.io/component") == "instance-manager":
return pod.metadata.name
return None

def check_instance_manager_pdb_not_exist(instance_manager):
exec_cmd = ["kubectl", "get", "pdb", "-n", "longhorn-system"]
res = subprocess_exec_cmd(exec_cmd)
assert instance_manager not in res.decode('utf-8')
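
For orientation, a minimal sketch (not part of the commit) of how these helpers compose into the drain-interruption flow the new keywords drive. The node name and the polling count are illustrative assumptions, and the e2e libs directory is assumed to be on PYTHONPATH:

# Sketch: start a force drain that is expected to stall (e.g. blocked by an
# instance-manager PodDisruptionBudget), verify it has not finished, then
# read its stderr to see why it is stuck.
from k8s.k8s import force_drain_node_and_wait
from k8s.k8s import check_drain_process_not_completed
from k8s.k8s import get_drain_process_error_log

# "worker-node-1" is a hypothetical node name; 30 is the number of polling intervals.
drain_process = force_drain_node_and_wait("worker-node-1", 30)

# The kubectl drain process should still be running...
check_drain_process_not_completed(drain_process)

# ...and its stderr should explain why; this terminates the process as a side effect.
error_log = get_drain_process_error_log(drain_process)
print(error_log)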
3 changes: 3 additions & 0 deletions e2e/libs/keywords/host_keywords.py
@@ -52,3 +52,6 @@ def power_off_volume_node(self, volume_name):

def power_on_node_by_name(self, node_name):
self.host.power_on_node(node_name)

def power_off_node_by_name(self, node_name):
self.host.power_off_node(node_name)
43 changes: 42 additions & 1 deletion e2e/libs/keywords/k8s_keywords.py
@@ -5,8 +5,16 @@
from k8s.k8s import drain_node, force_drain_node
from k8s.k8s import cordon_node, uncordon_node
from k8s.k8s import wait_all_pods_evicted
from k8s.k8s import get_all_pods_on_node
from k8s.k8s import check_node_cordoned
from k8s.k8s import force_drain_node_and_wait
from k8s.k8s import check_drain_process_not_completed
from k8s.k8s import get_instance_manager_on_node
from k8s.k8s import get_drain_process_error_log
from k8s.k8s import check_instance_manager_pdb_not_exist
from k8s.k8s import check_drain_process_completed
from utility.utility import logging

from node import Node

class k8s_keywords:

@@ -50,5 +58,38 @@ def force_drain_node(self, node_name):
def uncordon_node(self, node_name):
uncordon_node(node_name)

def cordon_node(self, node_name):
cordon_node(node_name)

def wait_for_all_pods_evicted(self, node_name):
wait_all_pods_evicted(node_name)

def uncordon_all_nodes(self):
nodes = Node.list_node_names_by_role("worker")

for node_name in nodes:
uncordon_node(node_name)

def get_all_pods_on_node(self, node_name):
return get_all_pods_on_node(node_name)

def check_node_cordoned(self, node_name):
check_node_cordoned(node_name)

def force_drain_node_and_wait(self, node_name, duration):
return force_drain_node_and_wait(node_name, duration)

def check_drain_process_not_completed(self, drain_process):
return check_drain_process_not_completed(drain_process)

def check_drain_process_completed(self, drain_process):
return check_drain_process_completed(drain_process)

def get_instance_manager_on_node(self, node_name, data_engine):
return get_instance_manager_on_node(node_name, data_engine)

def get_drain_process_error_log(self, drain_process):
return get_drain_process_error_log(drain_process)

def check_instance_manager_pdb_not_exist(self, instance_manager):
return check_instance_manager_pdb_not_exist(instance_manager)
4 changes: 4 additions & 0 deletions e2e/libs/keywords/node_keywords.py
@@ -40,5 +40,9 @@ def enable_node_scheduling(self, node_name):

def reset_node_schedule(self):
nodes = self.node.list_node_names_by_role("worker")

for node_name in nodes:
self.enable_node_scheduling(node_name)

def check_node_is_not_schedulable(self, node_name):
self.node.check_node_schedulable(node_name, schedulable="False")
10 changes: 10 additions & 0 deletions e2e/libs/node/node.py
@@ -10,6 +10,8 @@
from utility.utility import get_retry_count_and_interval
from utility.utility import logging

from k8s.k8s import uncordon_node

class Node:

DEFAULT_DISK_PATH = "/var/lib/longhorn/"
@@ -159,3 +161,11 @@ def set_default_disk_scheduling(self, node_name, allowScheduling):
if disk.path == self.DEFAULT_DISK_PATH:
disk.allowScheduling = allowScheduling
self.update_disks(node_name, node.disks)

def check_node_schedulable(self, node_name, schedulable):
    # Re-read the node on each retry; checking a single stale read would defeat the retry loop.
    for _ in range(self.retry_count):
        node = get_longhorn_client().by_id_node(node_name)
        if node["conditions"]["Schedulable"]["status"] == schedulable:
            break
        time.sleep(self.retry_interval)
    assert node["conditions"]["Schedulable"]["status"] == schedulable
22 changes: 22 additions & 0 deletions e2e/libs/utility/utility.py
@@ -285,3 +285,25 @@ def get_name_suffix(*args):
if arg:
suffix += f"-{arg}"
return suffix


def check_popen_process_not_completed(process):
    assert process.poll() is None, f"process {process} terminated unexpectedly."


def check_popen_process_completed(process):
    retry_count, retry_interval = get_retry_count_and_interval()
    for _ in range(retry_count):
        if process.poll() is not None:
            return
        time.sleep(retry_interval)
    assert process.poll() is not None, f"process {process} did not terminate within the retry window."


def get_popen_process_error_log(drain_process):
    # Terminate the process if it is still running so communicate() returns promptly.
    if drain_process.poll() is None:
        drain_process.terminate()

    stdout, stderr = drain_process.communicate()
    logging(stderr.decode('utf-8'))
    return stderr.decode('utf-8')
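
These Popen helpers are generic; a self-contained sketch (not part of the commit) exercises them with a plain sleep process standing in for kubectl drain, again assuming the e2e libs directory is on PYTHONPATH:

import subprocess
from utility.utility import check_popen_process_not_completed
from utility.utility import get_popen_process_error_log

proc = subprocess.Popen(["sleep", "60"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
check_popen_process_not_completed(proc)          # passes: sleep is still running
stderr_text = get_popen_process_error_log(proc)  # terminates sleep, returns its (empty) stderr
assert stderr_text == ""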
2 changes: 1 addition & 1 deletion e2e/requirements.txt
@@ -6,4 +6,4 @@ kubernetes==27.2.0
requests==2.32.3
boto3==1.35.19
pyyaml==6.0.2
minio==5.0.10
minio==5.0.10