From 6a084af571fac1871a1229fcaa118e15033ef7a0 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 19 Feb 2024 18:43:14 +0800 Subject: [PATCH 1/3] Add test case test_drain_with_block_for_eviction_success ref: 7521 Signed-off-by: Chris --- manager/integration/tests/common.py | 1 + manager/integration/tests/test_node.py | 151 ++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 28503ae13e..c9c36e0aa0 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -216,6 +216,7 @@ "allow-empty-node-selector-volume" SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY = "replica-disk-soft-anti-affinity" SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME = "allow-empty-disk-selector-volume" +SETTING_NODE_DRAIN_POLICY = "node-drain-policy" DEFAULT_BACKUP_COMPRESSION_METHOD = "lz4" BACKUP_COMPRESSION_METHOD_LZ4 = "lz4" diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 1e4ad5dd32..f54b0e5335 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -3,6 +3,7 @@ import os import subprocess import time +import yaml from random import choice from string import ascii_lowercase, digits @@ -47,8 +48,14 @@ from common import set_node_scheduling_eviction from common import update_node_disks from common import update_setting +from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 +from common import make_deployment_with_pvc # NOQA +from common import create_pv_for_volume +from common import create_pvc_for_volume, create_and_wait_deployment +from common import get_apps_api_client, write_pod_volume_random_data from backupstore import set_random_backupstore # NOQA +from concurrent.futures import ThreadPoolExecutor, TimeoutError CREATE_DEFAULT_DISK_LABEL = "node.longhorn.io/create-default-disk" @@ -2680,8 +2687,31 @@ def finalizer(): request.addfinalizer(finalizer) -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_success(): + +def drain_node(core_api, node): # NOQA + set_node_cordon(core_api, node.id, True) + + command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + subprocess.run(command, check=True) + + +def get_replica_detail(replica_name): + """ + Get allreplica information by this function + """ + command = ["kubectl", "get", + "replicas.longhorn.io", + "-n", + "longhorn-system", + replica_name, + "-o", + "yaml"] + output = subprocess.check_output(command, text=True) + replica_info = yaml.safe_load(output) + return replica_info + + +def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2693,7 +2723,6 @@ def test_drain_with_block_for_eviction_success(): 4. Write data to the volume. 5. Drain a node one of the volume's replicas is scheduled to. 6. While the drain is ongoing: - - Verify that the volume never becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true`. 7. Verify the drain completes. @@ -2703,6 +2732,122 @@ def test_drain_with_block_for_eviction_success(): 11. Verify that `replica.spec.evictionRequested == false`. 12. Verify the volume's data. 
""" + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + evict_target_node = evict_nodes[1] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume = client.create_volume(name=volume_name, + size=str(1 * Gi), + numberOfReplicas=3) + volume = common.wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": host_id} + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + pod_names = common.get_deployment_pod_names(core_api, deployment) + data_path = '/data/test' + write_pod_volume_random_data(core_api, + pod_names[0], + data_path, + DATA_SIZE_IN_MB_3) + expected_test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + volume = wait_for_volume_healthy(client, volume_name) + + # Make replica not locate on eviction target node + volume.updateReplicaCount(replicaCount=2) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + volume.replicaRemove(name=replica.name) + break + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 5 + # drain eviction source node + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_source_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is True + + nodes = client.list_node() + for node in nodes: + if node.id == evict_source_node.id: + assert node.autoEvicting is True + + # Step 7 + thread_timeout = 60 + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume = wait_for_volume_healthy(client, volume_name) + assert len(volume.replicas) == 2 + for replica in volume.replicas: + assert replica.hostId != evict_source_node.id + + # Stpe 10 + nodes = client.list_node() + for node in nodes: + assert node.autoEvicting is False + + # Step 11 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is False + + # Step 12 + test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + assert expected_test_data_checksum == test_data_checksum + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(): From b4892013e8e63401cf30733c14b9ec92c298a758 Mon Sep 
17 00:00:00 2001 From: Chris Date: Tue, 20 Feb 2024 16:48:23 +0800 Subject: [PATCH 2/3] Add test case test_drain_with_block_for_eviction_if_contains_last_replica_success ref: 7521 Signed-off-by: Chris --- manager/integration/Dockerfile | 2 +- manager/integration/tests/common.py | 39 ++++ manager/integration/tests/test_node.py | 291 ++++++++++++++++++------- 3 files changed, 257 insertions(+), 75 deletions(-) diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 8541f5edd9..09ccfc9c2f 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -1,6 +1,6 @@ FROM registry.suse.com/bci/python:3.9 -ARG KUBECTL_VERSION=v1.17.0 +ARG KUBECTL_VERSION=v1.28.4 ARG YQ_VERSION=v4.24.2 ARG TERRAFORM_VERSION=1.3.5 ARG ARCH=amd64 diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index c9c36e0aa0..38e2c78743 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -6123,3 +6123,42 @@ def wait_for_instance_manager_count(client, number, retry_counts=120): time.sleep(RETRY_INTERVAL_LONG) return len(ims) + + +def create_deployment_and_write_data(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc, # NOQA + volume_name, # NOQA + size, # NOQA + replica_count, # NOQA + data_size, # NOQA + attach_node_id=None): # NOQA + apps_api = get_apps_api_client() + volume = client.create_volume(name=volume_name, + size=size, + numberOfReplicas=replica_count) + volume = wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + if attach_node_id: + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": attach_node_id} + + create_and_wait_deployment(apps_api, deployment) + + data_path = '/data/test' + deployment_pod_names = get_deployment_pod_names(core_api, + deployment) + write_pod_volume_random_data(core_api, + deployment_pod_names[0], + data_path, + data_size) + checksum = get_pod_data_md5sum(core_api, + deployment_pod_names[0], + data_path) + + return client.by_id_volume(volume_name), deployment_pod_names[0], checksum diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index f54b0e5335..55d1b3bc28 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -50,9 +50,8 @@ from common import update_setting from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 from common import make_deployment_with_pvc # NOQA -from common import create_pv_for_volume -from common import create_pvc_for_volume, create_and_wait_deployment -from common import get_apps_api_client, write_pod_volume_random_data +from common import prepare_host_disk, wait_for_volume_degraded +from common import create_deployment_and_write_data from backupstore import set_random_backupstore # NOQA from concurrent.futures import ThreadPoolExecutor, TimeoutError @@ -2691,7 +2690,15 @@ def finalizer(): def drain_node(core_api, node): # NOQA set_node_cordon(core_api, node.id, True) - command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + command = [ + "kubectl", + "drain", + node.id, + "--ignore-daemonsets", + "--delete-emptydir-data", + "--grace-period=-1" + ] + subprocess.run(command, check=True) @@ -2711,8 +2718,84 @@ def 
get_replica_detail(replica_name): return replica_info +def check_node_auto_evict_state(client, target_node, expect_state): # NOQA + def get_specific_node(client, target_node): + nodes = client.list_node() + for node in nodes: + if node.id == target_node.id: + return node + + for i in range(RETRY_COUNTS): + node = get_specific_node(client, target_node) + if node.autoEvicting is expect_state: + break + time.sleep(RETRY_INTERVAL) + assert node.autoEvicting is expect_state + + +def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +def wait_drain_complete(future, timeout): + """ + Wait concurrent.futures object complete in a duration + """ + thread_timeout = timeout + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + +def make_replica_on_specific_node(client, volume_name, node): # NOQA + volume = client.by_id_volume(volume_name) + volume.updateReplicaCount(replicaCount=1) + for replica in volume.replicas: + if replica.hostId != node.id: + volume.replicaRemove(name=replica.name) + wait_for_volume_replica_count(client, volume_name, 1) + + +def get_all_replica_name(client, volume_name): # NOQA + volume_replicas = [] + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + volume_replicas.append(replica.name) + + return volume_replicas + + +def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + replica_info = get_replica_detail(replica.name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. 
+ --- + Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2744,33 +2827,13 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma client.update(setting, value="block-for-eviction") # Step 2, 3, 4 - volume = client.create_volume(name=volume_name, - size=str(1 * Gi), - numberOfReplicas=3) - volume = common.wait_for_volume_detached(client, volume_name) - - pvc_name = volume_name + "-pvc" - create_pv_for_volume(client, core_api, volume, volume_name) - create_pvc_for_volume(client, core_api, volume, pvc_name) - deployment_name = volume_name + "-dep" - deployment = make_deployment_with_pvc(deployment_name, pvc_name) - deployment["spec"]["template"]["spec"]["nodeSelector"] \ - = {"kubernetes.io/hostname": host_id} - - apps_api = get_apps_api_client() - create_and_wait_deployment(apps_api, deployment) - - pod_names = common.get_deployment_pod_names(core_api, deployment) - data_path = '/data/test' - write_pod_volume_random_data(core_api, - pod_names[0], - data_path, - DATA_SIZE_IN_MB_3) - expected_test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], - data_path) - - volume = wait_for_volume_healthy(client, volume_name) + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA # Make replica not locate on eviction target node volume.updateReplicaCount(replicaCount=2) @@ -2787,33 +2850,11 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma future = executor.submit(drain_node, core_api, evict_source_node) # Step 6 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_source_node.id: - replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is True - - nodes = client.list_node() - for node in nodes: - if node.id == evict_source_node.id: - assert node.autoEvicting is True + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) # Step 7 - thread_timeout = 60 - try: - future.result(timeout=thread_timeout) - drain_complete = True - except TimeoutError: - print("drain node thread exceed timeout ({})s".format(thread_timeout)) - drain_complete = False - future.cancel() - finally: - assert drain_complete is True - + wait_drain_complete(future, 60) wait_for_volume_replica_count(client, volume_name, 2) # Step 8 @@ -2826,32 +2867,29 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert replica.hostId != evict_source_node.id # Stpe 10 - nodes = client.list_node() - for node in nodes: - assert node.autoEvicting is False + check_node_auto_evict_state(client, evict_source_node, False) # Step 11 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_target_node.id: - replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is False + check_replica_evict_state(client, volume_name, evict_target_node, False) # Step 12 + data_path = data_path = '/data/test' test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], + pod, data_path) + assert checksum == test_data_checksum - assert expected_test_data_checksum == 
test_data_checksum - -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_if_contains_last_replica_success(): +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA +def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. + --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2864,7 +2902,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 4. Write data to the volumes. 5. Drain a node both volumes have a replica scheduled to. 6. While the drain is ongoing: - - Verify that the volume with one replica never becomes degraded. - Verify that the volume with three replicas becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true` on the @@ -2880,6 +2917,112 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 12. Verify that `replica.spec.evictionRequested == false` on all replicas. 13. Verify the the data in both volumes. """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Create extra disk on current node + node = client.by_id_node(host_id) + disks = node.disks + + disk_volume_name = 'vol-disk' + disk_volume = client.create_volume(name=disk_volume_name, + size=str(2 * Gi), + numberOfReplicas=1, + dataLocality="strict-local") + disk_volume = wait_for_volume_detached(client, disk_volume_name) + + disk_volume.attach(hostId=host_id) + disk_volume = wait_for_volume_healthy(client, disk_volume_name) + disk_path = prepare_host_disk(get_volume_endpoint(disk_volume), + disk_volume_name) + disk = {"path": disk_path, "allowScheduling": True} + + update_disk = get_update_disks(disks) + update_disk["disk1"] = disk + + node = update_node_disks(client, node.name, disks=update_disk, retry=True) + node = wait_for_disk_update(client, host_id, len(update_disk)) + assert len(node.disks) == len(update_disk) + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction-if-contains-last-replica") + + # Step 2, 3 + volume1_name = "vol-1" + volume2_name = "vol-2" + volume1, pod1, checksum1 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume1_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + volume2, pod2, checksum2 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume2_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + # Make volume 1 replica only located on evict_source_node + make_replica_on_specific_node(client, volume1_name, evict_source_node) + volume2_replicas = get_all_replica_name(client, volume2_name) + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume1_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + volume2 = wait_for_volume_degraded(client, volume2_name) + check_all_replicas_evict_state(client, 
volume2_name, False) + + # Step 7 + wait_drain_complete(future, 60) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume1 = client.by_id_volume(volume1_name) + assert len(volume1.replicas) == 1 + for replica in volume1.replicas: + assert replica.hostId != evict_source_node.id + + # Step 10 + # Verify volume2 replicas not moved by check replica name + # stored before the node drain + volume2 = wait_for_volume_healthy(client, volume2_name) + for replica in volume2.replicas: + assert replica.name in volume2_replicas + + # Step 11 + check_node_auto_evict_state(client, evict_source_node, False) + + # Step 12 + check_all_replicas_evict_state(client, volume1_name, False) + check_all_replicas_evict_state(client, volume2_name, False) + + # Step 13 + data_path = '/data/test' + test_data_checksum1 = get_pod_data_md5sum(core_api, + pod1, + data_path) + assert checksum1 == test_data_checksum1 + + test_data_checksum2 = get_pod_data_md5sum(core_api, + pod2, + data_path) + assert checksum2 == test_data_checksum2 + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_failure(): From 2f3ad3e97d006f44d21d30365f27dc142c4d119e Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 23 Feb 2024 17:48:15 +0800 Subject: [PATCH 3/3] Add test case test_drain_with_block_for_eviction_failure ref: 7521 Signed-off-by: Chris --- manager/integration/tests/test_node.py | 82 +++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 55d1b3bc28..d71011fa4a 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -2745,10 +2745,19 @@ def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA assert eviction_requested is expect_state -def wait_drain_complete(future, timeout): +def wait_drain_complete(future, timeout, copmpleted=True): """ Wait concurrent.futures object complete in a duration """ + def stop_drain_process(): + """ + Both future.cancel() and executer.shutdown(wait=False) can not really + stop the drain process. + Use this function to stop drain process + """ + command = ["pkill", "-f", "kubectl drain"] + subprocess.check_output(command, text=True) + thread_timeout = timeout try: future.result(timeout=thread_timeout) @@ -2756,9 +2765,9 @@ def wait_drain_complete(future, timeout): except TimeoutError: print("drain node thread exceed timeout ({})s".format(thread_timeout)) drain_complete = False - future.cancel() + stop_drain_process() finally: - assert drain_complete is True + assert drain_complete is copmpleted def make_replica_on_specific_node(client, volume_name, node): # NOQA @@ -2787,15 +2796,11 @@ def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA assert eviction_requested is expect_state -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA -def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA +def test_drain_with_block_for_eviction_success(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. 
- --- - Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2880,16 +2885,10 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert checksum == test_data_checksum -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA core_api, # NOQA make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. - --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2921,7 +2920,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, nodes = client.list_node() evict_nodes = [node for node in nodes if node.id != host_id][:2] evict_source_node = evict_nodes[0] - # Create extra disk on current node node = client.by_id_node(host_id) disks = node.disks @@ -2993,7 +2991,7 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # Step 9 volume1 = client.by_id_volume(volume1_name) - assert len(volume1.replicas) == 1 + wait_for_volume_replica_count(client, volume1_name, 1) for replica in volume1.replicas: assert replica.hostId != evict_source_node.id @@ -3024,8 +3022,10 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, assert checksum2 == test_data_checksum2 -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_failure(): +def test_drain_with_block_for_eviction_failure(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ Test drain never completes with node-drain-policy block-for-eviction @@ -3040,7 +3040,47 @@ def test_drain_with_block_for_eviction_failure(): - Verify that `node.status.autoEvicting == true`. - Verify that `replica.spec.evictionRequested == true`. 7. Verify the drain never completes. + 8. Stop the drain, check volume is healthy and data correct """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + # Step 7 + wait_drain_complete(future, 90, False) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + wait_for_volume_healthy(client, volume_name) + data_path = '/data/test' + test_data_checksum = get_pod_data_md5sum(core_api, + pod, + data_path) + assert checksum == test_data_checksum + @pytest.mark.node # NOQA def test_auto_detach_volume_when_node_is_cordoned(client, core_api, volume_name): # NOQA
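
Note on the drain pattern used throughout these tests: `kubectl drain` blocks until every evictable pod has left the node, so the tests submit `drain_node` to a `ThreadPoolExecutor` and assert on `replica.spec.evictionRequested` and `node.status.autoEvicting` while the drain is still in progress. Because `Future.cancel()` cannot interrupt a call that has already started, a drain that is expected to hang (the `block-for-eviction` failure case) is stopped by killing the `kubectl` process instead. The following is a condensed, standalone sketch of that pattern, assuming `kubectl` is on PATH and cluster-admin access; the node name and timeout are placeholders, not values from the patch:

    # Minimal sketch of the background-drain pattern used by these tests.
    import subprocess
    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    def drain_node(node_name):
        # Blocks until the node is fully drained or kubectl fails.
        subprocess.run(["kubectl", "drain", node_name,
                        "--ignore-daemonsets", "--delete-emptydir-data"],
                       check=True)

    def wait_drain_complete(future, timeout, expect_completed=True):
        # Future.cancel() cannot stop a running call, so a drain that is
        # expected to block forever is killed at the OS level instead.
        try:
            future.result(timeout=timeout)
            completed = True
        except TimeoutError:
            completed = False
            subprocess.run(["pkill", "-f", "kubectl drain"], check=False)
        assert completed is expect_completed

    if __name__ == "__main__":
        executor = ThreadPoolExecutor(max_workers=1)
        future = executor.submit(drain_node, "worker-node-1")  # placeholder node
        # ...assert eviction state here while the drain is still running...
        wait_drain_complete(future, timeout=60)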
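
Note on reading the Replica custom resource: `get_replica_detail` above shells out to `kubectl get replicas.longhorn.io -n longhorn-system <name> -o yaml` and parses the output with PyYAML. The same `spec.evictionRequested` field can also be read through the Kubernetes Python client; the sketch below is an illustration only, and the CRD version (`v1beta2`) and the replica name are assumptions rather than values taken from the patch:

    # Sketch: fetch a Longhorn Replica CR via CustomObjectsApi instead of kubectl.
    from kubernetes import client, config

    def get_replica_detail(replica_name, namespace="longhorn-system"):
        config.load_kube_config()
        api = client.CustomObjectsApi()
        return api.get_namespaced_custom_object(
            group="longhorn.io",
            version="v1beta2",   # assumed; depends on the Longhorn release
            namespace=namespace,
            plural="replicas",
            name=replica_name,
        )

    replica = get_replica_detail("example-volume-r-abc123")  # hypothetical name
    print(replica["spec"]["evictionRequested"])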