From 6a084af571fac1871a1229fcaa118e15033ef7a0 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 19 Feb 2024 18:43:14 +0800 Subject: [PATCH 1/3] Add test case test_drain_with_block_for_eviction_success ref: 7521 Signed-off-by: Chris --- manager/integration/tests/common.py | 1 + manager/integration/tests/test_node.py | 151 ++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 28503ae13e..c9c36e0aa0 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -216,6 +216,7 @@ "allow-empty-node-selector-volume" SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY = "replica-disk-soft-anti-affinity" SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME = "allow-empty-disk-selector-volume" +SETTING_NODE_DRAIN_POLICY = "node-drain-policy" DEFAULT_BACKUP_COMPRESSION_METHOD = "lz4" BACKUP_COMPRESSION_METHOD_LZ4 = "lz4" diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 1e4ad5dd32..f54b0e5335 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -3,6 +3,7 @@ import os import subprocess import time +import yaml from random import choice from string import ascii_lowercase, digits @@ -47,8 +48,14 @@ from common import set_node_scheduling_eviction from common import update_node_disks from common import update_setting +from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 +from common import make_deployment_with_pvc # NOQA +from common import create_pv_for_volume +from common import create_pvc_for_volume, create_and_wait_deployment +from common import get_apps_api_client, write_pod_volume_random_data from backupstore import set_random_backupstore # NOQA +from concurrent.futures import ThreadPoolExecutor, TimeoutError CREATE_DEFAULT_DISK_LABEL = "node.longhorn.io/create-default-disk" @@ -2680,8 +2687,31 @@ def finalizer(): request.addfinalizer(finalizer) -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_success(): + +def drain_node(core_api, node): # NOQA + set_node_cordon(core_api, node.id, True) + + command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + subprocess.run(command, check=True) + + +def get_replica_detail(replica_name): + """ + Get allreplica information by this function + """ + command = ["kubectl", "get", + "replicas.longhorn.io", + "-n", + "longhorn-system", + replica_name, + "-o", + "yaml"] + output = subprocess.check_output(command, text=True) + replica_info = yaml.safe_load(output) + return replica_info + + +def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2693,7 +2723,6 @@ def test_drain_with_block_for_eviction_success(): 4. Write data to the volume. 5. Drain a node one of the volume's replicas is scheduled to. 6. While the drain is ongoing: - - Verify that the volume never becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true`. 7. Verify the drain completes. @@ -2703,6 +2732,122 @@ def test_drain_with_block_for_eviction_success(): 11. Verify that `replica.spec.evictionRequested == false`. 12. Verify the volume's data. 
""" + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + evict_target_node = evict_nodes[1] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume = client.create_volume(name=volume_name, + size=str(1 * Gi), + numberOfReplicas=3) + volume = common.wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": host_id} + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + pod_names = common.get_deployment_pod_names(core_api, deployment) + data_path = '/data/test' + write_pod_volume_random_data(core_api, + pod_names[0], + data_path, + DATA_SIZE_IN_MB_3) + expected_test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + volume = wait_for_volume_healthy(client, volume_name) + + # Make replica not locate on eviction target node + volume.updateReplicaCount(replicaCount=2) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + volume.replicaRemove(name=replica.name) + break + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 5 + # drain eviction source node + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_source_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is True + + nodes = client.list_node() + for node in nodes: + if node.id == evict_source_node.id: + assert node.autoEvicting is True + + # Step 7 + thread_timeout = 60 + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume = wait_for_volume_healthy(client, volume_name) + assert len(volume.replicas) == 2 + for replica in volume.replicas: + assert replica.hostId != evict_source_node.id + + # Stpe 10 + nodes = client.list_node() + for node in nodes: + assert node.autoEvicting is False + + # Step 11 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is False + + # Step 12 + test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + assert expected_test_data_checksum == test_data_checksum + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(): From b4892013e8e63401cf30733c14b9ec92c298a758 Mon Sep 
17 00:00:00 2001 From: Chris Date: Tue, 20 Feb 2024 16:48:23 +0800 Subject: [PATCH 2/3] Add test case test_drain_with_block_for_eviction_if_contains_last_replica_success ref: 7521 Signed-off-by: Chris --- manager/integration/Dockerfile | 2 +- manager/integration/tests/common.py | 39 ++++ manager/integration/tests/test_node.py | 291 ++++++++++++++++++------- 3 files changed, 257 insertions(+), 75 deletions(-) diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 8541f5edd9..09ccfc9c2f 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -1,6 +1,6 @@ FROM registry.suse.com/bci/python:3.9 -ARG KUBECTL_VERSION=v1.17.0 +ARG KUBECTL_VERSION=v1.28.4 ARG YQ_VERSION=v4.24.2 ARG TERRAFORM_VERSION=1.3.5 ARG ARCH=amd64 diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index c9c36e0aa0..38e2c78743 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -6123,3 +6123,42 @@ def wait_for_instance_manager_count(client, number, retry_counts=120): time.sleep(RETRY_INTERVAL_LONG) return len(ims) + + +def create_deployment_and_write_data(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc, # NOQA + volume_name, # NOQA + size, # NOQA + replica_count, # NOQA + data_size, # NOQA + attach_node_id=None): # NOQA + apps_api = get_apps_api_client() + volume = client.create_volume(name=volume_name, + size=size, + numberOfReplicas=replica_count) + volume = wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + if attach_node_id: + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": attach_node_id} + + create_and_wait_deployment(apps_api, deployment) + + data_path = '/data/test' + deployment_pod_names = get_deployment_pod_names(core_api, + deployment) + write_pod_volume_random_data(core_api, + deployment_pod_names[0], + data_path, + data_size) + checksum = get_pod_data_md5sum(core_api, + deployment_pod_names[0], + data_path) + + return client.by_id_volume(volume_name), deployment_pod_names[0], checksum diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index f54b0e5335..55d1b3bc28 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -50,9 +50,8 @@ from common import update_setting from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 from common import make_deployment_with_pvc # NOQA -from common import create_pv_for_volume -from common import create_pvc_for_volume, create_and_wait_deployment -from common import get_apps_api_client, write_pod_volume_random_data +from common import prepare_host_disk, wait_for_volume_degraded +from common import create_deployment_and_write_data from backupstore import set_random_backupstore # NOQA from concurrent.futures import ThreadPoolExecutor, TimeoutError @@ -2691,7 +2690,15 @@ def finalizer(): def drain_node(core_api, node): # NOQA set_node_cordon(core_api, node.id, True) - command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + command = [ + "kubectl", + "drain", + node.id, + "--ignore-daemonsets", + "--delete-emptydir-data", + "--grace-period=-1" + ] + subprocess.run(command, check=True) @@ -2711,8 +2718,84 @@ def 
get_replica_detail(replica_name): return replica_info +def check_node_auto_evict_state(client, target_node, expect_state): # NOQA + def get_specific_node(client, target_node): + nodes = client.list_node() + for node in nodes: + if node.id == target_node.id: + return node + + for i in range(RETRY_COUNTS): + node = get_specific_node(client, target_node) + if node.autoEvicting is expect_state: + break + time.sleep(RETRY_INTERVAL) + assert node.autoEvicting is expect_state + + +def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +def wait_drain_complete(future, timeout): + """ + Wait concurrent.futures object complete in a duration + """ + thread_timeout = timeout + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + +def make_replica_on_specific_node(client, volume_name, node): # NOQA + volume = client.by_id_volume(volume_name) + volume.updateReplicaCount(replicaCount=1) + for replica in volume.replicas: + if replica.hostId != node.id: + volume.replicaRemove(name=replica.name) + wait_for_volume_replica_count(client, volume_name, 1) + + +def get_all_replica_name(client, volume_name): # NOQA + volume_replicas = [] + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + volume_replicas.append(replica.name) + + return volume_replicas + + +def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + replica_info = get_replica_detail(replica.name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. 
+ --- + Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2744,33 +2827,13 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma client.update(setting, value="block-for-eviction") # Step 2, 3, 4 - volume = client.create_volume(name=volume_name, - size=str(1 * Gi), - numberOfReplicas=3) - volume = common.wait_for_volume_detached(client, volume_name) - - pvc_name = volume_name + "-pvc" - create_pv_for_volume(client, core_api, volume, volume_name) - create_pvc_for_volume(client, core_api, volume, pvc_name) - deployment_name = volume_name + "-dep" - deployment = make_deployment_with_pvc(deployment_name, pvc_name) - deployment["spec"]["template"]["spec"]["nodeSelector"] \ - = {"kubernetes.io/hostname": host_id} - - apps_api = get_apps_api_client() - create_and_wait_deployment(apps_api, deployment) - - pod_names = common.get_deployment_pod_names(core_api, deployment) - data_path = '/data/test' - write_pod_volume_random_data(core_api, - pod_names[0], - data_path, - DATA_SIZE_IN_MB_3) - expected_test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], - data_path) - - volume = wait_for_volume_healthy(client, volume_name) + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA # Make replica not locate on eviction target node volume.updateReplicaCount(replicaCount=2) @@ -2787,33 +2850,11 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma future = executor.submit(drain_node, core_api, evict_source_node) # Step 6 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_source_node.id: - replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is True - - nodes = client.list_node() - for node in nodes: - if node.id == evict_source_node.id: - assert node.autoEvicting is True + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) # Step 7 - thread_timeout = 60 - try: - future.result(timeout=thread_timeout) - drain_complete = True - except TimeoutError: - print("drain node thread exceed timeout ({})s".format(thread_timeout)) - drain_complete = False - future.cancel() - finally: - assert drain_complete is True - + wait_drain_complete(future, 60) wait_for_volume_replica_count(client, volume_name, 2) # Step 8 @@ -2826,32 +2867,29 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert replica.hostId != evict_source_node.id # Stpe 10 - nodes = client.list_node() - for node in nodes: - assert node.autoEvicting is False + check_node_auto_evict_state(client, evict_source_node, False) # Step 11 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_target_node.id: - replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is False + check_replica_evict_state(client, volume_name, evict_target_node, False) # Step 12 + data_path = data_path = '/data/test' test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], + pod, data_path) + assert checksum == test_data_checksum - assert expected_test_data_checksum == 
test_data_checksum - -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_if_contains_last_replica_success(): +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA +def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. + --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2864,7 +2902,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 4. Write data to the volumes. 5. Drain a node both volumes have a replica scheduled to. 6. While the drain is ongoing: - - Verify that the volume with one replica never becomes degraded. - Verify that the volume with three replicas becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true` on the @@ -2880,6 +2917,112 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 12. Verify that `replica.spec.evictionRequested == false` on all replicas. 13. Verify the the data in both volumes. """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Create extra disk on current node + node = client.by_id_node(host_id) + disks = node.disks + + disk_volume_name = 'vol-disk' + disk_volume = client.create_volume(name=disk_volume_name, + size=str(2 * Gi), + numberOfReplicas=1, + dataLocality="strict-local") + disk_volume = wait_for_volume_detached(client, disk_volume_name) + + disk_volume.attach(hostId=host_id) + disk_volume = wait_for_volume_healthy(client, disk_volume_name) + disk_path = prepare_host_disk(get_volume_endpoint(disk_volume), + disk_volume_name) + disk = {"path": disk_path, "allowScheduling": True} + + update_disk = get_update_disks(disks) + update_disk["disk1"] = disk + + node = update_node_disks(client, node.name, disks=update_disk, retry=True) + node = wait_for_disk_update(client, host_id, len(update_disk)) + assert len(node.disks) == len(update_disk) + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction-if-contains-last-replica") + + # Step 2, 3 + volume1_name = "vol-1" + volume2_name = "vol-2" + volume1, pod1, checksum1 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume1_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + volume2, pod2, checksum2 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume2_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + # Make volume 1 replica only located on evict_source_node + make_replica_on_specific_node(client, volume1_name, evict_source_node) + volume2_replicas = get_all_replica_name(client, volume2_name) + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume1_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + volume2 = wait_for_volume_degraded(client, volume2_name) + check_all_replicas_evict_state(client, 
volume2_name, False) + + # Step 7 + wait_drain_complete(future, 60) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume1 = client.by_id_volume(volume1_name) + assert len(volume1.replicas) == 1 + for replica in volume1.replicas: + assert replica.hostId != evict_source_node.id + + # Step 10 + # Verify volume2 replicas not moved by check replica name + # stored before the node drain + volume2 = wait_for_volume_healthy(client, volume2_name) + for replica in volume2.replicas: + assert replica.name in volume2_replicas + + # Step 11 + check_node_auto_evict_state(client, evict_source_node, False) + + # Step 12 + check_all_replicas_evict_state(client, volume1_name, False) + check_all_replicas_evict_state(client, volume2_name, False) + + # Step 13 + data_path = '/data/test' + test_data_checksum1 = get_pod_data_md5sum(core_api, + pod1, + data_path) + assert checksum1 == test_data_checksum1 + + test_data_checksum2 = get_pod_data_md5sum(core_api, + pod2, + data_path) + assert checksum2 == test_data_checksum2 + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_failure(): From 2f3ad3e97d006f44d21d30365f27dc142c4d119e Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 23 Feb 2024 17:48:15 +0800 Subject: [PATCH 3/3] Add test case test_drain_with_block_for_eviction_failure ref: 7521 Signed-off-by: Chris --- manager/integration/tests/test_node.py | 82 +++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 55d1b3bc28..d71011fa4a 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -2745,10 +2745,19 @@ def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA assert eviction_requested is expect_state -def wait_drain_complete(future, timeout): +def wait_drain_complete(future, timeout, copmpleted=True): """ Wait concurrent.futures object complete in a duration """ + def stop_drain_process(): + """ + Both future.cancel() and executer.shutdown(wait=False) can not really + stop the drain process. + Use this function to stop drain process + """ + command = ["pkill", "-f", "kubectl drain"] + subprocess.check_output(command, text=True) + thread_timeout = timeout try: future.result(timeout=thread_timeout) @@ -2756,9 +2765,9 @@ def wait_drain_complete(future, timeout): except TimeoutError: print("drain node thread exceed timeout ({})s".format(thread_timeout)) drain_complete = False - future.cancel() + stop_drain_process() finally: - assert drain_complete is True + assert drain_complete is copmpleted def make_replica_on_specific_node(client, volume_name, node): # NOQA @@ -2787,15 +2796,11 @@ def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA assert eviction_requested is expect_state -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA -def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA +def test_drain_with_block_for_eviction_success(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. 
- --- - Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2880,16 +2885,10 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert checksum == test_data_checksum -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA core_api, # NOQA make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. - --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2921,7 +2920,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, nodes = client.list_node() evict_nodes = [node for node in nodes if node.id != host_id][:2] evict_source_node = evict_nodes[0] - # Create extra disk on current node node = client.by_id_node(host_id) disks = node.disks @@ -2993,7 +2991,7 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # Step 9 volume1 = client.by_id_volume(volume1_name) - assert len(volume1.replicas) == 1 + wait_for_volume_replica_count(client, volume1_name, 1) for replica in volume1.replicas: assert replica.hostId != evict_source_node.id @@ -3024,8 +3022,10 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, assert checksum2 == test_data_checksum2 -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_failure(): +def test_drain_with_block_for_eviction_failure(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ Test drain never completes with node-drain-policy block-for-eviction @@ -3040,7 +3040,47 @@ def test_drain_with_block_for_eviction_failure(): - Verify that `node.status.autoEvicting == true`. - Verify that `replica.spec.evictionRequested == true`. 7. Verify the drain never completes. + 8. Stop the drain, check volume is healthy and data correct """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + # Step 7 + wait_drain_complete(future, 90, False) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + wait_for_volume_healthy(client, volume_name) + data_path = '/data/test' + test_data_checksum = get_pod_data_md5sum(core_api, + pod, + data_path) + assert checksum == test_data_checksum + @pytest.mark.node # NOQA def test_auto_detach_volume_when_node_is_cordoned(client, core_api, volume_name): # NOQA
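
Note on the drain pattern used throughout these tests: `kubectl drain` blocks until every evictable pod has left the node, so the tests submit `drain_node` to a `ThreadPoolExecutor` and assert on `replica.spec.evictionRequested` and `node.status.autoEvicting` while the drain is still in progress. Because `Future.cancel()` cannot interrupt a call that has already started, a drain that is expected to hang (the `block-for-eviction` failure case) is stopped by killing the `kubectl` process instead. The following is a condensed, standalone sketch of that pattern, assuming `kubectl` is on PATH and cluster-admin access; the node name and timeout are placeholders, not values from the patch:

    # Minimal sketch of the background-drain pattern used by these tests.
    import subprocess
    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    def drain_node(node_name):
        # Blocks until the node is fully drained or kubectl fails.
        subprocess.run(["kubectl", "drain", node_name,
                        "--ignore-daemonsets", "--delete-emptydir-data"],
                       check=True)

    def wait_drain_complete(future, timeout, expect_completed=True):
        # Future.cancel() cannot stop a running call, so a drain that is
        # expected to block forever is killed at the OS level instead.
        try:
            future.result(timeout=timeout)
            completed = True
        except TimeoutError:
            completed = False
            subprocess.run(["pkill", "-f", "kubectl drain"], check=False)
        assert completed is expect_completed

    if __name__ == "__main__":
        executor = ThreadPoolExecutor(max_workers=1)
        future = executor.submit(drain_node, "worker-node-1")  # placeholder node
        # ...assert eviction state here while the drain is still running...
        wait_drain_complete(future, timeout=60)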
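
Note on reading the Replica custom resource: `get_replica_detail` above shells out to `kubectl get replicas.longhorn.io -n longhorn-system <name> -o yaml` and parses the output with PyYAML. The same `spec.evictionRequested` field can also be read through the Kubernetes Python client; the sketch below is an illustration only, and the CRD version (`v1beta2`) and the replica name are assumptions rather than values taken from the patch:

    # Sketch: fetch a Longhorn Replica CR via CustomObjectsApi instead of kubectl.
    from kubernetes import client, config

    def get_replica_detail(replica_name, namespace="longhorn-system"):
        config.load_kube_config()
        api = client.CustomObjectsApi()
        return api.get_namespaced_custom_object(
            group="longhorn.io",
            version="v1beta2",   # assumed; depends on the Longhorn release
            namespace=namespace,
            plural="replicas",
            name=replica_name,
        )

    replica = get_replica_detail("example-volume-r-abc123")  # hypothetical name
    print(replica["spec"]["evictionRequested"])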