57 changes: 57 additions & 0 deletions tests/storage/linstor/conftest.py
@@ -3,11 +3,14 @@
import pytest

import functools
import json
import logging
import os
from dataclasses import dataclass

import lib.commands as commands
from lib.common import safe_split
from lib.sr import SR

# explicit import for package-scope fixtures
from pkgfixtures import pool_with_saved_yum_state
@@ -19,6 +22,7 @@
from lib.pool import Pool
from lib.sr import SR
from lib.vdi import VDI
from lib.vm import VM

GROUP_NAME = 'linstor_group'
STORAGE_POOL_NAME = f'{GROUP_NAME}/thin_device'
@@ -171,3 +175,56 @@ def vm_on_linstor_sr(host: Host, linstor_sr: SR, vm_ref: str):
    yield vm
    logging.info("<< Destroy VM")
    vm.destroy(verify=True)

@pytest.fixture(scope='function')
def host_and_corrupted_vdi_on_linstor_sr(host: Host, linstor_sr: SR, vm_ref: str):
Member

May this function-scope fixture be reused in future tests? If yes, I think conftest.py is the right location for it. If not, it would be better defined close to the only function that currently requires it.

    vm: VM = host.import_vm(vm_ref, sr_uuid=linstor_sr.uuid)
Contributor

I think this should be created in a fixture
That would avoid the big try/catch at the end
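For instance, a minimal sketch of such a fixture (hypothetical name, reusing only calls already present in this conftest):

@pytest.fixture(scope='function')
def imported_vm_on_linstor_sr(host: Host, linstor_sr: SR, vm_ref: str):
    vm: VM = host.import_vm(vm_ref, sr_uuid=linstor_sr.uuid)
    yield vm
    logging.info("<< Destroy VM")
    vm.destroy(verify=True)

The corruption logic could then depend on this fixture and drop the try/finally.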

Contributor Author

I need the VM lifetime to be directly tied to the fixture's lifetime, and since we already have a vm_on_linstor_sr fixture with the "inappropriate" scope, we would end up with two fixtures that differ only in scope.

Maybe worth a comment?

Member

This will break when vm_ref is a UUID referring to an existing VM. This also totally bypasses local cache for imported VMs.

If it's best to import a VM and you don't care which one, then there's no advantage in using vm_ref: you should import one of the mini VMs instead, as we already do in some storage tests:

vm = host.import_vm(vm_image('mini-linux-x86_64-bios'), sr_uuid=sr.uuid)

On the contrary, if you want the test to run with any VM we can test (as part of a "multi" job), then maybe you should still rely on the imported_vm fixture, but then copy it to the SR where you need it.
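To illustrate that second option, a rough sketch (hypothetical fixture name; it assumes Host.xe() wraps the xe CLI and that VM(uuid, host) is a valid constructor, both of which may differ from the actual lib API):

@pytest.fixture(scope='function')
def vm_copied_to_linstor_sr(host, imported_vm, linstor_sr):
    # `xe vm-copy` makes a full copy of the VM and its disks on the chosen SR
    copied_uuid = host.xe('vm-copy', {
        'uuid': imported_vm.uuid,
        'new-name-label': 'linstor-test-copy',
        'sr-uuid': linstor_sr.uuid,
    })
    vm = VM(copied_uuid, host)
    yield vm
    vm.destroy(verify=True)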

And if what you want is actually an empty VM with just a disk that you can break, then maybe it's time we implemented creating a VM from scratch, without relying on an XVA.

Worth a discussion.

    pool: Pool = host.pool
    master: Host = pool.master

    def get_vdi_volume_name_from_linstor() -> str:
Member

Why are the helper functions local to the fixture definition? Is there no chance we could reuse them in other tests? This makes the fixture definition rather big.
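For example, the volume lookup could move to module level with its dependencies passed explicitly. A sketch reusing the body from this diff (only the signature changes; untested):

def get_vdi_volume_name_from_linstor(master: Host, vdi_uuid: str) -> str:
    result = master.ssh([
        "linstor-kv-tool", "--dump-volumes",
        "-g", f"xcp-sr-{GROUP_NAME}_thin_device"
    ])
    volumes = json.loads(result)
    for k, v in volumes.items():
        # the key path is expected to hold the VDI UUID at index 2 and the entry type at index 3
        path = safe_split(k, "/")
        if len(path) >= 4 and path[2] == vdi_uuid and path[3] == "volume-name":
            return v
    raise FileNotFoundError(f"Could not find matching linstor volume for `{vdi_uuid}`")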

        result = master.ssh([
            "linstor-kv-tool",
            "--dump-volumes",
            "-g",
            f"xcp-sr-{GROUP_NAME}_thin_device"
        ])
        volumes = json.loads(result)
        for k, v in volumes.items():
            path = safe_split(k, "/")
            if len(path) < 4:
                continue
            uuid = path[2]
            data_type = path[3]
            if uuid == vdi_uuid and data_type == "volume-name":
                return v
        raise FileNotFoundError(f"Could not find matching linstor volume for `{vdi_uuid}`")

    def get_vdi_host(path: str) -> Host:
        for h in pool.hosts:
            result = h.ssh(["test", "-e", path], simple_output=False, check=False)
            if result.returncode == 0:
                return h
        raise FileNotFoundError(f"Could not find matching host for `{vdi_uuid}`")

    try:
        vdi_uuid: str = next((
            vdi.uuid for vdi in vm.vdis if vdi.sr.uuid == linstor_sr.uuid
        ))

        volume_name = get_vdi_volume_name_from_linstor()
        lv_path = f"/dev/{GROUP_NAME}/{volume_name}_00000"
        vdi_host = get_vdi_host(lv_path)
        logging.info("[%s]: corrupting `%s`", vdi_host, lv_path)
        vdi_host.ssh([
            "dd",
            "if=/dev/urandom",
            f"of={lv_path}",
            "bs=4096",
            # Lower values seem to go undetected sometimes
            "count=10000"  # ~40MB
        ])
        yield vm, vdi_host, volume_name
    finally:
        logging.info("<< Destroy corrupted VDI")
        vm.destroy(verify=True)
85 changes: 85 additions & 0 deletions tests/storage/linstor/test_linstor_sr.py
@@ -1,14 +1,19 @@
import pytest

import json
import logging
import time

from lib.commands import SSHCommandFailed
from lib.common import vm_image, wait_for
from lib.host import Host
from lib.vm import VM
from tests.storage import vdi_is_open

from .conftest import LINSTOR_PACKAGE

from typing import Tuple

# Requirements:
# - two or more XCP-ng hosts >= 8.2 with additional unused disk(s) for the SR
# - access to XCP-ng RPM repository from the host
@@ -52,6 +57,29 @@ def test_create_and_destroy_sr(self, pool_with_linstor, provisioning_type, stora
        vm.destroy(verify=True)
        sr.destroy(verify=True)


def get_drbd_status(host: Host, resource: str):
    logging.debug("[%s] Fetching DRBD status for resource `%s`...", host, resource)
    return json.loads(host.ssh(["drbdsetup", "status", resource, "--json"]))

def get_corrupted_resources(host: Host, resource: str):
    return [
        (
            res.get("name", ""),
            conn.get("name", ""),
            peer.get("out-of-sync", 0),
        )
        for res in get_drbd_status(host, resource)
        for conn in res.get("connections", [])
        for peer in conn.get("peer_devices", [])
        if peer.get("out-of-sync", 0) > 0
    ]

def wait_drbd_sync(host: Host, resource: str):
    logging.info("[%s] Waiting for DRBD sync on resource `%s`...", host, resource)
    host.ssh(["drbdadm", "wait-sync", resource])


@pytest.mark.usefixtures("linstor_sr")
class TestLinstorSR:
    @pytest.mark.quicktest
Expand Down Expand Up @@ -88,6 +116,63 @@ def test_snapshot(self, vm_on_linstor_sr):
        finally:
            vm.shutdown(verify=True)

    @pytest.mark.small_vm
    def test_resynchronization(
        self, host_and_corrupted_vdi_on_linstor_sr: Tuple[VM, Host, str]
    ):
        (vm, host, resource_name) = host_and_corrupted_vdi_on_linstor_sr
        hostname = host.hostname()

        try:
            other_host = next(
                next(h for h in host.pool.hosts if h.hostname() == conn.get("name", ""))
                for res in get_drbd_status(host, resource_name)
                for conn in res.get("connections", [])
                for peer in conn.get("peer_devices", [])
                if peer.get("peer-disk-state", "") == "UpToDate"
            )
            logging.info("Elected `%s` as peer for verification and repair", other_host)
        except StopIteration:
            pytest.fail("Could not find an UpToDate peer host")

        corrupted = None
        max_attempts = 3
        # Attempting several times since testing revealed `drbdadm verify` can be flaky
        for attempt in range(1, max_attempts + 1):
            logging.info("`drbdadm verify` attempt %d/%d", attempt, max_attempts)
            logging.info("[%s] Running DRBD verify for %s...", other_host, resource_name)
            other_host.ssh(["drbdadm", "verify", f"{resource_name}:{hostname}/0"])
            wait_drbd_sync(other_host, resource_name)

            corrupted_resources = get_corrupted_resources(other_host, resource_name)
            if not corrupted_resources:
                logging.warning("No corrupted resources found on attempt #%d", attempt)
                continue
            for res_name, peer_name, out_of_sync in corrupted_resources:
                if res_name == resource_name and peer_name == hostname:
                    corrupted = (res_name, peer_name, out_of_sync)
            if corrupted:
                break
        if not corrupted:
            pytest.fail(f"Failed to identify corrupted resource after {max_attempts} attempts")

        logging.info("Invalidating remote resource `%s`...", resource_name)
        other_host.ssh([
            "drbdadm", "invalidate-remote",
            f"{resource_name}:{hostname}/0",
            "--reset-bitmap=no"
        ])
        wait_drbd_sync(other_host, resource_name)
        if get_corrupted_resources(other_host, resource_name):
            pytest.fail("Corrupted resource did not get fixed")

        vm.start(on=host.uuid)
        try:
            vm.wait_for_os_booted()
            vm.test_snapshot_on_running_vm()
        finally:
            vm.shutdown(verify=True)

    # *** tests with reboots (longer tests).

    @pytest.mark.reboot