diff --git a/dissect/hypervisor/__init__.py b/dissect/hypervisor/__init__.py index 6716621..2fd706c 100644 --- a/dissect/hypervisor/__init__.py +++ b/dissect/hypervisor/__init__.py @@ -1,4 +1,3 @@ -from dissect.hypervisor.backup import vma, xva from dissect.hypervisor.descriptor import hyperv, ovf, pvs, vbox, vmx from dissect.hypervisor.disk import hdd, qcow2, vdi, vhd, vhdx, vmdk from dissect.hypervisor.util import envelope, vmtar @@ -14,9 +13,7 @@ "vdi", "vhd", "vhdx", - "vma", "vmdk", "vmtar", "vmx", - "xva", ] diff --git a/dissect/hypervisor/backup/__init__.py b/dissect/hypervisor/backup/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/dissect/hypervisor/backup/c_vma.py b/dissect/hypervisor/backup/c_vma.py deleted file mode 100644 index a071709..0000000 --- a/dissect/hypervisor/backup/c_vma.py +++ /dev/null @@ -1,60 +0,0 @@ -from dissect.cstruct import cstruct - -vma_def = """ -#define VMA_BLOCK_BITS 12 -#define VMA_BLOCK_SIZE (1 << VMA_BLOCK_BITS) -#define VMA_CLUSTER_BITS (VMA_BLOCK_BITS + 4) -#define VMA_CLUSTER_SIZE (1 << VMA_CLUSTER_BITS) - -#define VMA_EXTENT_HEADER_SIZE 512 -#define VMA_BLOCKS_PER_EXTENT 59 -#define VMA_MAX_CONFIGS 256 - -#define VMA_MAX_EXTENT_SIZE (VMA_EXTENT_HEADER_SIZE + VMA_CLUSTER_SIZE * VMA_BLOCKS_PER_EXTENT) - -/* File Format Definitions */ - -struct VmaDeviceInfoHeader { - uint32 devname_ptr; /* offset into blob_buffer table */ - uint32 reserved0; - uint64 size; /* device size in bytes */ - uint64 reserved1; - uint64 reserved2; -}; - -struct VmaHeader { - char magic[4]; - uint32 version; - char uuid[16]; - int64 ctime; - char md5sum[16]; - - uint32 blob_buffer_offset; - uint32 blob_buffer_size; - uint32 header_size; - - char _reserved1[1984]; - - uint32 config_names[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */ - uint32 config_data[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */ - - char _reserved2[4]; - - VmaDeviceInfoHeader dev_info[256]; -}; - -struct VmaExtentHeader { - char magic[4]; - uint16 reserved1; - uint16 block_count; - char uuid[16]; - char md5sum[16]; - uint64 blockinfo[VMA_BLOCKS_PER_EXTENT]; -}; -""" - -c_vma = cstruct(endian=">").load(vma_def) - - -VMA_MAGIC = b"VMA\x00" -VMA_EXTENT_MAGIC = b"VMAE" diff --git a/dissect/hypervisor/backup/vma.py b/dissect/hypervisor/backup/vma.py deleted file mode 100644 index 9a733d0..0000000 --- a/dissect/hypervisor/backup/vma.py +++ /dev/null @@ -1,269 +0,0 @@ -# References: -# - https://git.proxmox.com/?p=pve-qemu.git;a=blob;f=vma_spec.txt -# - https://lists.gnu.org/archive/html/qemu-devel/2013-02/msg03667.html - -import hashlib -import struct -from collections import defaultdict -from functools import lru_cache -from uuid import UUID - -from dissect.util import ts -from dissect.util.stream import AlignedStream - -from dissect.hypervisor.backup.c_vma import VMA_EXTENT_MAGIC, VMA_MAGIC, c_vma -from dissect.hypervisor.exceptions import InvalidHeaderError - - -class VMA: - """Proxmox VMA. - - Parse and provide a readable object for devices in a Proxmox VMA backup file. - VMA is designed to be streamed for extraction, so we need to do some funny stuff to create a readable - object from it. Performance is not optimal, so it's generally advised to extract a VMA instead. - The vma-extract utility can be used for that. - """ - - def __init__(self, fh): - self.fh = fh - - offset = fh.tell() - self.header = c_vma.VmaHeader(fh) - if self.header.magic != VMA_MAGIC: - raise InvalidHeaderError("Invalid VMA header magic") - - fh.seek(offset) - header_data = bytearray(fh.read(self.header.header_size)) - header_data[32:48] = b"\x00" * 16 - if hashlib.md5(header_data).digest() != self.header.md5sum: - raise InvalidHeaderError("Invalid VMA checksum") - - self.version = self.header.version - self.uuid = UUID(bytes=self.header.uuid) - - blob_start = self.header.blob_buffer_offset - blob_end = self.header.blob_buffer_offset + self.header.blob_buffer_size - self._blob = memoryview(bytes(header_data))[blob_start:blob_end] - - blob_offset = 1 - self._blob_data = {} - while blob_offset + 2 <= self.header.blob_buffer_size: - # The header is in big endian, but this is little... - size = struct.unpack("" - - def open(self): - return DeviceDataStream(self) - - -class Extent: - def __init__(self, fh, offset): - self.fh = fh - self.offset = offset - self.data_offset = offset + c_vma.VMA_EXTENT_HEADER_SIZE - - self.fh.seek(offset) - header_data = bytearray(fh.read(c_vma.VMA_EXTENT_HEADER_SIZE)) - self.header = c_vma.VmaExtentHeader(header_data) - if self.header.magic != VMA_EXTENT_MAGIC: - raise InvalidHeaderError("Invalid VMA extent header magic") - - header_data[24:40] = b"\x00" * 16 - if hashlib.md5(header_data).digest() != self.header.md5sum: - raise InvalidHeaderError("Invalid VMA extent checksum") - - self.uuid = UUID(bytes=self.header.uuid) - self.size = self.header.block_count * c_vma.VMA_BLOCK_SIZE - - # Keep track of the lowest and highest cluster we have for any device - # We can use this to speed up extent lookup later on - # There are at most 59 entries, so safe to parse ahead of use - self._min = {} - self._max = {} - self.blocks = defaultdict(list) - block_offset = self.data_offset - for block_info in self.header.blockinfo: - cluster_num = block_info & 0xFFFFFFFF - dev_id = (block_info >> 32) & 0xFF - mask = block_info >> (32 + 16) - - if dev_id == 0: - continue - - if dev_id not in self._min: - self._min[dev_id] = cluster_num - self._max[dev_id] = cluster_num - elif cluster_num < self._min[dev_id]: - self._min[dev_id] = cluster_num - elif cluster_num > self._max[dev_id]: - self._max[dev_id] = cluster_num - - self.blocks[dev_id].append((cluster_num, mask, block_offset)) - - if mask == 0xFFFF: - block_offset += 16 * c_vma.VMA_BLOCK_SIZE - elif mask == 0: - pass - else: - block_offset += bin(mask).count("1") * c_vma.VMA_BLOCK_SIZE - - def __repr__(self): - return f"" - - -class DeviceDataStream(AlignedStream): - def __init__(self, device): - self.device = device - self.vma = device.vma - super().__init__(size=device.size, align=c_vma.VMA_CLUSTER_SIZE) - - def _read(self, offset, length): - cluster_offset = offset // c_vma.VMA_CLUSTER_SIZE - cluster_count = (length + c_vma.VMA_CLUSTER_SIZE - 1) // c_vma.VMA_CLUSTER_SIZE - block_count = (length + c_vma.VMA_BLOCK_SIZE - 1) // c_vma.VMA_BLOCK_SIZE - - result = [] - for _, mask, block_offset in _iter_clusters(self.vma, self.device.id, cluster_offset, cluster_count): - read_count = min(block_count, 16) - - # Optimize reading fully set and fully sparse masks - if mask == 0xFFFF: - self.vma.fh.seek(block_offset) - result.append(self.vma.fh.read(c_vma.VMA_BLOCK_SIZE * read_count)) - elif mask == 0: - result.append(b"\x00" * read_count * c_vma.VMA_BLOCK_SIZE) - else: - self.vma.fh.seek(block_offset) - for allocated, count in _iter_mask(mask, read_count): - if allocated: - result.append(self.vma.fh.read(c_vma.VMA_BLOCK_SIZE * count)) - else: - result.append(b"\x00" * count * c_vma.VMA_BLOCK_SIZE) - - block_count -= read_count - if block_count == 0: - break - - return b"".join(result) - - -def _iter_clusters(vma, dev_id, cluster, count): - # Find clusters and starting offsets in all extents - temp = {} - end = cluster + count - - for extent in vma.extents(): - if dev_id not in extent.blocks: - continue - - if end < extent._min[dev_id] or cluster > extent._max[dev_id]: - continue - - for cluster_num, mask, block_offset in extent.blocks[dev_id]: - if cluster_num == cluster: - yield cluster_num, mask, block_offset - cluster += 1 - - while cluster in temp: - yield temp[cluster] - del temp[cluster] - cluster += 1 - elif cluster < cluster_num <= end: - temp[cluster_num] = (cluster_num, mask, block_offset) - - if cluster == end: - break - - if cluster == end: - break - - while cluster in temp: - yield temp[cluster] - del temp[cluster] - cluster += 1 - - -def _iter_mask(mask, length): - # Yield consecutive bitmask values - current_status = mask & 1 - current_count = 0 - - for bit_idx in range(length): - status = (mask & (1 << bit_idx)) >> bit_idx - if status == current_status: - current_count += 1 - else: - yield current_status, current_count - current_status = status - current_count = 1 - - if current_count: - yield current_status, current_count diff --git a/dissect/hypervisor/backup/xva.py b/dissect/hypervisor/backup/xva.py deleted file mode 100644 index 692891c..0000000 --- a/dissect/hypervisor/backup/xva.py +++ /dev/null @@ -1,136 +0,0 @@ -import hashlib -import tarfile -from bisect import bisect_right -from xml.etree import ElementTree - -from dissect.util.stream import AlignedStream - -BLOCK_SIZE = 1024 * 1024 - - -class XVA: - """XVA reader. - - XenCenter export format. Basically a tar file with "blocks" of 1MB. - """ - - def __init__(self, fh): - # We don't have to cache tar members, tarfile already does that for us - self.tar = tarfile.open(fileobj=fh) - self._ova = None - - @property - def ova(self): - if not self._ova: - ova_member = self.tar.getmember("ova.xml") - ova_fh = self.tar.extractfile(ova_member) - self._ova = ElementTree.fromstring(ova_fh.read()) - return self._ova - - def disks(self): - return [ - el.text - for el in self.ova.findall( - "*//member/name[.='VDI']/../..//name[.='type']/..value[.='Disk']/../..//name[.='VDI']/../value" - ) - ] - - def open(self, ref, verify=False): - size = int( - self.ova.find(f"*//member/name[.='id']/../value[.='{ref}']/../..//name[.='virtual_size']/../value").text - ) - return XVAStream(self, ref, size, verify) - - -class XVAStream(AlignedStream): - """XVA stream. - - XenServer usually just streams an XVA file right into an output file, so our use-case requires a bit - more trickery. We generally don't stream directly into an output file, but try to create a file-like - object for other code to use. - - The numbers for the block files (weirdly) don't represent offsets. It's possible for a block file - to be 0 sized, in which case you should "add" that block to the stream, and continue on to the next. - The next block might have a number + 1 of what your current offset is, but it will still contain the - data for that current offset. For this reason we build a lookup list with offsets. - """ - - def __init__(self, xva, ref, size, verify=False): - self.xva = xva - self.ref = ref - self.verify = verify - - index = 0 - offset = 0 - self._lookup = [] - self._members = [] - for block_index, block_member, checksum_member in _iter_block_files(xva, ref): - if block_index > index + 1: - skipped = block_index - (index + 1) - offset += skipped * BLOCK_SIZE - - if block_member.size != 0: - self._lookup.append(offset) - self._members.append((block_member, checksum_member)) - - offset += block_member.size - - index = block_index - - super().__init__(size, align=BLOCK_SIZE) - - def _read(self, offset, length): - result = [] - - while length > 0: - # This method is probably sub-optimal, but it's fairly low effort and we rarely encounter XVA anyway - block_idx = bisect_right(self._lookup, offset) - nearest_offset = self._lookup[block_idx - 1] - - if offset >= nearest_offset + BLOCK_SIZE: - result.append(b"\x00" * BLOCK_SIZE) - else: - block_member, checksum_member = self._members[block_idx - 1] - buf = self.xva.tar.extractfile(block_member).read() - - if self.verify: - if checksum_member is None: - raise ValueError(f"No checksum for {block_member.name}") - - if ( - checksum_member.name.endswith("checksum") - and hashlib.sha1(buf).hexdigest() != self.xva.tar.extractfile(checksum_member).read().decode() - ): - raise ValueError(f"Invalid checksum for {checksum_member.name}") - else: - raise NotImplementedError(f"Unsupported checksum: {checksum_member.name}") - - result.append(buf) - - offset += BLOCK_SIZE - length -= BLOCK_SIZE - - return b"".join(result) - - -def _iter_block_files(xva, ref): - member_index = None - block_member = None - checksum_member = None - - for member in xva.tar.getmembers(): - if not member.name.startswith(ref): - continue - - index = int(member.name.split("/")[-1].split(".")[0]) - if member_index is None: - member_index = index - - if member_index != index: - yield (member_index, block_member, checksum_member) - member_index = index - - if member.name.endswith(("checksum", "xxhash")): - checksum_member = member - else: - block_member = member diff --git a/dissect/hypervisor/tools/vma.py b/dissect/hypervisor/tools/vma.py deleted file mode 100644 index 781d556..0000000 --- a/dissect/hypervisor/tools/vma.py +++ /dev/null @@ -1,173 +0,0 @@ -import argparse -import logging -import sys -from pathlib import Path - -from dissect.hypervisor.backup.c_vma import c_vma -from dissect.hypervisor.backup.vma import VMA, _iter_mask - -try: - from rich.logging import RichHandler - from rich.progress import ( - BarColumn, - DownloadColumn, - Progress, - TextColumn, - TimeRemainingColumn, - TransferSpeedColumn, - ) - - progress = Progress( - TextColumn("[bold blue]{task.fields[filename]}", justify="right"), - BarColumn(bar_width=None), - "[progress.percentage]{task.percentage:>3.1f}%", - "•", - DownloadColumn(), - "•", - TransferSpeedColumn(), - "•", - TimeRemainingColumn(), - transient=True, - ) -except ImportError: - RichHandler = logging.StreamHandler - - class Progress: - def __init__(self): - self.filename = None - self.total = None - - self._task_id = 0 - self._info = {} - - def __enter__(self): - pass - - def __exit__(self, *args, **kwargs): - sys.stderr.write("\n") - sys.stderr.flush() - - def add_task(self, name, filename, total, **kwargs): - task_id = self._task_id - self._task_id += 1 - - self._info[task_id] = {"filename": filename, "total": total, "position": 0} - - return task_id - - def update(self, task_id, advance): - self._info[task_id]["position"] += advance - self.draw() - - def draw(self): - infos = [] - for info in self._info.values(): - infos.append(f"{info['filename']} {(info['position'] / info['total']) * 100:0.2f}%") - sys.stderr.write("\r" + " | ".join(infos)) - sys.stderr.flush() - - progress = Progress() - - -log = logging.getLogger(__name__) - - -def setup_logging(logger, verbosity): - if verbosity == 1: - level = logging.ERROR - elif verbosity == 2: - level = logging.WARNING - elif verbosity == 3: - level = logging.INFO - elif verbosity >= 4: - level = logging.DEBUG - else: - level = logging.CRITICAL - - handler = RichHandler() - handler.setFormatter(logging.Formatter("%(message)s")) - handler.setLevel(level) - logger.addHandler(handler) - logger.setLevel(level) - - -def main(): - parser = argparse.ArgumentParser(description="VMA extractor") - parser.add_argument("input", type=Path, help="path to vma file") - parser.add_argument("-o", "--output", type=Path, required=True, help="path to output directory") - parser.add_argument("-v", "--verbose", action="count", default=3, help="increase output verbosity") - args = parser.parse_args() - - setup_logging(log, args.verbose) - - in_file = args.input.resolve() - if not in_file.exists(): - log.error("Input file does not exist: %s", in_file) - parser.exit() - - out_dir = args.output.resolve() - if not out_dir.exists(): - log.error("Output path does not exist: %s", out_dir) - parser.exit() - - if not out_dir.is_dir(): - log.error("Output path is not a directory: %s", out_dir) - parser.exit() - - with in_file.open("rb") as fh: - vma = VMA(fh) - - log.info("Extracting config files") - for config_name, config_data in vma.configs().items(): - out_file = out_dir.joinpath(config_name) - - log.info("%s -> %s (%d bytes)", config_name, out_file, len(config_data)) - out_file.write_bytes(config_data) - - log.info("Extracting device data") - tasks = {} - handles = {} - for device in vma.devices(): - task_id = progress.add_task("extract", filename=device.name, total=device.size) - tasks[device.id] = task_id - handles[device.id] = out_dir.joinpath(device.name).open("wb") - - with progress: - try: - for extent in vma.extents(): - vma.fh.seek(extent.data_offset) - for block_info in extent.header.blockinfo: - cluster_num = block_info & 0xFFFFFFFF - dev_id = (block_info >> 32) & 0xFF - mask = block_info >> (32 + 16) - - if dev_id == 0: - continue - - fh_out = handles[dev_id] - fh_out.seek(cluster_num * c_vma.VMA_CLUSTER_SIZE) - - if mask == 0xFFFF: - fh_out.write(vma.fh.read(c_vma.VMA_CLUSTER_SIZE)) - elif mask == 0: - fh_out.write(b"\x00" * c_vma.VMA_CLUSTER_SIZE) - else: - for allocated, count in _iter_mask(mask, 16): - if allocated: - fh_out.write(vma.fh.read(count * c_vma.VMA_BLOCK_SIZE)) - else: - fh_out.write(b"\x00" * count * c_vma.VMA_BLOCK_SIZE) - - progress.update(tasks[dev_id], advance=c_vma.VMA_CLUSTER_SIZE) - except Exception: - log.exception("Exception during extraction") - finally: - for handle in handles.values(): - handle.close() - - -if __name__ == "__main__": - try: - sys.exit(main()) - except KeyboardInterrupt: - pass diff --git a/pyproject.toml b/pyproject.toml index d6e2329..153bfb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,6 @@ repository = "https://github.com/fox-it/dissect.hypervisor" [project.optional-dependencies] full = [ "pycryptodome", - "rich", ] dev = [ "dissect.hypervisor[full]", @@ -48,7 +47,6 @@ dev = [ ] [project.scripts] -vma-extract = "dissect.hypervisor.tools.vma:main" envelope-decrypt = "dissect.hypervisor.tools.envelope:main" [tool.black] diff --git a/tests/conftest.py b/tests/conftest.py index 054a07c..3acd221 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,11 +79,6 @@ def split_hdd() -> Iterator[str]: yield absolute_path("data/split.hdd") -@pytest.fixture -def simple_vma() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test.vma.gz") - - @pytest.fixture def envelope() -> Iterator[BinaryIO]: yield from open_file("data/local.tgz.ve") diff --git a/tests/data/test.vma.gz b/tests/data/test.vma.gz deleted file mode 100644 index 3183050..0000000 Binary files a/tests/data/test.vma.gz and /dev/null differ diff --git a/tests/test_vma.py b/tests/test_vma.py deleted file mode 100644 index f07b7ab..0000000 --- a/tests/test_vma.py +++ /dev/null @@ -1,64 +0,0 @@ -import hashlib - -from dissect.hypervisor.backup.vma import VMA, _iter_clusters - - -def test_vma(simple_vma): - vma = VMA(simple_vma) - - assert vma.version == 1 - assert str(vma.uuid) == "04fc12eb-0fed-4322-9aaa-f4e412f68096" - - assert vma.blob_string(1) == "qemu-server.conf" - assert len(vma.blob_data(20)) == 417 - assert vma.blob_string(439) == "drive-scsi0" - - assert vma.config("qemu-server.conf") == vma.blob_data(20) - assert len(vma.configs()) == 1 - - assert len(vma.devices()) == 1 - - device = vma.device(1) - assert device.id == 1 - assert device.name == "drive-scsi0" - assert device.size == 10737418240 - - extents = list(vma.extents()) - # The test data is just a small piece of a real VMA file - assert len(extents) == 2 - - assert list(_iter_clusters(vma, device.id, 0, 23)) == [ - (0, 65535, 13312), - (1, 0, 78848), - (2, 0, 78848), - (3, 0, 78848), - (4, 0, 78848), - (5, 0, 78848), - (6, 0, 78848), - (7, 0, 78848), - (8, 0, 78848), - (9, 0, 78848), - (10, 0, 78848), - (11, 0, 78848), - (12, 0, 78848), - (13, 0, 78848), - (14, 0, 78848), - (15, 0, 78848), - (16, 65535, 79360), - (17, 65535, 144896), - (18, 65535, 210432), - (19, 65535, 275968), - (20, 65535, 341504), - (21, 65535, 407040), - (22, 65535, 472576), - ] - - stream = device.open() - buf = stream.read(65536) - assert hashlib.sha256(buf).hexdigest() == "cf4adcf1933a8c9a0a3ff5588e1400e6beea8a32212b3a35ba08c7b08e4e6b1f" - - buf = stream.read(65536 * 15) - assert buf.strip(b"\x00") == b"" - - buf = stream.read(65536 * 7) - assert hashlib.sha256(buf).hexdigest() == "8c989a3aa590795fa919ccb7d1f28651c85f8a0d9ba00ab22cdd9fb760fa7955"