diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py index f28358b..513f459 100644 --- a/dissect/archive/wim.py +++ b/dissect/archive/wim.py @@ -2,10 +2,10 @@ import io import struct -from functools import cached_property, lru_cache -from typing import TYPE_CHECKING, BinaryIO, Callable +from functools import cached_property +from typing import TYPE_CHECKING, BinaryIO -from dissect.util.stream import AlignedStream, BufferedStream, RelativeStream +from dissect.util.stream import BufferedStream, CompressedStream, RelativeStream from dissect.util.ts import wintimestamp from dissect.archive.c_wim import ( @@ -84,10 +84,10 @@ class Resource: "flags", "hash", "offset", - "original_size", + "original_size", # Uncompressed size of the resource "part_number", "reference_count", - "size", + "size", # Compressed size of the resource "wim", ) @@ -151,7 +151,7 @@ def open(self) -> BinaryIO: decompressor = DECOMPRESSOR_MAP.get(compression_flags) if decompressor is None: raise NotImplementedError(f"Compression algorithm not yet supported: {compression_flags}") - return CompressedStream( + return WimCompressedStream( self.wim.fh, self.offset, self.size, self.original_size, decompressor, self.wim.header.CompressionSize ) @@ -428,78 +428,60 @@ def relative(self) -> bool: return self.info.Flags == SYMLINK_FLAG.RELATIVE -class CompressedStream(AlignedStream): - def __init__( - self, - fh: BinaryIO, - offset: int, - compressed_size: int, - original_size: int, - decompressor: Callable[[bytes], bytes], - chunk_size: int = DEFAULT_CHUNK_SIZE, - ): - self.fh = fh - self.offset = offset - self.compressed_size = compressed_size - self.original_size = original_size - self.decompressor = decompressor - self.chunk_size = chunk_size - - # Read the chunk table in advance - fh.seek(self.offset) - num_chunks = (original_size + self.chunk_size - 1) // self.chunk_size - 1 - if num_chunks == 0: - self._chunks = (0,) - else: - entry_size = "Q" if original_size > 0xFFFFFFFF else "I" - pattern = 
f"<{num_chunks}{entry_size}" - self._chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern)))) +def _ts_to_ns(ts: int) -> int: + """Convert Windows timestamps to nanosecond timestamps.""" + return (ts * 100) - 11644473600000000000 - self._data_offset = fh.tell() - self._read_chunk = lru_cache(32)(self._read_chunk) - super().__init__(self.original_size) +def _read_name(fh: BinaryIO, length: int) -> str: + return fh.read(length).decode("utf-16-le") - def _read(self, offset: int, length: int) -> bytes: - result = [] - num_chunks = len(self._chunks) - chunk, offset_in_chunk = divmod(offset, self.chunk_size) +class WimCompressedStream(CompressedStream): + """Compressed stream for Windows Imaging (WIM) archives. This class handles the decompression of WIM archives + using the specified decompressor. - while length: - if chunk >= num_chunks: - # We somehow requested more data than we have runs for - break + Supported decompression methods are currently: + * LZXPRESS4K Huffman + * LZXPRESS8K Huffman + * LZXPRESS16K Huffman + * LZXPRESS32K Huffman (default) - chunk_offset = self._chunks[chunk] - if chunk < num_chunks - 1: - next_chunk_offset = self._chunks[chunk + 1] - chunk_remaining = self.chunk_size - offset_in_chunk - else: - next_chunk_offset = self.compressed_size - chunk_remaining = (self.original_size - (chunk * self.chunk_size)) - offset_in_chunk + Note that LZX and LZMS decompression are not yet supported. - read_length = min(chunk_remaining, length) + Args: + fh: A file-like object for the compressed data. + offset: The offset to the start of the chunk table. + size: The size of the compressed data. + original_size: The original size of the uncompressed data. + decompress: The decompressor function to use. + chunk_size: The size of the chunks to read from the compressed data. 
(default: 32 KiB) + """ - buf = self._read_chunk(chunk_offset, next_chunk_offset - chunk_offset) - result.append(buf[offset_in_chunk : offset_in_chunk + read_length]) + def __init__( + self, + fh: BinaryIO, + offset: int, + size: int, + original_size: int, + decompress: callable, + chunk_size: int = DEFAULT_CHUNK_SIZE, + ): + fh.seek(offset) + num_chunks = (original_size + chunk_size - 1) // chunk_size - 1 - length -= read_length - offset += read_length - chunk += 1 + entry_size = "Q" if original_size > 0xFFFFFFFF else "I" + pattern = f"<{num_chunks}{entry_size}" + chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern)))) - return b"".join(result) + super().__init__(fh, fh.tell(), size, original_size, decompress, chunk_size, chunks) def _read_chunk(self, offset: int, size: int) -> bytes: - self.fh.seek(self._data_offset + offset) + self.fh.seek(self.offset + offset) buf = self.fh.read(size) - return self.decompressor(buf) - - -def _ts_to_ns(ts: int) -> int: - """Convert Windows timestamps to nanosecond timestamps.""" - return (ts * 100) - 11644473600000000000 + uncompressed_size = ( + ((self.original_size - 1) & (self.chunk_size - 1)) + 1 if offset == self.chunks[-1] else self.chunk_size + ) -def _read_name(fh: BinaryIO, length: int) -> str: - return fh.read(length).decode("utf-16-le") + return buf if len(buf) == uncompressed_size else self.decompressor(buf) diff --git a/tests/_data/lzms.wim.gz b/tests/_data/lzms.wim.gz new file mode 100644 index 0000000..3614c69 --- /dev/null +++ b/tests/_data/lzms.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee11028e1362c5eee73b53daf2bd7c8401594681ad057165a2723d451e29a015 +size 1259 diff --git a/tests/_data/lzx.wim.gz b/tests/_data/lzx.wim.gz new file mode 100644 index 0000000..8712502 --- /dev/null +++ b/tests/_data/lzx.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ed547a8a87537968e7ef86e7fa147030cee8efa4b4a73c47eb08b0c56c9adb +size 1309 diff 
--git a/tests/_data/uncompressed.wim.gz b/tests/_data/uncompressed.wim.gz new file mode 100644 index 0000000..0b1b36a --- /dev/null +++ b/tests/_data/uncompressed.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c6e7e99cc63ce264d67efcdce5d8357c0f17670f8f58fd04db6c930c586a89 +size 1089 diff --git a/tests/conftest.py b/tests/conftest.py index fd5c35f..07b1fea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,21 @@ def open_file_gz(name: str, mode: str = "rb") -> Iterator[BinaryIO]: yield f +@pytest.fixture +def uncompressed_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/uncompressed.wim.gz") + + +@pytest.fixture +def lzms_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/lzms.wim.gz") + + +@pytest.fixture +def lzx_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/lzx.wim.gz") + + @pytest.fixture def basic_wim_4k() -> Iterator[BinaryIO]: yield from open_file_gz("_data/basic4k.wim.gz") diff --git a/tests/test_wim.py b/tests/test_wim.py index f8ebbaa..8f8d042 100644 --- a/tests/test_wim.py +++ b/tests/test_wim.py @@ -6,7 +6,7 @@ import pytest from dissect.util.compression.lzxpress_huffman import decompress -from dissect.archive.wim import WIM, CompressedStream +from dissect.archive.wim import WIM, WimCompressedStream @pytest.mark.parametrize( @@ -16,19 +16,24 @@ ("basic_wim_8k", 0x2000), ("basic_wim_16k", 0x4000), ("basic_wim_32k", 0x8000), + ("uncompressed_wim", 0), ], ) def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) -> None: value = request.getfixturevalue(fixture) wim = WIM(value) + assert wim.header.CompressionSize == chunk_size - resource = next(iter(wim.resources.values())) - assert resource.open().chunk_size == chunk_size + if chunk_size: + resource = next(iter(wim.resources.values())) + assert resource.open().chunk_size == chunk_size - stream = CompressedStream(wim.fh, resource.offset, resource.size, resource.original_size, decompress, 
chunk_size) - assert resource.wim.header.CompressionSize == stream.chunk_size - assert resource.open().read() == stream.read() + stream = WimCompressedStream( + wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size + ) + assert resource.wim.header.CompressionSize == stream.chunk_size + assert resource.open().read() == stream.read() images = list(wim.images()) assert len(images) == 1 @@ -74,3 +79,17 @@ def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) assert len(entry.streams) == 1 assert entry.size() == 60 assert hashlib.sha1(entry.open().read()).hexdigest() == "1fc83a896287fe48f6d42d8d04f88f6dc90c0c45" + + +@pytest.mark.parametrize( + ("fixture"), + [ + ("lzms_wim"), + ("lzx_wim"), + ], +) +def test_wim_not_implemented(fixture: BinaryIO, request: pytest.FixtureRequest) -> None: + data = request.getfixturevalue(fixture) + + with pytest.raises(NotImplementedError): + WIM(data)._images[0].open()