fox-it · Horofic · Apr 2, 2025 · Apr 17, 2025
diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py
@@ -2,10 +2,10 @@
 
 import io
 import struct
-from functools import cached_property, lru_cache
-from typing import TYPE_CHECKING, BinaryIO, Callable
+from functools import cached_property
+from typing import TYPE_CHECKING, BinaryIO
 
-from dissect.util.stream import AlignedStream, BufferedStream, RelativeStream
+from dissect.util.stream import BufferedStream, CompressedStream, RelativeStream
 from dissect.util.ts import wintimestamp
 
 from dissect.archive.c_wim import (
@@ -84,10 +84,10 @@ class Resource:
         "flags",
         "hash",
         "offset",
-        "original_size",
+        "original_size",  # Uncompressed size of the resource
         "part_number",
         "reference_count",
-        "size",
+        "size",  # Compressed size of the resource
         "wim",
     )
 
@@ -151,7 +151,7 @@ def open(self) -> BinaryIO:
             decompressor = DECOMPRESSOR_MAP.get(compression_flags)
             if decompressor is None:
                 raise NotImplementedError(f"Compression algorithm not yet supported: {compression_flags}")
-            return CompressedStream(
+            return WimCompressedStream(
                 self.wim.fh, self.offset, self.size, self.original_size, decompressor, self.wim.header.CompressionSize
             )
 
@@ -428,78 +428,60 @@ def relative(self) -> bool:
         return self.info.Flags == SYMLINK_FLAG.RELATIVE
 
 
-class CompressedStream(AlignedStream):
-    def __init__(
-        self,
-        fh: BinaryIO,
-        offset: int,
-        compressed_size: int,
-        original_size: int,
-        decompressor: Callable[[bytes], bytes],
-        chunk_size: int = DEFAULT_CHUNK_SIZE,
-    ):
-        self.fh = fh
-        self.offset = offset
-        self.compressed_size = compressed_size
-        self.original_size = original_size
-        self.decompressor = decompressor
-        self.chunk_size = chunk_size
-
-        # Read the chunk table in advance
-        fh.seek(self.offset)
-        num_chunks = (original_size + self.chunk_size - 1) // self.chunk_size - 1
-        if num_chunks == 0:
-            self._chunks = (0,)
-        else:
-            entry_size = "Q" if original_size > 0xFFFFFFFF else "I"
-            pattern = f"<{num_chunks}{entry_size}"
-            self._chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern))))
+def _ts_to_ns(ts: int) -> int:
+    """Convert Windows timestamps to nanosecond timestamps."""
+    return (ts * 100) - 11644473600000000000
 
-        self._data_offset = fh.tell()
 
-        self._read_chunk = lru_cache(32)(self._read_chunk)
-        super().__init__(self.original_size)
+def _read_name(fh: BinaryIO, length: int) -> str:
+    return fh.read(length).decode("utf-16-le")
 
-    def _read(self, offset: int, length: int) -> bytes:
-        result = []
 
-        num_chunks = len(self._chunks)
-        chunk, offset_in_chunk = divmod(offset, self.chunk_size)
+class WimCompressedStream(CompressedStream):
+    """Compressed stream for Windows Imaging (WIM) archives. This class handles the decompression of WIM archives
+    using the specified decompressor.
 
-        while length:
-            if chunk >= num_chunks:
-                # We somehow requested more data than we have runs for
-                break
+    Supported decompression methods are currently:
+        * LZXPRESS4K Huffman
+        * LZXPRESS8K Huffman
+        * LZXPRESS16K Huffman
+        * LZXPRESS32K Huffman (default)
 
-            chunk_offset = self._chunks[chunk]
-            if chunk < num_chunks - 1:
-                next_chunk_offset = self._chunks[chunk + 1]
-                chunk_remaining = self.chunk_size - offset_in_chunk
-            else:
-                next_chunk_offset = self.compressed_size
-                chunk_remaining = (self.original_size - (chunk * self.chunk_size)) - offset_in_chunk
+    Note that LZX and LZMS decompression are not yet supported.
 
-            read_length = min(chunk_remaining, length)
+    Args:
+        fh: A file-like object for the compressed data.
+        offset: The offset to the start of the chunk table.
+        size: The size of the compressed data.
+        original_size: The original size of the uncompressed data.
+        decompress: The decompressor function to use.
+        chunk_size: The uncompressed size of each chunk; only the final chunk may be smaller. (default: 32 KiB)
+    """
 
-            buf = self._read_chunk(chunk_offset, next_chunk_offset - chunk_offset)
-            result.append(buf[offset_in_chunk : offset_in_chunk + read_length])
+    def __init__(
+        self,
+        fh: BinaryIO,
+        offset: int,
+        size: int,
+        original_size: int,
+        decompress: callable,
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+    ):
+        fh.seek(offset)
+        num_chunks = (original_size + chunk_size - 1) // chunk_size - 1
 
-            length -= read_length
-            offset += read_length
-            chunk += 1
+        entry_size = "Q" if original_size > 0xFFFFFFFF else "I"
+        pattern = f"<{num_chunks}{entry_size}"
+        chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern))))
 
-        return b"".join(result)
+        super().__init__(fh, fh.tell(), size, original_size, decompress, chunk_size, chunks)
 
     def _read_chunk(self, offset: int, size: int) -> bytes:
-        self.fh.seek(self._data_offset + offset)
+        self.fh.seek(self.offset + offset)
         buf = self.fh.read(size)
-        return self.decompressor(buf)
-
-
-def _ts_to_ns(ts: int) -> int:
-    """Convert Windows timestamps to nanosecond timestamps."""
-    return (ts * 100) - 11644473600000000000
 
+        uncompressed_size = (
+            ((self.original_size - 1) & (self.chunk_size - 1)) + 1 if offset == self.chunks[-1] else self.chunk_size
+        )
 
-def _read_name(fh: BinaryIO, length: int) -> str:
-    return fh.read(length).decode("utf-16-le")
+        return buf if len(buf) == uncompressed_size else self.decompressor(buf)
diff --git a/tests/_data/lzms.wim.gz b/tests/_data/lzms.wim.gz
diff --git a/tests/_data/lzx.wim.gz b/tests/_data/lzx.wim.gz
diff --git a/tests/_data/uncompressed.wim.gz b/tests/_data/uncompressed.wim.gz
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -24,6 +24,21 @@ def open_file_gz(name: str, mode: str = "rb") -> Iterator[BinaryIO]:
         yield f
 
 
+@pytest.fixture
+def uncompressed_wim() -> Iterator[BinaryIO]:
+    yield from open_file_gz("_data/uncompressed.wim.gz")
+
+
+@pytest.fixture
+def lzms_wim() -> Iterator[BinaryIO]:
+    yield from open_file_gz("_data/lzms.wim.gz")
+
+
+@pytest.fixture
+def lzx_wim() -> Iterator[BinaryIO]:
+    yield from open_file_gz("_data/lzx.wim.gz")
+
+
 @pytest.fixture
 def basic_wim_4k() -> Iterator[BinaryIO]:
     yield from open_file_gz("_data/basic4k.wim.gz")

diff --git a/tests/test_wim.py b/tests/test_wim.py
@@ -6,7 +6,7 @@
 import pytest
 from dissect.util.compression.lzxpress_huffman import decompress
 
-from dissect.archive.wim import WIM, CompressedStream
+from dissect.archive.wim import WIM, WimCompressedStream
 
 
 @pytest.mark.parametrize(
@@ -16,19 +16,24 @@
         ("basic_wim_8k", 0x2000),
         ("basic_wim_16k", 0x4000),
         ("basic_wim_32k", 0x8000),
+        ("uncompressed_wim", 0),
     ],
 )
 def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) -> None:
     value = request.getfixturevalue(fixture)
     wim = WIM(value)
+
     assert wim.header.CompressionSize == chunk_size
 
-    resource = next(iter(wim.resources.values()))
-    assert resource.open().chunk_size == chunk_size
+    if chunk_size:
+        resource = next(iter(wim.resources.values()))
+        assert resource.open().chunk_size == chunk_size
 
-    stream = CompressedStream(wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size)
-    assert resource.wim.header.CompressionSize == stream.chunk_size
-    assert resource.open().read() == stream.read()
+        stream = WimCompressedStream(
+            wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size
+        )
+        assert resource.wim.header.CompressionSize == stream.chunk_size
+        assert resource.open().read() == stream.read()
 
     images = list(wim.images())
     assert len(images) == 1
@@ -74,3 +79,17 @@ def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest)
     assert len(entry.streams) == 1
     assert entry.size() == 60
     assert hashlib.sha1(entry.open().read()).hexdigest() == "1fc83a896287fe48f6d42d8d04f88f6dc90c0c45"
+
+
+@pytest.mark.parametrize(
+    ("fixture"),
+    [
+        ("lzms_wim"),
+        ("lzx_wim"),
+    ],
+)
+def test_wim_not_implemented(fixture: BinaryIO, request: pytest.FixtureRequest) -> None:
+    data = request.getfixturevalue(fixture)
+
+    with pytest.raises(NotImplementedError):
+        WIM(data)._images[0].open()