Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 48 additions & 66 deletions dissect/archive/wim.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import io
import struct
from functools import cached_property, lru_cache
from typing import TYPE_CHECKING, BinaryIO, Callable
from functools import cached_property
from typing import TYPE_CHECKING, BinaryIO

from dissect.util.stream import AlignedStream, BufferedStream, RelativeStream
from dissect.util.stream import BufferedStream, CompressedStream, RelativeStream
from dissect.util.ts import wintimestamp

from dissect.archive.c_wim import (
Expand Down Expand Up @@ -84,10 +84,10 @@ class Resource:
"flags",
"hash",
"offset",
"original_size",
"original_size", # uncompressed size of the resource
"part_number",
"reference_count",
"size",
"size", # Compressed size of the resource
"wim",
)

Expand Down Expand Up @@ -151,7 +151,7 @@ def open(self) -> BinaryIO:
decompressor = DECOMPRESSOR_MAP.get(compression_flags)
if decompressor is None:
raise NotImplementedError(f"Compression algorithm not yet supported: {compression_flags}")
return CompressedStream(
return WimCompressedStream(
self.wim.fh, self.offset, self.size, self.original_size, decompressor, self.wim.header.CompressionSize
)

Expand Down Expand Up @@ -428,78 +428,60 @@ def relative(self) -> bool:
return self.info.Flags == SYMLINK_FLAG.RELATIVE


class CompressedStream(AlignedStream):
def __init__(
self,
fh: BinaryIO,
offset: int,
compressed_size: int,
original_size: int,
decompressor: Callable[[bytes], bytes],
chunk_size: int = DEFAULT_CHUNK_SIZE,
):
self.fh = fh
self.offset = offset
self.compressed_size = compressed_size
self.original_size = original_size
self.decompressor = decompressor
self.chunk_size = chunk_size

# Read the chunk table in advance
fh.seek(self.offset)
num_chunks = (original_size + self.chunk_size - 1) // self.chunk_size - 1
if num_chunks == 0:
self._chunks = (0,)
else:
entry_size = "Q" if original_size > 0xFFFFFFFF else "I"
pattern = f"<{num_chunks}{entry_size}"
self._chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern))))
def _ts_to_ns(ts: int) -> int:
    """Convert a Windows timestamp to a nanosecond timestamp.

    Args:
        ts: Windows timestamp, counted in 100-nanosecond intervals.

    Returns:
        ``ts`` scaled to nanoseconds, shifted by the fixed offset of 11644473600 seconds.
    """
    # 11644473600 seconds separate the Windows epoch (1601-01-01) from the Unix epoch (1970-01-01).
    return (ts * 100) - 11644473600000000000

self._data_offset = fh.tell()

self._read_chunk = lru_cache(32)(self._read_chunk)
super().__init__(self.original_size)
def _read_name(fh: BinaryIO, length: int) -> str:
    """Read ``length`` bytes from ``fh`` and decode them as UTF-16-LE.

    Args:
        fh: Binary file-like object, positioned at the first byte to read.
        length: Number of bytes (not characters) to read.

    Returns:
        The decoded string.
    """
    return fh.read(length).decode("utf-16-le")

def _read(self, offset: int, length: int) -> bytes:
result = []

num_chunks = len(self._chunks)
chunk, offset_in_chunk = divmod(offset, self.chunk_size)
class WimCompressedStream(CompressedStream):
"""Compressed stream for Windows Imaging (WIM) archives. This class handles the decompression of WIM archives
using the specified decompressor.

while length:
if chunk >= num_chunks:
# We somehow requested more data than we have runs for
break
Supported decompression methods are currently:
* LZXPRESS4K Huffman
* LZXPRESS8K Huffman
* LZXPRESS16K Huffman
* LZXPRESS32K Huffman (default)

chunk_offset = self._chunks[chunk]
if chunk < num_chunks - 1:
next_chunk_offset = self._chunks[chunk + 1]
chunk_remaining = self.chunk_size - offset_in_chunk
else:
next_chunk_offset = self.compressed_size
chunk_remaining = (self.original_size - (chunk * self.chunk_size)) - offset_in_chunk
Note that LZX decompression is not yet supported.

read_length = min(chunk_remaining, length)
Args:
fh: A file-like object for the compressed data.
offset: The offset to the start of the chunk table.
size: The size of the compressed data.
original_size: The original size of the uncompressed data.
decompress: The decompressor function to use.
chunk_size: The size of the chunks to read from the compressed data. (default: 32 KiB)
"""

buf = self._read_chunk(chunk_offset, next_chunk_offset - chunk_offset)
result.append(buf[offset_in_chunk : offset_in_chunk + read_length])
def __init__(
self,
fh: BinaryIO,
offset: int,
size: int,
original_size: int,
decompress: callable,
chunk_size: int = DEFAULT_CHUNK_SIZE,
):
fh.seek(offset)
num_chunks = (original_size + chunk_size - 1) // chunk_size - 1

length -= read_length
offset += read_length
chunk += 1
entry_size = "Q" if original_size > 0xFFFFFFFF else "I"
pattern = f"<{num_chunks}{entry_size}"
chunks = (0, *struct.unpack(pattern, fh.read(struct.calcsize(pattern))))

return b"".join(result)
super().__init__(fh, fh.tell(), size, original_size, decompress, chunk_size, chunks)

def _read_chunk(self, offset: int, size: int) -> bytes:
self.fh.seek(self._data_offset + offset)
self.fh.seek(self.offset + offset)
buf = self.fh.read(size)
return self.decompressor(buf)


def _ts_to_ns(ts: int) -> int:
    """Convert a Windows timestamp to a nanosecond timestamp.

    Args:
        ts: Windows timestamp, counted in 100-nanosecond intervals.

    Returns:
        ``ts`` scaled to nanoseconds, shifted by the fixed offset of 11644473600 seconds.
    """
    # 11644473600 seconds separate the Windows epoch (1601-01-01) from the Unix epoch (1970-01-01).
    return (ts * 100) - 11644473600000000000

uncompressed_size = (
((self.original_size - 1) & (self.chunk_size - 1)) + 1 if offset == self.chunks[-1] else self.chunk_size
)

def _read_name(fh: BinaryIO, length: int) -> str:
    """Read ``length`` bytes from ``fh`` and decode them as UTF-16-LE.

    Args:
        fh: Binary file-like object, positioned at the first byte to read.
        length: Number of bytes (not characters) to read.

    Returns:
        The decoded string.
    """
    return fh.read(length).decode("utf-16-le")
return buf if len(buf) == uncompressed_size else self.decompressor(buf)
3 changes: 3 additions & 0 deletions tests/_data/lzms.wim.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/_data/lzx.wim.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/_data/uncompressed.wim.gz
Git LFS file not shown
15 changes: 15 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@ def open_file_gz(name: str, mode: str = "rb") -> Iterator[BinaryIO]:
yield f


@pytest.fixture
def uncompressed_wim() -> Iterator[BinaryIO]:
    """Yield the file handle that ``open_file_gz`` provides for ``_data/uncompressed.wim.gz``."""
    yield from open_file_gz("_data/uncompressed.wim.gz")


@pytest.fixture
def lzms_wim() -> Iterator[BinaryIO]:
    """Yield the file handle that ``open_file_gz`` provides for ``_data/lzms.wim.gz``."""
    yield from open_file_gz("_data/lzms.wim.gz")


@pytest.fixture
def lzx_wim() -> Iterator[BinaryIO]:
    """Yield the file handle that ``open_file_gz`` provides for ``_data/lzx.wim.gz``."""
    yield from open_file_gz("_data/lzx.wim.gz")


@pytest.fixture
def basic_wim_4k() -> Iterator[BinaryIO]:
    """Yield the file handle that ``open_file_gz`` provides for ``_data/basic4k.wim.gz``."""
    yield from open_file_gz("_data/basic4k.wim.gz")
Expand Down
31 changes: 25 additions & 6 deletions tests/test_wim.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest
from dissect.util.compression.lzxpress_huffman import decompress

from dissect.archive.wim import WIM, CompressedStream
from dissect.archive.wim import WIM, WimCompressedStream


@pytest.mark.parametrize(
Expand All @@ -16,19 +16,24 @@
("basic_wim_8k", 0x2000),
("basic_wim_16k", 0x4000),
("basic_wim_32k", 0x8000),
("uncompressed_wim", 0),
],
)
def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) -> None:
value = request.getfixturevalue(fixture)
wim = WIM(value)

assert wim.header.CompressionSize == chunk_size

resource = next(iter(wim.resources.values()))
assert resource.open().chunk_size == chunk_size
if chunk_size:
resource = next(iter(wim.resources.values()))
assert resource.open().chunk_size == chunk_size

stream = CompressedStream(wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size)
assert resource.wim.header.CompressionSize == stream.chunk_size
assert resource.open().read() == stream.read()
stream = WimCompressedStream(
wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size
)
assert resource.wim.header.CompressionSize == stream.chunk_size
assert resource.open().read() == stream.read()

images = list(wim.images())
assert len(images) == 1
Expand Down Expand Up @@ -74,3 +79,17 @@ def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest)
assert len(entry.streams) == 1
assert entry.size() == 60
assert hashlib.sha1(entry.open().read()).hexdigest() == "1fc83a896287fe48f6d42d8d04f88f6dc90c0c45"


@pytest.mark.parametrize(
    "fixture",
    [
        "lzms_wim",
        "lzx_wim",
    ],
)
def test_wim_not_implemented(fixture: str, request: pytest.FixtureRequest) -> None:
    """Check that opening the first image of an LZMS or LZX WIM raises ``NotImplementedError``.

    Args:
        fixture: Name of the fixture to resolve through ``request.getfixturevalue``.
        request: The pytest request object used to look the fixture up by name.
    """
    data = request.getfixturevalue(fixture)

    # Construction and opening stay inside the same ``raises`` block, as the test does not
    # distinguish which of the two steps reports the unsupported compression.
    with pytest.raises(NotImplementedError):
        WIM(data)._images[0].open()