From 3da5100e70b7234bb20cca017fb031c6067b204c Mon Sep 17 00:00:00 2001 From: Horofic Date: Mon, 24 Mar 2025 20:19:48 +0100 Subject: [PATCH 1/7] Make chunk size configurable --- dissect/archive/wim.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py index bdaacdd..329b872 100644 --- a/dissect/archive/wim.py +++ b/dissect/archive/wim.py @@ -434,16 +434,18 @@ def __init__( compressed_size: int, original_size: int, decompressor: Callable[[bytes], bytes], + chunk_size: int = DEFAULT_CHUNK_SIZE, ): self.fh = fh self.offset = offset self.compressed_size = compressed_size self.original_size = original_size self.decompressor = decompressor + self.chuck_size = chunk_size # Read the chunk table in advance fh.seek(self.offset) - num_chunks = (original_size + DEFAULT_CHUNK_SIZE - 1) // DEFAULT_CHUNK_SIZE - 1 + num_chunks = (original_size + self.chuck_size - 1) // self.chuck_size - 1 if num_chunks == 0: self._chunks = (0,) else: @@ -460,7 +462,7 @@ def _read(self, offset: int, length: int) -> bytes: result = [] num_chunks = len(self._chunks) - chunk, offset_in_chunk = divmod(offset, DEFAULT_CHUNK_SIZE) + chunk, offset_in_chunk = divmod(offset, self.chuck_size) while length: if chunk >= num_chunks: @@ -470,10 +472,10 @@ def _read(self, offset: int, length: int) -> bytes: chunk_offset = self._chunks[chunk] if chunk < num_chunks - 1: next_chunk_offset = self._chunks[chunk + 1] - chunk_remaining = DEFAULT_CHUNK_SIZE - offset_in_chunk + chunk_remaining = self.chuck_size - offset_in_chunk else: next_chunk_offset = self.compressed_size - chunk_remaining = (self.original_size - (chunk * DEFAULT_CHUNK_SIZE)) - offset_in_chunk + chunk_remaining = (self.original_size - (chunk * self.chuck_size)) - offset_in_chunk read_length = min(chunk_remaining, length) From d937a7b6277241d5b3470882d6795beb173eb44a Mon Sep 17 00:00:00 2001 From: Horofic Date: Tue, 25 Mar 2025 09:47:19 +0100 Subject: [PATCH 2/7] Fix typo --- dissect/archive/wim.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py index 329b872..1804737 100644 --- a/dissect/archive/wim.py +++ b/dissect/archive/wim.py @@ -441,11 +441,11 @@ def __init__( self.compressed_size = compressed_size self.original_size = original_size self.decompressor = decompressor - self.chuck_size = chunk_size + self.chunk_size = chunk_size # Read the chunk table in advance fh.seek(self.offset) - num_chunks = (original_size + self.chuck_size - 1) // self.chuck_size - 1 + num_chunks = (original_size + self.chunk_size - 1) // self.chunk_size - 1 if num_chunks == 0: self._chunks = (0,) else: @@ -462,7 +462,7 @@ def _read(self, offset: int, length: int) -> bytes: result = [] num_chunks = len(self._chunks) - chunk, offset_in_chunk = divmod(offset, self.chuck_size) + chunk, offset_in_chunk = divmod(offset, self.chunk_size) while length: if chunk >= num_chunks: @@ -472,10 +472,10 @@ def _read(self, offset: int, length: int) -> bytes: chunk_offset = self._chunks[chunk] if chunk < num_chunks - 1: next_chunk_offset = self._chunks[chunk + 1] - chunk_remaining = self.chuck_size - offset_in_chunk + chunk_remaining = self.chunk_size - offset_in_chunk else: next_chunk_offset = self.compressed_size - chunk_remaining = (self.original_size - (chunk * self.chuck_size)) - offset_in_chunk + chunk_remaining = (self.original_size - (chunk * self.chunk_size)) - offset_in_chunk read_length = min(chunk_remaining, length) From c97d4ce5801f5da9a3e02cb492008e15611b0906 Mon Sep 17 00:00:00 2001 From: Horofic Date: Wed, 26 Mar 2025 16:16:20 +0100 Subject: [PATCH 3/7] Add tests for different compression chunk sizes --- tests/conftest.py | 15 +++++++++++++++ tests/test_wim.py | 18 ++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 721c11c..f5e034d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,6 +29,21 @@ def basic_wim() -> Iterator[BinaryIO]: yield from open_file_gz("data/basic.wim.gz") +@pytest.fixture +def test4k_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test4k.wim.gz") + + +@pytest.fixture +def test8k_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test8k.wim.gz") + + +@pytest.fixture +def test16k_wim() -> Iterator[BinaryIO]: + yield from open_file_gz("data/test16k.wim.gz") + + @pytest.fixture def basic_vma() -> Iterator[BinaryIO]: yield from open_file_gz("data/test.vma.gz") diff --git a/tests/test_wim.py b/tests/test_wim.py index d66d14b..f96f565 100644 --- a/tests/test_wim.py +++ b/tests/test_wim.py @@ -3,11 +3,25 @@ import hashlib from typing import BinaryIO +import pytest + from dissect.archive.wim import WIM -def test_wim(basic_wim: BinaryIO) -> None: - wim = WIM(basic_wim) +@pytest.mark.parametrize( + ("fixture", "chunk_size"), + [ + ("basic_wim", 0x8000), + ("test4k_wim", 0x1000), + ("test8k_wim", 0x2000), + ("test16k_wim", 0x4000), + ], +) +def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) -> None: + value = request.getfixturevalue(fixture) + wim = WIM(value) + + assert wim.header.CompressionSize == chunk_size images = list(wim.images()) assert len(images) == 1 From 8a76da997e8c7641fade8588b45cecdffe375481 Mon Sep 17 00:00:00 2001 From: Horofic Date: Wed, 26 Mar 2025 16:22:05 +0100 Subject: [PATCH 4/7] Add test data --- tests/data/test16k.wim.gz | 3 +++ tests/data/test4k.wim.gz | 3 +++ tests/data/test8k.wim.gz | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 tests/data/test16k.wim.gz create mode 100644 tests/data/test4k.wim.gz create mode 100644 tests/data/test8k.wim.gz diff --git a/tests/data/test16k.wim.gz b/tests/data/test16k.wim.gz new file mode 100644 index 0000000..9ef8951 --- /dev/null +++ b/tests/data/test16k.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cac03af6d4fbd8d00ef8ed85089f3ec59bb9027cda59f034254d68b61703d5de +size 1316 diff --git a/tests/data/test4k.wim.gz b/tests/data/test4k.wim.gz new file mode 100644 index 0000000..476e440 --- /dev/null +++ b/tests/data/test4k.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a17ced84b2db27262a9822d840db330e53612162eb5d68ea1219da229b97cd9 +size 1315 diff --git a/tests/data/test8k.wim.gz b/tests/data/test8k.wim.gz new file mode 100644 index 0000000..74929f8 --- /dev/null +++ b/tests/data/test8k.wim.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82f9f627a9af62e1cc11c24fad282a2a770307339ecc8d1a82bb7dcd84413fbe +size 1316 From e2c518d6e11186a87c9fc8c4cad6246003498f6a Mon Sep 17 00:00:00 2001 From: Horofic Date: Wed, 26 Mar 2025 16:31:15 +0100 Subject: [PATCH 5/7] Add chunk_size to CompressedStream --- dissect/archive/wim.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py index 1804737..f411667 100644 --- a/dissect/archive/wim.py +++ b/dissect/archive/wim.py @@ -151,7 +151,9 @@ def open(self) -> BinaryIO: decompressor = DECOMPRESSOR_MAP.get(compression_flags) if decompressor is None: raise NotImplementedError(f"Compression algorithm not yet supported: {compression_flags}") - return CompressedStream(self.wim.fh, self.offset, self.size, self.original_size, decompressor) + return CompressedStream( + self.wim.fh, self.offset, self.size, self.original_size, decompressor, self.wim.header.CompressionSize + ) return RelativeStream(self.wim.fh, self.offset, self.size) @@ -485,7 +487,6 @@ def _read(self, offset: int, length: int) -> bytes: length -= read_length offset += read_length chunk += 1 - return b"".join(result) def _read_chunk(self, offset: int, size: int) -> bytes: From 43d8912dc5c8a2280057b439492a1ea978fb7619 Mon Sep 17 00:00:00 2001 From: Horofic Date: Thu, 27 Mar 2025 10:49:14 +0100 Subject: [PATCH 6/7] Move test data and adjust WIM test --- .gitattributes | 2 +- dissect/archive/wim.py | 1 + .../test16k.wim.gz => _data/basic16k.wim.gz} | 0 .../basic.wim.gz => _data/basic32k.wim.gz} | 0 .../test4k.wim.gz => _data/basic4k.wim.gz} | 0 .../test8k.wim.gz => _data/basic8k.wim.gz} | 0 tests/{data => _data}/test.vma.gz | 0 tests/{data => _data}/test13.vbk.gz | 0 tests/{data => _data}/test9.vbk.gz | 0 tests/conftest.py | 22 +++++++++---------- tests/test_wim.py | 19 +++++++++++----- 11 files changed, 26 insertions(+), 18 deletions(-) rename tests/{data/test16k.wim.gz => _data/basic16k.wim.gz} (100%) rename tests/{data/basic.wim.gz => _data/basic32k.wim.gz} (100%) rename tests/{data/test4k.wim.gz => _data/basic4k.wim.gz} (100%) rename tests/{data/test8k.wim.gz => _data/basic8k.wim.gz} (100%) rename tests/{data => _data}/test.vma.gz (100%) rename tests/{data => _data}/test13.vbk.gz (100%) rename tests/{data => _data}/test9.vbk.gz (100%) diff --git a/.gitattributes b/.gitattributes index 5d87832..b677c2a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -tests/data/* filter=lfs diff=lfs merge=lfs -text +tests/_data/* filter=lfs diff=lfs merge=lfs -text diff --git a/dissect/archive/wim.py b/dissect/archive/wim.py index f411667..f28358b 100644 --- a/dissect/archive/wim.py +++ b/dissect/archive/wim.py @@ -487,6 +487,7 @@ def _read(self, offset: int, length: int) -> bytes: length -= read_length offset += read_length chunk += 1 + return b"".join(result) def _read_chunk(self, offset: int, size: int) -> bytes: diff --git a/tests/data/test16k.wim.gz b/tests/_data/basic16k.wim.gz similarity index 100% rename from tests/data/test16k.wim.gz rename to tests/_data/basic16k.wim.gz diff --git a/tests/data/basic.wim.gz b/tests/_data/basic32k.wim.gz similarity index 100% rename from tests/data/basic.wim.gz rename to tests/_data/basic32k.wim.gz diff --git a/tests/data/test4k.wim.gz b/tests/_data/basic4k.wim.gz similarity index 100% rename from tests/data/test4k.wim.gz rename to tests/_data/basic4k.wim.gz diff --git a/tests/data/test8k.wim.gz b/tests/_data/basic8k.wim.gz similarity index 100% rename from tests/data/test8k.wim.gz rename to tests/_data/basic8k.wim.gz diff --git a/tests/data/test.vma.gz b/tests/_data/test.vma.gz similarity index 100% rename from tests/data/test.vma.gz rename to tests/_data/test.vma.gz diff --git a/tests/data/test13.vbk.gz b/tests/_data/test13.vbk.gz similarity index 100% rename from tests/data/test13.vbk.gz rename to tests/_data/test13.vbk.gz diff --git a/tests/data/test9.vbk.gz b/tests/_data/test9.vbk.gz similarity index 100% rename from tests/data/test9.vbk.gz rename to tests/_data/test9.vbk.gz diff --git a/tests/conftest.py b/tests/conftest.py index f5e034d..fd5c35f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,35 +25,35 @@ def open_file_gz(name: str, mode: str = "rb") -> Iterator[BinaryIO]: @pytest.fixture -def basic_wim() -> Iterator[BinaryIO]: - yield from open_file_gz("data/basic.wim.gz") +def basic_wim_4k() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/basic4k.wim.gz") @pytest.fixture -def test4k_wim() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test4k.wim.gz") +def basic_wim_8k() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/basic8k.wim.gz") @pytest.fixture -def test8k_wim() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test8k.wim.gz") +def basic_wim_16k() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/basic16k.wim.gz") @pytest.fixture -def test16k_wim() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test16k.wim.gz") +def basic_wim_32k() -> Iterator[BinaryIO]: + yield from open_file_gz("_data/basic32k.wim.gz") @pytest.fixture def basic_vma() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test.vma.gz") + yield from open_file_gz("_data/test.vma.gz") @pytest.fixture def vbk9() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test9.vbk.gz") + yield from open_file_gz("_data/test9.vbk.gz") @pytest.fixture def vbk13() -> Iterator[BinaryIO]: - yield from open_file_gz("data/test13.vbk.gz") + yield from open_file_gz("_data/test13.vbk.gz") diff --git a/tests/test_wim.py b/tests/test_wim.py index f96f565..f8ebbaa 100644 --- a/tests/test_wim.py +++ b/tests/test_wim.py @@ -4,25 +4,32 @@ from typing import BinaryIO import pytest +from dissect.util.compression.lzxpress_huffman import decompress -from dissect.archive.wim import WIM +from dissect.archive.wim import WIM, CompressedStream @pytest.mark.parametrize( ("fixture", "chunk_size"), [ - ("basic_wim", 0x8000), - ("test4k_wim", 0x1000), - ("test8k_wim", 0x2000), - ("test16k_wim", 0x4000), + ("basic_wim_4k", 0x1000), + ("basic_wim_8k", 0x2000), + ("basic_wim_16k", 0x4000), + ("basic_wim_32k", 0x8000), ], ) def test_wim(fixture: BinaryIO, chunk_size: int, request: pytest.FixtureRequest) -> None: value = request.getfixturevalue(fixture) wim = WIM(value) - assert wim.header.CompressionSize == chunk_size + resource = next(iter(wim.resources.values())) + assert resource.open().chunk_size == chunk_size + + stream = CompressedStream(wim.fh, resource.offset, resource.size, resource.original_size, decompress, chunk_size) + assert resource.wim.header.CompressionSize == stream.chunk_size + assert resource.open().read() == stream.read() + images = list(wim.images()) assert len(images) == 1 From 747d9dfbcd58fa833f5d99a6535bae082636e221 Mon Sep 17 00:00:00 2001 From: Horofic Date: Thu, 27 Mar 2025 15:41:49 +0100 Subject: [PATCH 7/7] Add docs and docs test --- .gitignore | 4 ++-- tests/_docs/Makefile | 24 ++++++++++++++++++++++++ tests/_docs/__init__.py | 0 tests/_docs/conf.py | 41 +++++++++++++++++++++++++++++++++++++++++ tests/_docs/index.rst | 8 ++++++++ tox.ini | 8 ++++---- 6 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 tests/_docs/Makefile create mode 100644 tests/_docs/__init__.py create mode 100644 tests/_docs/conf.py create mode 100644 tests/_docs/index.rst diff --git a/.gitignore b/.gitignore index 74cecaf..2e944b3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,6 @@ dist/ *.pyc __pycache__/ .pytest_cache/ -tests/docs/api -tests/docs/build +tests/_docs/api +tests/_docs/build .tox/ diff --git a/tests/_docs/Makefile b/tests/_docs/Makefile new file mode 100644 index 0000000..e693b42 --- /dev/null +++ b/tests/_docs/Makefile @@ -0,0 +1,24 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -jauto -w $(BUILDDIR)/warnings.log --fail-on-warning +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: clean help Makefile + +clean: Makefile + rm -rf api + @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/tests/_docs/__init__.py b/tests/_docs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/_docs/conf.py b/tests/_docs/conf.py new file mode 100644 index 0000000..facc0c4 --- /dev/null +++ b/tests/_docs/conf.py @@ -0,0 +1,41 @@ +project = "dissect.archive" + +extensions = [ + "autoapi.extension", + "sphinx.ext.autodoc", + "sphinx.ext.autosectionlabel", + "sphinx.ext.doctest", + "sphinx.ext.napoleon", + "sphinx_argparse_cli", +] + +exclude_patterns = [] + +html_theme = "furo" + +autoapi_type = "python" +autoapi_dirs = ["../../dissect/"] +autoapi_ignore = ["*tests*", "*.tox*", "*venv*", "*examples*"] +autoapi_python_use_implicit_namespaces = True +autoapi_add_toctree_entry = False +autoapi_root = "api" +autoapi_options = [ + "members", + "undoc-members", + "show-inheritance", + "show-module-summary", + "special-members", + "imported-members", +] +autoapi_keep_files = True +autoapi_template_dir = "_templates/autoapi" + +autodoc_typehints = "signature" +autodoc_member_order = "groupwise" + +autosectionlabel_prefix_document = True + +suppress_warnings = [ + # https://github.com/readthedocs/sphinx-autoapi/issues/285 + "autoapi.python_import_resolution", +] diff --git a/tests/_docs/index.rst b/tests/_docs/index.rst new file mode 100644 index 0000000..ba16345 --- /dev/null +++ b/tests/_docs/index.rst @@ -0,0 +1,8 @@ +API Reference +============= + +.. toctree:: + :maxdepth: 1 + :glob: + + /api/*/*/index diff --git a/tox.ini b/tox.ini index e82fbf9..07a0873 100644 --- a/tox.ini +++ b/tox.ini @@ -62,12 +62,12 @@ deps = sphinx-design furo commands = - make -C tests/docs clean - make -C tests/docs html + make -C tests/_docs clean + make -C tests/_docs html [testenv:docs-linkcheck] allowlist_externals = make deps = {[testenv:docs-build]deps} commands = - make -C tests/docs clean - make -C tests/docs linkcheck + make -C tests/_docs clean + make -C tests/_docs linkcheck