From edd54d83a5d3e38277a2f7f1681c5a0ccd359e86 Mon Sep 17 00:00:00 2001 From: Sanskar Shukla Date: Sat, 21 Feb 2026 03:02:38 +0530 Subject: [PATCH] diff(): validate manifest keys to prevent path traversal --- library/src/iqb/ghremote/diff.py | 6 +++ library/tests/iqb/cli/cache_pull_test.py | 23 ++++++++++ library/tests/iqb/ghremote/diff_test.py | 56 ++++++++++++++++++++++++ 3 files changed, 85 insertions(+) diff --git a/library/src/iqb/ghremote/diff.py b/library/src/iqb/ghremote/diff.py index 2f679ea..4abe9cb 100644 --- a/library/src/iqb/ghremote/diff.py +++ b/library/src/iqb/ghremote/diff.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging import re from collections.abc import Callable, Iterator from dataclasses import dataclass @@ -10,6 +11,8 @@ from .cache import FileEntry, Manifest, compute_sha256 +log = logging.getLogger("iqb.ghremote.diff") + class DiffState(str, Enum): """State of a diff entry comparing manifest vs local cache.""" @@ -108,6 +111,9 @@ def diff( # Phase 2: iterate accepted manifest keys in sorted order seen: set[str] = set() for key in sorted(manifest.files): + if not _validate_cache_path(key): + log.warning("skipping invalid manifest key: %s", key) + continue if acceptp is not None and not acceptp(key): continue seen.add(key) diff --git a/library/tests/iqb/cli/cache_pull_test.py b/library/tests/iqb/cli/cache_pull_test.py index c989b77..f719dad 100644 --- a/library/tests/iqb/cli/cache_pull_test.py +++ b/library/tests/iqb/cli/cache_pull_test.py @@ -75,6 +75,29 @@ def test_nothing_to_download(self, tmp_path: Path): assert "Nothing to download" in result.output +class TestCachePullInvalidManifestKeys: + """Invalid manifest keys are ignored for safety.""" + + @patch("iqb.cli.cache_pull.requests.Session") + def test_traversal_key_is_ignored(self, mock_session_cls: MagicMock, tmp_path: Path): + _write_manifest( + tmp_path, + { + "../../etc/passwd": { + "sha256": _sha256(b"malicious"), + "url": "https://example.com/a", + } + }, + ) + + runner = CliRunner() + result = runner.invoke(cli, ["cache", "pull", "-d", str(tmp_path)]) + + assert result.exit_code == 0 + assert "Nothing to download" in result.output + mock_session_cls.assert_not_called() + + class TestCachePullOnlyRemote: """ONLY_REMOTE entries are downloaded.""" diff --git a/library/tests/iqb/ghremote/diff_test.py b/library/tests/iqb/ghremote/diff_test.py index edf1a6c..e6d5dac 100644 --- a/library/tests/iqb/ghremote/diff_test.py +++ b/library/tests/iqb/ghremote/diff_test.py @@ -258,6 +258,62 @@ def test_returns_iterator(self, tmp_path: Path): assert not isinstance(result, (list, tuple)) +class TestDiffInvalidManifestKeys: + """Invalid manifest keys are ignored for safety.""" + + def test_traversal_manifest_key_is_ignored(self, tmp_path: Path): + manifest = Manifest( + v=0, + files={ + "../../etc/passwd": FileEntry( + sha256="0" * 64, + url="https://example.com/passwd", + ) + }, + ) + + results = list(diff(manifest, tmp_path)) + + assert results == [] + + def test_dotdot_inside_cache_path_is_ignored(self, tmp_path: Path): + manifest = Manifest( + v=0, + files={ + f"cache/v1/{_TS1}/{_TS2}/{_NAME}/../data.parquet": FileEntry( + sha256="0" * 64, + url="https://example.com/a", + ) + }, + ) + + results = list(diff(manifest, tmp_path)) + + assert results == [] + + def test_mixed_valid_and_invalid_manifest_keys(self, tmp_path: Path): + content = b"remote content" + manifest = Manifest( + v=0, + files={ + "../../etc/passwd": FileEntry( + sha256="0" * 64, + url="https://example.com/passwd", + ), + _FILE_A: FileEntry( + sha256=_sha256(content), + url="https://example.com/a.parquet", + ), + }, + ) + + results = list(diff(manifest, tmp_path)) + + assert len(results) == 1 + assert results[0].file == _FILE_A + assert results[0].state == DiffState.ONLY_REMOTE + + class TestValidateCachePath: """Tests for _validate_cache_path covering all branches."""