From 9798149076f4815c49e2e3c295e2d220ad00505d Mon Sep 17 00:00:00 2001 From: Spencer Schrock Date: Wed, 24 Jul 2024 13:38:24 -0700 Subject: [PATCH] prevent serialization of symlinks by default (#251) * add serialization test for symlink model file Signed-off-by: Spencer Schrock * prevent serialization of symlinks by default This can be changed via the allow_symlink argument in the various serialization initializers. Signed-off-by: Spencer Schrock * convert symlink file fixture to symlink directory fixture Signed-off-by: Spencer Schrock * Address style and documentation feedback Signed-off-by: Spencer Schrock * add `Raises` section to serialize docstrings Signed-off-by: Spencer Schrock --------- Signed-off-by: Spencer Schrock --- model_signing/conftest.py | 12 +++++ .../serialization/serialize_by_file.py | 49 ++++++++++++++++--- .../serialization/serialize_by_file_shard.py | 35 +++++++++++-- .../serialize_by_file_shard_test.py | 23 +++++++-- .../serialization/serialize_by_file_test.py | 21 +++++++- .../TestDigestSerializer/symlink_model_folder | 1 + .../symlink_model_folder | 1 + .../TestDigestSerializer/symlink_model_folder | 1 + .../symlink_model_folder_small_shards | 1 + .../TestManifestSerializer/symlink_model_file | 1 + .../symlink_model_folder | 1 + .../symlink_model_folder_small_shards | 3 ++ model_signing/test_support.py | 1 + 13 files changed, 132 insertions(+), 18 deletions(-) create mode 100644 model_signing/serialization/testdata/serialize_by_file/TestDigestSerializer/symlink_model_folder create mode 100644 model_signing/serialization/testdata/serialize_by_file/TestManifestSerializer/symlink_model_folder create mode 100644 model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder create mode 100644 model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder_small_shards create mode 100644 model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_file create mode 100644 model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder create mode 100644 model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder_small_shards diff --git a/model_signing/conftest.py b/model_signing/conftest.py index 55f403ca..6fb1a428 100644 --- a/model_signing/conftest.py +++ b/model_signing/conftest.py @@ -14,6 +14,8 @@ """Test fixtures to share between tests. Not part of the public API.""" +import os +import pathlib import pytest from model_signing import test_support @@ -102,3 +104,13 @@ def deep_model_folder(tmp_path_factory): file.write_text(f"This is file f{i}.") return model_root + +@pytest.fixture +def symlink_model_folder(tmp_path_factory: pytest.TempPathFactory) -> pathlib.Path: + """A model folder with a symlink to an external file.""" + external_file = tmp_path_factory.mktemp("external") / "file" + external_file.write_bytes(test_support.KNOWN_MODEL_TEXT) + model_dir = tmp_path_factory.mktemp("model") + symlink_file = model_dir / "symlink_file" + os.symlink(external_file.absolute(), symlink_file.absolute()) + return model_dir diff --git a/model_signing/serialization/serialize_by_file.py b/model_signing/serialization/serialize_by_file.py index 19d54083..93c01352 100644 --- a/model_signing/serialization/serialize_by_file.py +++ b/model_signing/serialization/serialize_by_file.py @@ -27,7 +27,10 @@ from model_signing.serialization import serialization -def check_file_or_directory(path: pathlib.Path) -> None: +def check_file_or_directory( + path: pathlib.Path, + allow_symlinks: bool = False, +) -> None: """Checks that the given path is either a file or a directory. There is no support for sockets, pipes, or any other operating system @@ -38,10 +41,19 @@ def check_file_or_directory(path: pathlib.Path) -> None: Args: path: The path to check. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. Raises: - ValueError: The path is neither a file or a directory. + ValueError: The path is neither a file or a directory, or the path + is a symlink and `allow_symlinks` is false. """ + if not allow_symlinks and path.is_symlink(): + raise ValueError( + f"Cannot use '{path}' because it is a symlink. This" + " behavior can be changed with `allow_symlinks`." + ) if not (path.is_file() or path.is_dir()): raise ValueError( f"Cannot use '{path}' as file or directory. It could be a" @@ -87,6 +99,7 @@ def __init__( self, file_hasher_factory: Callable[[pathlib.Path], file.FileHasher], max_workers: int | None = None, + allow_symlinks: bool = False, ): """Initializes an instance to serialize a model with this serializer. @@ -95,15 +108,23 @@ def __init__( hash individual files. max_workers: Maximum number of workers to use in parallel. Default is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. """ self._hasher_factory = file_hasher_factory self._max_workers = max_workers + self._allow_symlinks = allow_symlinks @override def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: - # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks - # to exclude symlinks if desired. - check_file_or_directory(model_path) + """Serializes the model given by the `model_path` argument. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. + """ + check_file_or_directory(model_path, allow_symlinks=self._allow_symlinks) paths = [] if model_path.is_file(): @@ -114,7 +135,9 @@ def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: # with `pathlib.Path.walk` for a clearer interface, and some speed # improvement. for path in model_path.glob("**/*"): - check_file_or_directory(path) + check_file_or_directory( + path, allow_symlinks=self._allow_symlinks + ) if path.is_file(): paths.append(path) @@ -170,6 +193,10 @@ def serialize(self, model_path: pathlib.Path) -> manifest.FileLevelManifest: The only reason for the override is to change the return type, to be more restrictive. This is to signal that the only manifests that can be returned are `manifest.FileLevelManifest` instances. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. """ return cast(manifest.FileLevelManifest, super().serialize(model_path)) @@ -276,6 +303,7 @@ def __init__( self, file_hasher: file.SimpleFileHasher, merge_hasher_factory: Callable[[], hashing.StreamingHashEngine], + allow_symlinks: bool = False, ): """Initializes an instance to serialize a model with this serializer. @@ -284,13 +312,16 @@ def __init__( merge_hasher_factory: A callable that returns a `hashing.StreamingHashEngine` instance used to merge individual file digests to compute an aggregate digest. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. """ def _factory(path: pathlib.Path) -> file.FileHasher: file_hasher.set_file(path) return file_hasher - super().__init__(_factory, max_workers=1) + super().__init__(_factory, max_workers=1, allow_symlinks=allow_symlinks) self._merge_hasher_factory = merge_hasher_factory @override @@ -300,6 +331,10 @@ def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: The only reason for the override is to change the return type, to be more restrictive. This is to signal that the only manifests that can be returned are `manifest.DigestManifest` instances. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. """ return cast(manifest.DigestManifest, super().serialize(model_path)) diff --git a/model_signing/serialization/serialize_by_file_shard.py b/model_signing/serialization/serialize_by_file_shard.py index c16c1fdd..158267af 100644 --- a/model_signing/serialization/serialize_by_file_shard.py +++ b/model_signing/serialization/serialize_by_file_shard.py @@ -93,6 +93,7 @@ def __init__( [pathlib.Path, int, int], file.ShardedFileHasher ], max_workers: int | None = None, + allow_symlinks: bool = False, ): """Initializes an instance to serialize a model with this serializer. @@ -104,9 +105,13 @@ def __init__( the shard. max_workers: Maximum number of workers to use in parallel. Default is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. """ self._hasher_factory = sharded_hasher_factory self._max_workers = max_workers + self._allow_symlinks = allow_symlinks # Precompute some private values only once by using a mock file hasher. # None of the arguments used to build the hasher are used. @@ -115,9 +120,15 @@ def __init__( @override def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: - # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks - # to exclude symlinks if desired. - serialize_by_file.check_file_or_directory(model_path) + """Serializes the model given by the `model_path` argument. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. + """ + serialize_by_file.check_file_or_directory( + model_path, allow_symlinks=self._allow_symlinks + ) shards = [] if model_path.is_file(): @@ -128,7 +139,9 @@ def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: # with `pathlib.Path.walk` for a clearer interface, and some speed # improvement. for path in model_path.glob("**/*"): - serialize_by_file.check_file_or_directory(path) + serialize_by_file.check_file_or_directory( + path, allow_symlinks=self._allow_symlinks + ) if path.is_file(): shards.extend(self._get_shards(path)) @@ -207,6 +220,10 @@ def serialize( The only reason for the override is to change the return type, to be more restrictive. This is to signal that the only manifests that can be returned are `manifest.FileLevelManifest` instances. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. """ return cast(manifest.ShardLevelManifest, super().serialize(model_path)) @@ -230,6 +247,7 @@ def __init__( ], merge_hasher: hashing.StreamingHashEngine, max_workers: int | None = None, + allow_symlinks: bool = False, ): """Initializes an instance to serialize a model with this serializer. @@ -243,8 +261,11 @@ def __init__( individual file shard digests to compute an aggregate digest. max_workers: Maximum number of workers to use in parallel. Default is to defer to the `concurent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. """ - super().__init__(file_hasher_factory, max_workers) + super().__init__(file_hasher_factory, max_workers, allow_symlinks) self._merge_hasher = merge_hasher @override @@ -254,6 +275,10 @@ def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: The only reason for the override is to change the return type, to be more restrictive. This is to signal that the only manifests that can be returned are `manifest.FileLevelManifest` instances. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. """ return cast(manifest.DigestManifest, super().serialize(model_path)) diff --git a/model_signing/serialization/serialize_by_file_shard_test.py b/model_signing/serialization/serialize_by_file_shard_test.py index a0d1dc11..f9e68c61 100644 --- a/model_signing/serialization/serialize_by_file_shard_test.py +++ b/model_signing/serialization/serialize_by_file_shard_test.py @@ -58,7 +58,7 @@ def test_known_models(self, request, model_fixture_name): # Compute model manifest (act) serializer = serialize_by_file_shard.DigestSerializer( - self._hasher_factory, memory.SHA256() + self._hasher_factory, memory.SHA256(), allow_symlinks=True ) manifest = serializer.serialize(model) @@ -84,7 +84,9 @@ def test_known_models_small_shards(self, request, model_fixture_name): # Compute model manifest (act) serializer = serialize_by_file_shard.DigestSerializer( - self._hasher_factory_small_shards, memory.SHA256() + self._hasher_factory_small_shards, + memory.SHA256(), + allow_symlinks=True, ) manifest = serializer.serialize(model) @@ -299,6 +301,13 @@ def test_shard_size_changes_digests(self, sample_model_folder): assert manifest1.digest.digest_value != manifest2.digest.digest_value + def test_symlinks_disallowed_by_default(self, symlink_model_folder): + serializer = serialize_by_file_shard.DigestSerializer( + self._hasher_factory, memory.SHA256() + ) + with pytest.raises(ValueError): + _ = serializer.serialize(symlink_model_folder) + def _extract_shard_items_from_manifest( manifest: manifest.ShardLevelManifest, @@ -357,7 +366,7 @@ def test_known_models(self, request, model_fixture_name): # Compute model manifest (act) serializer = serialize_by_file_shard.ManifestSerializer( - self._hasher_factory + self._hasher_factory, allow_symlinks=True ) manifest_file = serializer.serialize(model) items = _extract_shard_items_from_manifest(manifest_file) @@ -388,7 +397,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): # Compute model manifest (act) serializer = serialize_by_file_shard.ManifestSerializer( - self._hasher_factory_small_shards + self._hasher_factory_small_shards, allow_symlinks=True ) manifest_file = serializer.serialize(model) items = _extract_shard_items_from_manifest(manifest_file) @@ -654,6 +663,12 @@ def test_max_workers_does_not_change_digest(self, sample_model_folder): assert manifest1 == manifest2 assert manifest1 == manifest3 + def test_symlinks_disallowed_by_default(self, symlink_model_folder): + serializer = serialize_by_file_shard.ManifestSerializer( + self._hasher_factory + ) + with pytest.raises(ValueError): + _ = serializer.serialize(symlink_model_folder) def test_shard_to_string(self): """Ensure the shard's `__str__` method behaves as assumed.""" diff --git a/model_signing/serialization/serialize_by_file_test.py b/model_signing/serialization/serialize_by_file_test.py index 426e90b5..d83179c1 100644 --- a/model_signing/serialization/serialize_by_file_test.py +++ b/model_signing/serialization/serialize_by_file_test.py @@ -48,7 +48,7 @@ def test_known_models(self, request, model_fixture_name): test_support.UNUSED_PATH, memory.SHA256() ) serializer = serialize_by_file.DigestSerializer( - file_hasher, memory.SHA256 + file_hasher, memory.SHA256, allow_symlinks=True ) manifest = serializer.serialize(model) @@ -275,6 +275,16 @@ def test_model_with_empty_folder_hashes_differently_than_with_empty_file( assert folder_manifest != file_manifest + def test_symlinks_disallowed_by_default(self, symlink_model_folder): + file_hasher = file.SimpleFileHasher( + test_support.UNUSED_PATH, memory.SHA256() + ) + serializer = serialize_by_file.DigestSerializer( + file_hasher, memory.SHA256 + ) + with pytest.raises(ValueError): + _ = serializer.serialize(symlink_model_folder) + class TestManifestSerializer: @@ -292,7 +302,9 @@ def test_known_models(self, request, model_fixture_name): model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) + serializer = serialize_by_file.ManifestSerializer( + self._hasher_factory, allow_symlinks=True + ) manifest = serializer.serialize(model) items = test_support.extract_items_from_manifest(manifest) @@ -536,6 +548,11 @@ def test_max_workers_does_not_change_digest(self, sample_model_folder): assert manifest1 == manifest2 assert manifest1 == manifest3 + def test_symlinks_disallowed_by_default(self, symlink_model_folder): + serializer = serialize_by_file.ManifestSerializer(self._hasher_factory) + with pytest.raises(ValueError): + _ = serializer.serialize(symlink_model_folder) + class TestUtilities: diff --git a/model_signing/serialization/testdata/serialize_by_file/TestDigestSerializer/symlink_model_folder b/model_signing/serialization/testdata/serialize_by_file/TestDigestSerializer/symlink_model_folder new file mode 100644 index 00000000..c061cc10 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file/TestDigestSerializer/symlink_model_folder @@ -0,0 +1 @@ +8372365be7578241d18db47ec83b735bb450a10a1b4298d9b7b0d8bf543b7271 diff --git a/model_signing/serialization/testdata/serialize_by_file/TestManifestSerializer/symlink_model_folder b/model_signing/serialization/testdata/serialize_by_file/TestManifestSerializer/symlink_model_folder new file mode 100644 index 00000000..f3982d94 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file/TestManifestSerializer/symlink_model_folder @@ -0,0 +1 @@ +symlink_file:3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder new file mode 100644 index 00000000..af0d255b --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder @@ -0,0 +1 @@ +013322ae2977a76252119aa6a4d71044599c7c2d890f7ed96215b52308ee7142 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder_small_shards new file mode 100644 index 00000000..022aaddf --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestDigestSerializer/symlink_model_folder_small_shards @@ -0,0 +1 @@ +de9d3fc1608836778e17540ac0332b42bf01730d4697767571b89460fae92fc3 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_file new file mode 100644 index 00000000..6ef95249 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_file @@ -0,0 +1 @@ +.:0:22:3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder new file mode 100644 index 00000000..d1783d53 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder @@ -0,0 +1 @@ +symlink_file:0:22:3aab065c7181a173b5dd9e9d32a9f79923440b413be1e1ffcdba26a7365f719b diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder_small_shards new file mode 100644 index 00000000..78cb37c4 --- /dev/null +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestManifestSerializer/symlink_model_folder_small_shards @@ -0,0 +1,3 @@ +symlink_file:0:8:a37010c994067764d86540bf479d93b4d0c3bb3955de7b61f951caf2fd0301b0 +symlink_file:8:16:bd762002a3528a27fb9a8822f822b949d3c9ab7e860af33039c9aa70ebbbe682 +symlink_file:16:22:a791e1e893ea4260c77475725101fb4cc6ad85f6340f21f10b239184e318cd21 diff --git a/model_signing/test_support.py b/model_signing/test_support.py index 6d8aefaf..d41e98b5 100644 --- a/model_signing/test_support.py +++ b/model_signing/test_support.py @@ -39,6 +39,7 @@ "empty_model_file", "empty_model_folder", "model_folder_with_empty_file", + "symlink_model_folder", ]