From 3344e2e3ee28c0e10a8fd4b167fce664253f093a Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 24 Jul 2024 13:00:41 -0700 Subject: [PATCH] Expose resource descriptors from manifests We aim this to be similar to in-toto's `ResourceDescriptor`. To support cases where in-toto cannot be directly used, we make this a dataclass that can be mapped to in-toto when needed, and used as its own otherwise. Not all fields from in-toto are specified at this moment. All fields here must be present, unlike in-toto, where all are optional. See https://github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md for the in-toto specification. This is the first separable PR for the signing support (see full draft on #253) Signed-off-by: Mihai Maruseac --- .github/workflows/lint.yml | 4 +- model_signing/manifest/manifest.py | 64 ++++++++++- model_signing/manifest/manifest_test.py | 100 ++++++++++++++++++ .../serialize_by_file_shard_test.py | 2 +- .../serialization/serialize_by_file_test.py | 2 +- model_signing/signing/__init__.py | 13 +++ model_signing/signing/signing.py | 80 ++++++++++++++ model_signing/test_support.py | 19 ++++ 8 files changed, 278 insertions(+), 6 deletions(-) create mode 100644 model_signing/signing/__init__.py create mode 100644 model_signing/signing/signing.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e81d83fd..268b87e9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -65,7 +65,7 @@ jobs: pip install -r model_signing/install/requirements_test_Linux.txt pip install -r model_signing/install/requirements_dev_Linux.txt # TODO: https://github.com/sigstore/model-transparency/issues/231 - Support all repo - pytype --keep-going model_signing/{hashing,manifest,serialization} + pytype --keep-going model_signing/{hashing,manifest,serialization,signing} pylint-lint: runs-on: ubuntu-latest @@ -85,4 +85,4 @@ jobs: pip install -r model_signing/install/requirements_dev_Linux.txt # TODO: 
https://github.com/sigstore/model-transparency/issues/231 - Support all repo # We should actually migrate to ruff, but that's configured via pyproject.toml which we use when we release the wheel - pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization} + pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization,signing} diff --git a/model_signing/manifest/manifest.py b/model_signing/manifest/manifest.py index 46ac3ed1..fbc6517b 100644 --- a/model_signing/manifest/manifest.py +++ b/model_signing/manifest/manifest.py @@ -55,15 +55,44 @@ from collections.abc import Iterable import dataclasses import pathlib -from typing import Self +from typing import Iterator, Self +from typing_extensions import override from model_signing.hashing import hashing +@dataclasses.dataclass(frozen=True) +class ResourceDescriptor: + """A description of any content from any `Manifest`. + + We aim this to be similar to in-toto's `ResourceDescriptor`. To support + cases where in-toto cannot be directly used, we make this a dataclass that + can be mapped to in-toto when needed, and used as its own otherwise. + + Not all fields from in-toto are specified at this moment. All fields here + must be present, unlike in-toto, where all are optional. + + See github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md + for the in-toto specification. + + Attributes: + identifier: A string that uniquely identifies this `ResourceDescriptor`. + Corresponds to `name`, `uri`, or `content` in in-toto specification. + digest: One digest for the item. Note that unlike in-toto, we only have + one digest for the item and it is always required. 
+ """ + + identifier: str + digest: hashing.Digest + + class Manifest(metaclass=abc.ABCMeta): """Generic manifest file to represent a model.""" - pass + @abc.abstractmethod + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one.""" + pass @dataclasses.dataclass(frozen=True) @@ -72,6 +101,17 @@ class DigestManifest(Manifest): digest: hashing.Digest + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. + + In this case, we have only one descriptor to return. Since model paths + are already encoded in the digest, use "." for the identifier. Subclasses + might record additional fields to have distinguishable human readable + identifiers. + """ + yield ResourceDescriptor(identifier=".", digest=self.digest) + class ItemizedManifest(Manifest): """A detailed manifest, recording integrity of every model component.""" @@ -130,6 +170,15 @@ def __init__(self, items: Iterable[FileManifestItem]): def __eq__(self, other: Self): return self._item_to_digest == other._item_to_digest + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. + + The items are returned in alphabetical order of the path. + """ + for item, digest in sorted(self._item_to_digest.items()): + yield ResourceDescriptor(identifier=str(item), digest=digest) + @dataclasses.dataclass(frozen=True, order=True) class Shard: @@ -200,3 +249,14 @@ def __init__(self, items: Iterable[ShardedFileManifestItem]): efficient updates and retrieval of digests. """ self._item_to_digest = {item.input_tuple: item.digest for item in items} + + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. 
+ + The items are returned in the order given by the `input_tuple` property + of `ShardedFileManifestItem` used to create this instance (the triple of + file name and shard endpoints). + """ + for item, digest in sorted(self._item_to_digest.items()): + yield ResourceDescriptor(identifier=str(item), digest=digest) diff --git a/model_signing/manifest/manifest_test.py b/model_signing/manifest/manifest_test.py index f097c8de..ec5c8993 100644 --- a/model_signing/manifest/manifest_test.py +++ b/model_signing/manifest/manifest_test.py @@ -13,11 +13,31 @@ # limitations under the License. import pathlib +import pytest from model_signing.hashing import hashing from model_signing.manifest import manifest +class TestDigestManifest: + + def test_manifest_has_just_one_resource_descriptor(self): + digest = hashing.Digest("test", b"test_digest") + manifest_file = manifest.DigestManifest(digest) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == 1 + + def test_manifest_has_the_correct_resource_descriptor(self): + digest = hashing.Digest("test", b"test_digest") + manifest_file = manifest.DigestManifest(digest) + + for descriptor in manifest_file.resource_descriptors(): + assert descriptor.identifier == "." 
+ assert descriptor.digest == digest + + class TestFileLevelManifest: def test_insert_order_does_not_matter(self): @@ -34,6 +54,39 @@ def test_insert_order_does_not_matter(self): assert manifest1 == manifest2 + @pytest.mark.parametrize("num_items", [1, 3, 5]) + def test_manifest_has_all_resource_descriptors(self, num_items): + items: list[manifest.FileManifestItem] = [] + for i in range(num_items): + path = pathlib.PurePath(f"file{i}") + digest = hashing.Digest("test", b"hash{i}") + item = manifest.FileManifestItem(path=path, digest=digest) + items.append(item) + manifest_file = manifest.FileLevelManifest(items) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == num_items + + def test_manifest_has_the_correct_resource_descriptors(self): + path1 = pathlib.PurePath("file1") + digest1 = hashing.Digest("test", b"hash1") + item1 = manifest.FileManifestItem(path=path1, digest=digest1) + + path2 = pathlib.PurePath("file2") + digest2 = hashing.Digest("test", b"hash2") + item2 = manifest.FileManifestItem(path=path2, digest=digest2) + + # Note order is reversed + manifest_file = manifest.FileLevelManifest([item2, item1]) + descriptors = list(manifest_file.resource_descriptors()) + + # But we expect the descriptors to be in order by file + assert descriptors[0].identifier == "file1" + assert descriptors[1].identifier == "file2" + assert descriptors[0].digest.digest_value == b"hash1" + assert descriptors[1].digest.digest_value == b"hash2" + class TestShardLevelManifest: @@ -70,3 +123,50 @@ def test_same_path_different_shards_gives_different_manifest(self): manifest2 = manifest.ShardLevelManifest([item]) assert manifest1 != manifest2 + + @pytest.mark.parametrize("num_items", [1, 3, 5]) + def test_manifest_has_all_resource_descriptors(self, num_items): + items: list[manifest.ShardedFileManifestItem] = [] + for i in range(num_items): + path = pathlib.PurePath("file") + digest = hashing.Digest("test", b"hash{i}") + item = 
manifest.ShardedFileManifestItem( + path=path, digest=digest, start=i, end=i + 2 + ) + items.append(item) + manifest_file = manifest.ShardLevelManifest(items) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == num_items + + def test_manifest_has_the_correct_resource_descriptors(self): + path1 = pathlib.PurePath("file1") + digest1 = hashing.Digest("test", b"hash1") + item1 = manifest.ShardedFileManifestItem( + path=path1, digest=digest1, start=0, end=4 + ) + + path2 = pathlib.PurePath("file2") + digest2 = hashing.Digest("test", b"hash2") + item2 = manifest.ShardedFileManifestItem( + path=path2, digest=digest2, start=0, end=4 + ) + + # First file, but second shard + digest3 = hashing.Digest("test", b"hash3") + item3 = manifest.ShardedFileManifestItem( + path=path1, digest=digest3, start=4, end=8 + ) + + # Note order is reversed + manifest_file = manifest.ShardLevelManifest([item3, item2, item1]) + descriptors = list(manifest_file.resource_descriptors()) + + # But we expect the descriptors to be in order by file shard + assert descriptors[0].identifier == "file1:0:4" + assert descriptors[1].identifier == "file1:4:8" + assert descriptors[2].identifier == "file2:0:4" + assert descriptors[0].digest.digest_value == b"hash1" + assert descriptors[1].digest.digest_value == b"hash3" + assert descriptors[2].digest.digest_value == b"hash2" diff --git a/model_signing/serialization/serialize_by_file_shard_test.py b/model_signing/serialization/serialize_by_file_shard_test.py index f9e68c61..94934655 100644 --- a/model_signing/serialization/serialize_by_file_shard_test.py +++ b/model_signing/serialization/serialize_by_file_shard_test.py @@ -23,11 +23,11 @@ import pathlib import pytest +from model_signing import test_support from model_signing.hashing import file from model_signing.hashing import memory from model_signing.manifest import manifest from model_signing.serialization import serialize_by_file_shard -from model_signing import 
test_support class TestDigestSerializer: diff --git a/model_signing/serialization/serialize_by_file_test.py b/model_signing/serialization/serialize_by_file_test.py index d83179c1..304ddbca 100644 --- a/model_signing/serialization/serialize_by_file_test.py +++ b/model_signing/serialization/serialize_by_file_test.py @@ -24,11 +24,11 @@ import pathlib import pytest +from model_signing import test_support from model_signing.hashing import file from model_signing.hashing import memory from model_signing.manifest import manifest from model_signing.serialization import serialize_by_file -from model_signing import test_support class TestDigestSerializer: diff --git a/model_signing/signing/__init__.py b/model_signing/signing/__init__.py new file mode 100644 index 00000000..0888a055 --- /dev/null +++ b/model_signing/signing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_signing/signing/signing.py b/model_signing/signing/signing.py new file mode 100644 index 00000000..b324ec58 --- /dev/null +++ b/model_signing/signing/signing.py @@ -0,0 +1,80 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Machinery for signing and verification of ML models. + +The serialization API produces a manifest representation of the models, which +can be used to implement various verification patterns. However, when signing, +we need to actually represent this manifest in a specific disk format. But, +there are multiple ways to use `manifest.Manifest` objects, so we add a new +`SigningMaterial` class hierarchy to serialize and sign manifests. + +The output of a signing process is a `Signature` instance, backed by a format to +serialize this to disk. In OSS, this is usually a Sigstore bundle. + +TODO: expand on this. +""" + +import abc +import pathlib +from typing import Self + +from model_signing.manifest import manifest + + +class SigningMaterial(metaclass=abc.ABCMeta): + """Generic material that we can sign.""" + + @classmethod + @abc.abstractmethod + def from_manifest(cls, manifest: manifest.Manifest) -> Self: + """Converts a manifest to the signing material used for signing.""" + pass + + @abc.abstractmethod + def sign(self) -> "Signature": + """Signs the current SigningMaterial with the provided key/signer. + + TODO: arguments, abstract over signing format, etc. + """ + pass + + +class Signature(metaclass=abc.ABCMeta): + """Generic signature support.""" + + @abc.abstractmethod + def write_signature(self, path: pathlib.Path): + """Writes the signature to disk, to the given path.""" + pass + + @classmethod + @abc.abstractmethod + def read_signature(cls, path: pathlib.Path) -> Self: + """Reads the signature from disk. 
+ + Does not perform any verification, except what is needed to parse the + signature file. Use `verify` to validate the signature. + """ + pass + + @abc.abstractmethod + def verify(self): # TODO: signature + """Verifies the signature. + + If the verification passes, this method returns TODO: what? + + TODO: Document return and raises. + """ + pass diff --git a/model_signing/test_support.py b/model_signing/test_support.py index d41e98b5..44fcd940 100644 --- a/model_signing/test_support.py +++ b/model_signing/test_support.py @@ -43,6 +43,25 @@ ] +# All directory models to use in testing, where only non empty directory models +# are supported. See also `all_test_models` comments. +all_non_empty_directory_test_models = [ + "sample_model_folder", + "deep_model_folder", + "model_folder_with_empty_file", + "symlink_model_folder", +] + + +# All directory models to use in testing, where only non empty directory models +# are supported. See also `all_test_models` comments. +all_non_empty_directory_test_models = [ + "sample_model_folder", + "deep_model_folder", + "model_folder_with_empty_file", +] + + def get_first_directory(path: pathlib.Path) -> pathlib.Path: """Returns the first directory that is a children of path.