From 70b1ddb509596e7537d7df9864f4dcea92b30242 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 24 Jul 2024 13:00:41 -0700 Subject: [PATCH] Expose resource descriptors from manifests We aim this to be similar to in-toto's `ResourceDescriptor`. To support cases where in-toto cannot be directly used, we make this a dataclass that can be mapped to in-toto when needed, and used as its own otherwise. Not all fields from in-toto are specified at this moment. All fields here must be present, unlike in-toto, where all are optional. See https://github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md for the in-toto specification. This is the first separable PR for the signing support (see full draft on #253) Signed-off-by: Mihai Maruseac --- .github/workflows/lint.yml | 4 +- model_signing/manifest/manifest.py | 70 +++++++- model_signing/manifest/manifest_test.py | 100 +++++++++++ .../serialize_by_file_shard_test.py | 2 +- .../serialization/serialize_by_file_test.py | 2 +- model_signing/signing/__init__.py | 13 ++ model_signing/signing/signing.py | 156 ++++++++++++++++++ model_signing/test_support.py | 10 ++ 8 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 model_signing/signing/__init__.py create mode 100644 model_signing/signing/signing.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e81d83fd..268b87e9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -65,7 +65,7 @@ jobs: pip install -r model_signing/install/requirements_test_Linux.txt pip install -r model_signing/install/requirements_dev_Linux.txt # TODO: https://github.com/sigstore/model-transparency/issues/231 - Support all repo - pytype --keep-going model_signing/{hashing,manifest,serialization} + pytype --keep-going model_signing/{hashing,manifest,serialization,signing} pylint-lint: runs-on: ubuntu-latest @@ -85,4 +85,4 @@ jobs: pip install -r model_signing/install/requirements_dev_Linux.txt # TODO: 
https://github.com/sigstore/model-transparency/issues/231 - Support all repo # We should actually migrate to ruff, but that's configured via pyproject.toml which we use when we release the wheel - pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization} + pylint --disable C0114,C0115,C0116,R0801,R0903,R0904,R0913,R0914,R1721,R1737,W0107,W0212,W0223,W0231,W0511,W0621 model_signing/{hashing,manifest,serialization,signing} diff --git a/model_signing/manifest/manifest.py b/model_signing/manifest/manifest.py index 46ac3ed1..5461c866 100644 --- a/model_signing/manifest/manifest.py +++ b/model_signing/manifest/manifest.py @@ -52,18 +52,53 @@ """ import abc -from collections.abc import Iterable +from collections.abc import Iterable, Iterator import dataclasses import pathlib from typing import Self +from typing_extensions import override from model_signing.hashing import hashing +@dataclasses.dataclass(frozen=True) +class ResourceDescriptor: + """A description of any content from any `Manifest`. + + We aim this to be similar to in-toto's `ResourceDescriptor`. To support + cases where in-toto cannot be directly used, we make this a dataclass that + can be mapped to in-toto when needed, and used as its own otherwise. + + Not all fields from in-toto are specified at this moment. All fields here + must be present, unlike in-toto, where all are optional. + + See github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md + for the in-toto specification. + + Attributes: + identifier: A string that uniquely identifies this `ResourceDescriptor` + within the manifest. Depending on serialized format, users might + require the identifier to be unique across all manifests stored in a + system. 
Producers and consumers can agree on additional requirements + (e.g., several descriptors must have a common pattern for the + identifier and the integrity of the model implies integrity of all + these items, ignoring any other descriptor). Corresponds to `name`, + `uri`, or `content` in in-toto specification. + digest: One digest for the item. Note that unlike in-toto, we only have + one digest for the item and it is always required. + """ + + identifier: str + digest: hashing.Digest + + class Manifest(metaclass=abc.ABCMeta): """Generic manifest file to represent a model.""" - pass + @abc.abstractmethod + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one.""" + pass @dataclasses.dataclass(frozen=True) @@ -72,6 +107,17 @@ class DigestManifest(Manifest): digest: hashing.Digest + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. + + In this case, we have only one descriptor to return. Since model paths + are already encoded in the digest, use "." for the identifier. + Subclasses might record additional fields to have distinguishable human + readable identifiers. + """ + yield ResourceDescriptor(identifier=".", digest=self.digest) + class ItemizedManifest(Manifest): """A detailed manifest, recording integrity of every model component.""" @@ -130,6 +176,15 @@ def __init__(self, items: Iterable[FileManifestItem]): def __eq__(self, other: Self): return self._item_to_digest == other._item_to_digest + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. + + The items are returned in alphabetical order of the path. 
+ """ + for item, digest in sorted(self._item_to_digest.items()): + yield ResourceDescriptor(identifier=str(item), digest=digest) + @dataclasses.dataclass(frozen=True, order=True) class Shard: @@ -200,3 +255,14 @@ def __init__(self, items: Iterable[ShardedFileManifestItem]): efficient updates and retrieval of digests. """ self._item_to_digest = {item.input_tuple: item.digest for item in items} + + @override + def resource_descriptors(self) -> Iterator[ResourceDescriptor]: + """Yields each resource from the manifest, one by one. + + The items are returned in the order given by the `Shard` dataclass + (implicit ordering: by file, shard start offset and shard end offset, in + order). + """ + for item, digest in sorted(self._item_to_digest.items()): + yield ResourceDescriptor(identifier=str(item), digest=digest) diff --git a/model_signing/manifest/manifest_test.py b/model_signing/manifest/manifest_test.py index f097c8de..b7f36230 100644 --- a/model_signing/manifest/manifest_test.py +++ b/model_signing/manifest/manifest_test.py @@ -13,11 +13,31 @@ # limitations under the License. import pathlib +import pytest from model_signing.hashing import hashing from model_signing.manifest import manifest +class TestDigestManifest: + + def test_manifest_has_just_one_resource_descriptor(self): + digest = hashing.Digest("test", b"test_digest") + manifest_file = manifest.DigestManifest(digest) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == 1 + + def test_manifest_has_the_correct_resource_descriptor(self): + digest = hashing.Digest("test", b"test_digest") + manifest_file = manifest.DigestManifest(digest) + + for descriptor in manifest_file.resource_descriptors(): + assert descriptor.identifier == "." 
+ assert descriptor.digest == digest + + class TestFileLevelManifest: def test_insert_order_does_not_matter(self): @@ -34,6 +54,39 @@ def test_insert_order_does_not_matter(self): assert manifest1 == manifest2 + @pytest.mark.parametrize("num_items", [1, 3, 5]) + def test_manifest_has_all_resource_descriptors(self, num_items): + items: list[manifest.FileManifestItem] = [] + for i in range(num_items): + path = pathlib.PurePath(f"file{i}") + digest = hashing.Digest("test", b"hash{i}") + item = manifest.FileManifestItem(path=path, digest=digest) + items.append(item) + manifest_file = manifest.FileLevelManifest(items) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == num_items + + def test_manifest_has_the_correct_resource_descriptors(self): + path1 = pathlib.PurePath("file1") + digest1 = hashing.Digest("test", b"hash1") + item1 = manifest.FileManifestItem(path=path1, digest=digest1) + + path2 = pathlib.PurePath("file2") + digest2 = hashing.Digest("test", b"hash2") + item2 = manifest.FileManifestItem(path=path2, digest=digest2) + + # Note order is reversed + manifest_file = manifest.FileLevelManifest([item2, item1]) + descriptors = list(manifest_file.resource_descriptors()) + + # But we expect the descriptors to be in order by file + assert descriptors[0].identifier == "file1" + assert descriptors[1].identifier == "file2" + assert descriptors[0].digest.digest_value == b"hash1" + assert descriptors[1].digest.digest_value == b"hash2" + class TestShardLevelManifest: @@ -70,3 +123,50 @@ def test_same_path_different_shards_gives_different_manifest(self): manifest2 = manifest.ShardLevelManifest([item]) assert manifest1 != manifest2 + + @pytest.mark.parametrize("num_items", [1, 3, 5]) + def test_manifest_has_all_resource_descriptors(self, num_items): + items: list[manifest.ShardedFileManifestItem] = [] + for i in range(num_items): + path = pathlib.PurePath("file") + digest = hashing.Digest("test", b"hash{i}") + item = 
manifest.ShardedFileManifestItem( + path=path, digest=digest, start=i, end=i + 2 + ) + items.append(item) + manifest_file = manifest.ShardLevelManifest(items) + + descriptors = list(manifest_file.resource_descriptors()) + + assert len(descriptors) == num_items + + def test_manifest_has_the_correct_resource_descriptors(self): + path_to_file1 = pathlib.PurePath("file1") + digest1 = hashing.Digest("test", b"hash1") + item1 = manifest.ShardedFileManifestItem( + path=path_to_file1, digest=digest1, start=0, end=4 + ) + + # First file, but second shard + digest2 = hashing.Digest("test", b"hash2") + item2 = manifest.ShardedFileManifestItem( + path=path_to_file1, digest=digest2, start=4, end=8 + ) + + path_to_file2 = pathlib.PurePath("file2") + digest3 = hashing.Digest("test", b"hash3") + item3 = manifest.ShardedFileManifestItem( + path=path_to_file2, digest=digest3, start=0, end=4 + ) + + # Note order is not preserved (random permutation) + manifest_file = manifest.ShardLevelManifest([item2, item3, item1]) + descriptors = list(manifest_file.resource_descriptors()) + + # But we expect the descriptors to be in order by file shard + assert descriptors[0].identifier == "file1:0:4" + assert descriptors[1].identifier == "file1:4:8" + assert descriptors[2].identifier == "file2:0:4" + assert descriptors[0].digest.digest_value == b"hash1" + assert descriptors[1].digest.digest_value == b"hash2" + assert descriptors[2].digest.digest_value == b"hash3" diff --git a/model_signing/serialization/serialize_by_file_shard_test.py b/model_signing/serialization/serialize_by_file_shard_test.py index f9e68c61..94934655 100644 --- a/model_signing/serialization/serialize_by_file_shard_test.py +++ b/model_signing/serialization/serialize_by_file_shard_test.py @@ -23,11 +23,11 @@ import pathlib import pytest +from model_signing import test_support from model_signing.hashing import file from model_signing.hashing import memory from model_signing.manifest import manifest from 
model_signing.serialization import serialize_by_file_shard -from model_signing import test_support class TestDigestSerializer: diff --git a/model_signing/serialization/serialize_by_file_test.py b/model_signing/serialization/serialize_by_file_test.py index d83179c1..304ddbca 100644 --- a/model_signing/serialization/serialize_by_file_test.py +++ b/model_signing/serialization/serialize_by_file_test.py @@ -24,11 +24,11 @@ import pathlib import pytest +from model_signing import test_support from model_signing.hashing import file from model_signing.hashing import memory from model_signing.manifest import manifest from model_signing.serialization import serialize_by_file -from model_signing import test_support class TestDigestSerializer: diff --git a/model_signing/signing/__init__.py b/model_signing/signing/__init__.py new file mode 100644 index 00000000..0888a055 --- /dev/null +++ b/model_signing/signing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/model_signing/signing/signing.py b/model_signing/signing/signing.py new file mode 100644 index 00000000..e7aa4c24 --- /dev/null +++ b/model_signing/signing/signing.py @@ -0,0 +1,156 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Machinery for signing and verification of ML models. + +The serialization API produces a manifest representation of the models, which +can be used to implement various verification patterns. However, when signing, +we need to actually represent this manifest in a specific disk format. But, +there are multiple ways to use `manifest.Manifest` objects, so we add a new +`SigningMaterial` class hierarchy to serialize and sign manifests. + +The output of a signing process is a `Signature` instance, backed by a format to +serialize this to disk. In OSS, this is usually a Sigstore bundle. + +TODO: expand on this. +""" + +import abc +import pathlib +from typing import Self + +from model_signing.manifest import manifest + + +class SigningMaterial(metaclass=abc.ABCMeta): + """Generic material that we can sign.""" + + @classmethod + @abc.abstractmethod + def from_manifest(cls, manifest: manifest.Manifest) -> Self: + """Converts a manifest to the signing material used for signing. + + Args: + manifest: the manifest to convert to signing material for signing. + + Returns: + An instance of the class which can be passed to a `Signer` for + signing. Each `Signer` subclass accepts only some `SigningMaterial` + instances and will refuse to sign the others. + + Raises: + TypeError: If the provided manifest is not serializable to the + current `SigningMaterial`. This could happen on API mismatch or + lacking implementation for a new serialization format.
+ """ + pass + + +class Signature(metaclass=abc.ABCMeta): + """Generic signature support.""" + + @abc.abstractmethod + def write_signature(self, path: pathlib.Path) -> None: + """Writes the signature to disk, to the given path. + + Args: + path: the path to write the signature to. + """ + pass + + @classmethod + @abc.abstractmethod + def read_signature(cls, path: pathlib.Path) -> Self: + """Reads the signature from disk. + + Does not perform any verification, except what is needed to parse the + signature file. + + Args: + path: the path to read the signature from. + + Returns: + An instance of the class which can be passed to a `Verifier` for + signature and integrity verification. Each `Verifier` subclass + accepts only some `Signature` instances and may fail to verify the + others. + + Raises: + ValueError: If the provided path is not deserializable to the format + expected by the current `Signature` class. + """ + pass + + +class Signer(metaclass=abc.ABCMeta): + """Generic signer for `SigningMaterial` to produce a `Signature`. + + Each signer is opinionated on the format of the signature and on the + expected format for the signing materials. No signer is required to support + all possible `SigningMaterial` or `Signature` subclasses. + + Each signer can implement its own schema for managing the key material. + """ + + @abc.abstractmethod + def sign(self, signing_material: SigningMaterial) -> Signature: + """Signs the provided signing material to produce the signature. + + Args: + signing_material: the subjects to sign. + + Returns: + A valid signature, in a format that the `Signer` and the paired + `Verifier` can understand. + + Raises: + TypeError: If the signing material is not in a format that can be + signed with the current signer. + """ + pass + + +class Verifier(metaclass=abc.ABCMeta): + """Generic `Signature` verifier. + + Every `Verifier` is paired with a `Signer` instance. 
Both must support the + same `Signature` types and have similar key material management processes. + + A `Verifier` checks the `Signature`. If valid, the `Signature` format would + be expanded to a full `Manifest`, which is then used to check the model + integrity by comparing digests. + + A `Manifest` produced by a `Signature` must be accepted as a valid argument to + `SigningMaterial.from_manifest()` for every subclass of `SigningMaterial` + that is accepted by the `Signer` paired with the `Verifier`. + """ + + @abc.abstractmethod + def verify(self, signature: Signature) -> manifest.Manifest: + """Verifies the signature. + + Args: + signature: the signature to verify. + + Returns: + A `manifest.Manifest` instance that represents the model + serialization, as signed. This can be used to further verify the + integrity of a model. + + Raises: + ValueError: If the signature is invalid. + TypeError: If the signature is not in a format that can be verified + with the current verifier. + """ + pass diff --git a/model_signing/test_support.py b/model_signing/test_support.py index d41e98b5..c7ad7b1e 100644 --- a/model_signing/test_support.py +++ b/model_signing/test_support.py @@ -43,6 +43,16 @@ ] +# All directory models to use in testing, where only non empty directory models +# are supported. See also `all_test_models` comments. +all_non_empty_directory_test_models = [ + "sample_model_folder", + "deep_model_folder", + "model_folder_with_empty_file", + "symlink_model_folder", +] + + def get_first_directory(path: pathlib.Path) -> pathlib.Path: """Returns the first directory that is a children of path.