Skip to content

Commit

Permalink
Expose resource descriptors from manifests
Browse files Browse the repository at this point in the history
We aim for this to be similar to in-toto's `ResourceDescriptor`. To support cases where in-toto cannot be directly used, we make this a dataclass that can be mapped to in-toto when needed, and used on its own otherwise.

Not all fields from in-toto are specified at this moment. All fields here must be present, unlike in-toto, where all are optional.

See https://github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md for the in-toto specification.

This is the first separable PR for the signing support (see full draft on sigstore#253)

Signed-off-by: Mihai Maruseac <mihaimaruseac@google.com>
  • Loading branch information
mihaimaruseac committed Jul 24, 2024
1 parent c3c4110 commit 436be61
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 5 deletions.
64 changes: 62 additions & 2 deletions model_signing/manifest/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,44 @@
from collections.abc import Iterable
import dataclasses
import pathlib
from typing import Self
from typing import Iterator, Self
from typing_extensions import override

from model_signing.hashing import hashing


@dataclasses.dataclass(frozen=True)
class ResourceDescriptor:
    """A description of any content from any `Manifest`.

    We aim for this to be similar to in-toto's `ResourceDescriptor`. To
    support cases where in-toto cannot be directly used, we make this a
    dataclass that can be mapped to in-toto when needed, and used on its own
    otherwise.

    Not all fields from in-toto are specified at this moment. All fields here
    must be present, unlike in-toto, where all are optional.

    See github.com/in-toto/attestation/blob/main/spec/v1/resource_descriptor.md
    for the in-toto specification.

    Attributes:
        identifier: A string that uniquely identifies this
          `ResourceDescriptor`. Corresponds to `name`, `uri`, or `content` in
          the in-toto specification.
        digest: One digest for the item. Note that unlike in-toto, we only
          have one digest for the item and it is always required.
    """

    identifier: str
    digest: hashing.Digest


class Manifest(metaclass=abc.ABCMeta):
    """Generic manifest file to represent a model.

    Every concrete manifest exposes its contents as a stream of
    `ResourceDescriptor` objects through `resource_descriptors`.
    """

    @abc.abstractmethod
    def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
        """Yields each resource from the manifest, one by one."""


@dataclasses.dataclass(frozen=True)
Expand All @@ -72,6 +101,17 @@ class DigestManifest(Manifest):

digest: hashing.Digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields each resource from the manifest, one by one.

    In this case, we have only one descriptor to return. Since model paths
    are already encoded in the digest, use "." for the identifier. Subclasses
    might record additional fields to have distinguishable human readable
    identifiers.
    """
    yield ResourceDescriptor(identifier=".", digest=self.digest)


class ItemizedManifest(Manifest):
"""A detailed manifest, recording integrity of every model component."""
Expand Down Expand Up @@ -130,6 +170,15 @@ def __init__(self, items: Iterable[FileManifestItem]):
def __eq__(self, other: Self):
    """Compares two manifests by their item-to-digest mappings.

    Args:
        other: The object to compare against.

    Returns:
        Whether both manifests record the same digest for the same items, or
        `NotImplemented` when `other` is not a manifest of a compatible type.
    """
    if not isinstance(other, type(self)):
        # Defer to Python's reflected comparison instead of failing with an
        # `AttributeError` on objects that have no `_item_to_digest`.
        return NotImplemented
    return self._item_to_digest == other._item_to_digest

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields each resource from the manifest, one by one.

    Descriptors are produced in the sorted order of the recorded items, i.e.,
    alphabetically by path.
    """
    ordered_entries = sorted(self._item_to_digest.items())
    yield from (
        ResourceDescriptor(identifier=str(path), digest=path_digest)
        for path, path_digest in ordered_entries
    )


@dataclasses.dataclass(frozen=True, order=True)
class Shard:
Expand Down Expand Up @@ -200,3 +249,14 @@ def __init__(self, items: Iterable[ShardedFileManifestItem]):
efficient updates and retrieval of digests.
"""
self._item_to_digest = {item.input_tuple: item.digest for item in items}

@override
def resource_descriptors(self) -> Iterator[ResourceDescriptor]:
    """Yields each resource from the manifest, one by one.

    Descriptors follow the ordering of the `input_tuple` property of the
    `ShardedFileManifestItem` objects used to create this instance (the
    triple of file name and shard endpoints).
    """
    ordered_entries = sorted(self._item_to_digest.items())
    for shard, shard_digest in ordered_entries:
        descriptor = ResourceDescriptor(
            identifier=str(shard), digest=shard_digest
        )
        yield descriptor
100 changes: 100 additions & 0 deletions model_signing/manifest/manifest_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,31 @@
# limitations under the License.

import pathlib
import pytest

from model_signing.hashing import hashing
from model_signing.manifest import manifest


class TestDigestManifest:
    """Checks the resource descriptors exposed by `DigestManifest`."""

    def test_manifest_has_just_one_resource_descriptor(self):
        """A digest-only manifest yields exactly one descriptor."""
        model_digest = hashing.Digest("test", b"test_digest")
        digest_manifest = manifest.DigestManifest(model_digest)

        found = list(digest_manifest.resource_descriptors())

        assert len(found) == 1

    def test_manifest_has_the_correct_resource_descriptor(self):
        """Each descriptor is identified by "." and carries the digest."""
        model_digest = hashing.Digest("test", b"test_digest")
        digest_manifest = manifest.DigestManifest(model_digest)

        for found in digest_manifest.resource_descriptors():
            assert found.identifier == "."
            assert found.digest == model_digest


class TestFileLevelManifest:

def test_insert_order_does_not_matter(self):
Expand All @@ -34,6 +54,39 @@ def test_insert_order_does_not_matter(self):

assert manifest1 == manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """The manifest yields one descriptor per file item it was built from."""
    items: list[manifest.FileManifestItem] = [
        manifest.FileManifestItem(
            path=pathlib.PurePath(f"file{i}"),
            # Bytes literals cannot be f-strings, so format the text first and
            # encode it; this gives every item a distinct digest value.
            digest=hashing.Digest("test", f"hash{i}".encode()),
        )
        for i in range(num_items)
    ]
    manifest_file = manifest.FileLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors come back ordered by path, each with its own digest."""
    first_item = manifest.FileManifestItem(
        path=pathlib.PurePath("file1"),
        digest=hashing.Digest("test", b"hash1"),
    )
    second_item = manifest.FileManifestItem(
        path=pathlib.PurePath("file2"),
        digest=hashing.Digest("test", b"hash2"),
    )

    # Insert the items in reverse order on purpose.
    manifest_file = manifest.FileLevelManifest([second_item, first_item])
    descriptors = list(manifest_file.resource_descriptors())

    # The descriptors must nevertheless be ordered by file.
    expected = [("file1", b"hash1"), ("file2", b"hash2")]
    for position, (identifier, _) in enumerate(expected):
        assert descriptors[position].identifier == identifier
    for position, (_, digest_value) in enumerate(expected):
        assert descriptors[position].digest.digest_value == digest_value


class TestShardLevelManifest:

Expand Down Expand Up @@ -70,3 +123,50 @@ def test_same_path_different_shards_gives_different_manifest(self):
manifest2 = manifest.ShardLevelManifest([item])

assert manifest1 != manifest2

@pytest.mark.parametrize("num_items", [1, 3, 5])
def test_manifest_has_all_resource_descriptors(self, num_items):
    """The manifest yields one descriptor per shard it was built from."""
    path = pathlib.PurePath("file")
    items: list[manifest.ShardedFileManifestItem] = [
        manifest.ShardedFileManifestItem(
            path=path,
            # Bytes literals cannot be f-strings, so format the text first and
            # encode it; this gives every shard a distinct digest value.
            digest=hashing.Digest("test", f"hash{i}".encode()),
            start=i,
            end=i + 2,
        )
        for i in range(num_items)
    ]
    manifest_file = manifest.ShardLevelManifest(items)

    descriptors = list(manifest_file.resource_descriptors())

    assert len(descriptors) == num_items

def test_manifest_has_the_correct_resource_descriptors(self):
    """Descriptors come back ordered by file, then by shard endpoints."""
    first_path = pathlib.PurePath("file1")
    second_path = pathlib.PurePath("file2")

    first_file_first_shard = manifest.ShardedFileManifestItem(
        path=first_path,
        digest=hashing.Digest("test", b"hash1"),
        start=0,
        end=4,
    )
    second_file_first_shard = manifest.ShardedFileManifestItem(
        path=second_path,
        digest=hashing.Digest("test", b"hash2"),
        start=0,
        end=4,
    )
    # Same file as the first item, but its second shard.
    first_file_second_shard = manifest.ShardedFileManifestItem(
        path=first_path,
        digest=hashing.Digest("test", b"hash3"),
        start=4,
        end=8,
    )

    # Insert the items in reverse order on purpose.
    manifest_file = manifest.ShardLevelManifest(
        [first_file_second_shard, second_file_first_shard, first_file_first_shard]
    )
    descriptors = list(manifest_file.resource_descriptors())

    # The descriptors must nevertheless be ordered by file shard.
    expected = [
        ("file1:0:4", b"hash1"),
        ("file1:4:8", b"hash3"),
        ("file2:0:4", b"hash2"),
    ]
    for position, (identifier, _) in enumerate(expected):
        assert descriptors[position].identifier == identifier
    for position, (_, digest_value) in enumerate(expected):
        assert descriptors[position].digest.digest_value == digest_value
3 changes: 1 addition & 2 deletions model_signing/serialization/serialize_by_file_shard_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
import pathlib
import pytest

from model_signing import test_support
from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.manifest import manifest
from model_signing.serialization import serialize_by_file_shard
from model_signing import test_support


class TestDigestSerializer:
Expand Down Expand Up @@ -654,7 +654,6 @@ def test_max_workers_does_not_change_digest(self, sample_model_folder):
assert manifest1 == manifest2
assert manifest1 == manifest3


def test_shard_to_string(self):
"""Ensure the shard's `__str__` method behaves as assumed."""
shard = manifest.Shard(pathlib.PurePosixPath("a"), 0, 42)
Expand Down
2 changes: 1 addition & 1 deletion model_signing/serialization/serialize_by_file_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
import pathlib
import pytest

from model_signing import test_support
from model_signing.hashing import file
from model_signing.hashing import memory
from model_signing.manifest import manifest
from model_signing.serialization import serialize_by_file
from model_signing import test_support


class TestDigestSerializer:
Expand Down

0 comments on commit 436be61

Please sign in to comment.