From 74dedf98efb5ee3aac8c8f07851e6f0b147fbe57 Mon Sep 17 00:00:00 2001
From: Mihai Maruseac
Date: Wed, 11 Sep 2024 15:46:21 -0400
Subject: [PATCH] Add scripts for benchmarks using the current API (#306)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add environment for running and generating benchmarks

Signed-off-by: Mihai Maruseac

* Add generator for models

Signed-off-by: Mihai Maruseac

* Write in chunks

Signed-off-by: Mihai Maruseac

* Add matrix, expand description

Signed-off-by: Mihai Maruseac

* Add script for serialization benchmark

Signed-off-by: Mihai Maruseac

* Proper capitalization of help messages

Signed-off-by: Mihai Maruseac

* Add benchmark runner

Signed-off-by: Mihai Maruseac

* Use numpy to generate random data.

We go from

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000"
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     10.290 s ±  0.140 s    [User: 10.197 s, System: 0.092 s]
  Range (min … max):   10.149 s … 10.541 s    10 runs
```

to

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000" --show-output
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     381.1 ms ±  13.9 ms    [User: 512.9 ms, System: 633.1 ms]
  Range (min … max):   365.5 ms … 412.1 ms    10 runs
```

Signed-off-by: Mihai Maruseac

* Fix typos

Signed-off-by: Mihai Maruseac

* Document all functions

Signed-off-by: Mihai Maruseac

* Handle review

Signed-off-by: Mihai Maruseac

* Handle review

Signed-off-by: Mihai Maruseac

* Use id but with comment

Signed-off-by: Mihai Maruseac

* Undo de-indent added by editor

Signed-off-by: Mihai Maruseac

---------

Signed-off-by: Mihai Maruseac
---
 benchmarks/generate.py  | 227 ++++++++++++++++++++++++++++++++++++
 benchmarks/serialize.py | 247 ++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |  18 ++-
 3 files changed, 491 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/generate.py
 create mode 100644 benchmarks/serialize.py

diff --git a/benchmarks/generate.py b/benchmarks/generate.py
new file mode 100644
index 00000000..043bfd5f
--- /dev/null
+++ b/benchmarks/generate.py
@@ -0,0 +1,227 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script for generating benchmark data."""
+
+import argparse
+import itertools
+import pathlib
+
+import numpy as np
+
+
+def create_file_of_given_size(path: str, size: int) -> None:
+    """Writes a random file at the given path with given size.
+
+    Args:
+        path: Path to a file to write to. Parents are created if needed.
+        size: Number of bytes to generate and write to file.
+    """
+    file_path = pathlib.Path(path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    chunk_size = 8192
+    num_chunks = size // chunk_size
+
+    with file_path.open("wb") as f:
+        for _ in range(num_chunks):
+            s = np.random.randint(0, 256, chunk_size, dtype=np.uint8).tobytes()
+            f.write(s)
+
+        if size % chunk_size != 0:
+            chunk_size = size % chunk_size
+            s = np.random.randint(0, 256, chunk_size, dtype=np.uint8).tobytes()
+            f.write(s)
+
+
+def generate_file_sizes(
+    total_size: int, count: int, weights: list[int] | None = None
+) -> list[int]:
+    """Generates file sizes by splitting a total size into multiple files.
+
+    If weights is missing (or made of equal elements), the resulting files have
+    equal sizes. Otherwise, the sizes are proportional to the weights.
+
+    The weights are used in a cycle until all files are accounted for.
+
+    Args:
+        total_size: Total size to split into files.
+        count: Number of files to generate.
+        weights: Optional weights to use when splitting.
+
+    Returns:
+        The list of file sizes to generate.
+    """
+    if weights is None:
+        weights = [1]
+
+    weights = list(itertools.islice(itertools.cycle(weights), count))
+    total_weight = sum(weights)
+    file_sizes = [int(total_size * w / total_weight) for w in weights]
+    file_sizes[-1] = total_size - sum(file_sizes[:-1])
+    return file_sizes
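+
+
+# A worked example of the cycling logic above, with hypothetical numbers:
+# generate_file_sizes(100, 4, [1, 3]) cycles the weights to [1, 3, 1, 3]
+# (total weight 8) and yields [12, 37, 12, 39]; the last file absorbs the
+# rounding remainder so that the sizes always sum to total_size.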
+
+
+def generate_file(args: argparse.Namespace):
+    """Generates a random model as a single file.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    create_file_of_given_size(args.root, args.size)
+
+
+def generate_dir(args: argparse.Namespace):
+    """Generates a random model as N files in a directory.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    for i, sz in enumerate(generate_file_sizes(args.size, args.n, args.w)):
+        create_file_of_given_size(f"{args.root}/f{i}", sz)
+
+
+def generate_matrix(args: argparse.Namespace):
+    """Generates a random model as M directories with N files each.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    sizes = generate_file_sizes(args.size // args.m, args.n, args.w)
+    exact = args.size % args.m == 0
+    last = args.m if exact else (args.m - 1)
+
+    for i in range(last):
+        for j, sz in enumerate(sizes):
+            create_file_of_given_size(f"{args.root}/d{i}/f{j}", sz)
+
+    if not exact:
+        leftover = (args.size // args.m) + (args.size % args.m)
+        i = i + 1
+        for j, sz in enumerate(generate_file_sizes(leftover, args.n, args.w)):
+            create_file_of_given_size(f"{args.root}/d{i}/f{j}", sz)
+
+
+def generate_nested(args: argparse.Namespace):
+    """Generates a random model as N files in a directory with M ancestors.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    path = args.root
+    for i in range(args.m):
+        path = f"{path}/d{i}"
+
+    for j, sz in enumerate(generate_file_sizes(args.size, args.n, args.w)):
+        create_file_of_given_size(f"{path}/f{j}", sz)
+
+
+def add_size_arguments(
+    parser: argparse.ArgumentParser, multiple_files: bool = True
+) -> None:
+    """Adds the size related arguments to a subparser.
+
+    We need to pass in the size of the model to generate. If the model has
+    multiple files, we support an additional repeated argument to specify
+    what sizes these files should have (instead of being all equal).
+
+    Args:
+        parser: The parser to enhance.
+        multiple_files: Whether the generator generates multiple files.
+    """
+    parser.add_argument("size", help="size of the model", type=int)
+
+    if multiple_files:
+        parser.add_argument(
+            "-w",
+            help="optional weights for model file sizes to generate",
+            nargs="+",
+            type=int,
+        )
+
+
+def add_count_arguments(
+    parser: argparse.ArgumentParser, with_dirs: bool = True
+) -> None:
+    """Adds the count related arguments to a subparser.
+
+    We have N files. In some cases, we also have M directories.
+
+    Args:
+        parser: The parser to enhance.
+        with_dirs: Also add argument to generate the directories.
+    """
+    parser.add_argument("-n", help="number of files", type=int, required=True)
+
+    if with_dirs:
+        parser.add_argument(
+            "-m", help="number of directories", type=int, required=True
+        )
+
+
+def add_root_argument(parser: argparse.ArgumentParser) -> None:
+    """Adds the argument for the name of the root of the model.
+
+    Args:
+        parser: The parser to enhance.
+    """
+    parser.add_argument("--root", help="model root path", required=True)
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Builds the command line parser for the generator."""
+    parser = argparse.ArgumentParser(
+        description="generate benchmark data for model signing"
+    )
+    parser.set_defaults(func=generate_file)
+    subparsers = parser.add_subparsers(title="Model shapes")
+
+    parser_file = subparsers.add_parser(
+        "file", help="generate all data in a single file (default)"
+    )
+    add_root_argument(parser_file)
+    add_size_arguments(parser_file, multiple_files=False)
+    parser_file.set_defaults(func=generate_file)
+
+    parser_dir = subparsers.add_parser(
+        "dir", help="generate data split into N files in a single directory"
+    )
+    add_root_argument(parser_dir)
+    add_size_arguments(parser_dir)
+    add_count_arguments(parser_dir, with_dirs=False)
+    parser_dir.set_defaults(func=generate_dir)
+
+    parser_matrix = subparsers.add_parser(
+        "matrix", help="generate data split into N files in M directories"
+    )
+    add_root_argument(parser_matrix)
+    add_size_arguments(parser_matrix)
+    add_count_arguments(parser_matrix)
+    parser_matrix.set_defaults(func=generate_matrix)
+
+    parser_nested = subparsers.add_parser(
+        "nested",
+        help="generate data split into N files in a directory nested M levels",
+    )
+    add_root_argument(parser_nested)
+    add_size_arguments(parser_nested)
+    add_count_arguments(parser_nested)
+    parser_nested.set_defaults(func=generate_nested)
+
+    return parser
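+
+
+# Example invocations (paths and sizes are hypothetical, except the `file`
+# one, which matches the benchmark runs quoted in the commit message):
+#
+#   python benchmarks/generate.py file --root /tmp/file 100000000
+#   python benchmarks/generate.py dir --root /tmp/dir 100000000 -n 100
+#   python benchmarks/generate.py matrix --root /tmp/matrix 100000000 -n 10 -m 10
+#   python benchmarks/generate.py nested --root /tmp/nested 100000000 -n 10 -m 5 -w 1 3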
+
+
+if __name__ == "__main__":
+    np.random.seed(42)
+    args = build_parser().parse_args()
+    args.func(args)
diff --git a/benchmarks/serialize.py b/benchmarks/serialize.py
new file mode 100644
index 00000000..165fefa8
--- /dev/null
+++ b/benchmarks/serialize.py
@@ -0,0 +1,247 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script for benchmarking full model serialization."""
+
+import argparse
+from collections.abc import Callable
+import pathlib
+
+from model_signing.hashing import file
+from model_signing.hashing import hashing
+from model_signing.hashing import memory
+from model_signing.serialization import serialize_by_file
+from model_signing.serialization import serialize_by_file_shard
+from model_signing.signing import in_toto
+
+
+def get_hash_engine_factory(
+    hash_algorithm: str,
+) -> type[hashing.StreamingHashEngine]:
+    """Returns the class that implements a hashing method.
+
+    Args:
+        hash_algorithm: the hash algorithm to use.
+
+    Returns:
+        The class that corresponds to the algorithm.
+
+    Raises:
+        ValueError: if the algorithm is not implemented/not valid.
+    """
+    match hash_algorithm:
+        case "sha256":
+            return memory.SHA256
+        case "blake2":
+            return memory.BLAKE2
+
+    raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")
+
+
+def get_sharded_file_hasher_factory(
+    hash_algorithm: str, chunk_size: int, shard_size: int
+) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
+    """Returns a hasher factory for sharded serialization.
+
+    Args:
+        hash_algorithm: the hash algorithm to use for each shard.
+        chunk_size: the chunk size to use when reading shards.
+        shard_size: the shard size used in generating the shards.
+
+    Returns:
+        A callable for the hashing factory.
+    """
+    hash_engine = get_hash_engine_factory(hash_algorithm)
+
+    def _hasher_factory(
+        path: pathlib.Path, start: int, end: int
+    ) -> file.ShardedFileHasher:
+        return file.ShardedFileHasher(
+            path,
+            hash_engine(),  # pytype: disable=not-instantiable
+            start=start,
+            end=end,
+            chunk_size=chunk_size,
+            shard_size=shard_size,
+        )
+
+    return _hasher_factory
+
+
+def get_file_hasher_factory(
+    hash_algorithm: str, chunk_size: int
+) -> Callable[[pathlib.Path], file.FileHasher]:
+    """Returns a hasher factory for file serialization.
+
+    Args:
+        hash_algorithm: the hash algorithm to use for each file.
+        chunk_size: the chunk size to use when reading files.
+
+    Returns:
+        A callable for the hashing factory.
+    """
+    hash_engine = get_hash_engine_factory(hash_algorithm)
+
+    def _hasher_factory(path: pathlib.Path) -> file.FileHasher:
+        return file.SimpleFileHasher(
+            path,
+            hash_engine(),  # pytype: disable=not-instantiable
+            chunk_size=chunk_size,
+        )
+
+    return _hasher_factory
+
+
+def run(args: argparse.Namespace) -> None:
+    """Performs the benchmark.
+
+    Args:
+        args: The arguments specifying the benchmark scenario.
+    """
+    # 1. Hashing layer
+    if args.use_shards:
+        hasher = get_sharded_file_hasher_factory(
+            args.hash_method, args.chunk, args.shard
+        )
+    else:
+        hasher = get_file_hasher_factory(args.hash_method, args.chunk)
+
+    # 2. Serialization layer
+    if args.skip_manifest or args.single_digest:
+        merge_hasher_factory = get_hash_engine_factory(args.merge_hasher)
+        if args.use_shards:
+            serializer = serialize_by_file_shard.DigestSerializer(
+                hasher,
+                merge_hasher_factory(),  # pytype: disable=not-instantiable
+                max_workers=args.max_workers,
+            )
+        else:
+            # This gets complicated because the API here does not match the
+            # rest. We should fix this.
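+            # (Compare with the sharded branch above, which passes the hasher
+            # factory and an instantiated merge engine. This API instead wants
+            # an already-built hasher, constructed here with a placeholder
+            # path, plus the merge engine class itself. It also supports only
+            # a single worker, hence the check below.)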
+            if args.max_workers is not None and args.max_workers != 1:
+                raise ValueError("Currently, only 1 worker is supported here")
+            serializer = serialize_by_file.DigestSerializer(
+                # pytype: disable=wrong-arg-count
+                hasher(pathlib.Path("unused")),
+                # pytype: enable=wrong-arg-count
+                merge_hasher_factory,
+            )
+    else:
+        if args.use_shards:
+            serializer_factory = serialize_by_file_shard.ManifestSerializer
+        else:
+            serializer_factory = serialize_by_file.ManifestSerializer
+
+        serializer = serializer_factory(hasher, max_workers=args.max_workers)
+
+    # 3. Signing layer
+    if args.skip_manifest:
+        in_toto_builder = id  # Do nothing, just evaluate the argument
+    else:
+        if args.single_digest:
+            in_toto_builder = in_toto.SingleDigestIntotoPayload
+        else:
+            match (args.digest_of_digests, args.use_shards):
+                case (True, True):
+                    in_toto_builder = in_toto.DigestOfShardDigestsIntotoPayload
+                case (True, False):
+                    in_toto_builder = in_toto.DigestOfDigestsIntotoPayload
+                case (False, True):
+                    in_toto_builder = in_toto.ShardDigestsIntotoPayload
+                case (False, False):
+                    in_toto_builder = in_toto.DigestsIntotoPayload
+
+        in_toto_builder = in_toto_builder.from_manifest
+
+    # Put everything together
+    if not args.dry_run:
+        in_toto_builder(serializer.serialize(args.path))
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Builds the command line parser for the benchmark runner."""
+    parser = argparse.ArgumentParser(
+        description="Benchmark full serialization of a model for model signing"
+    )
+
+    parser.add_argument("path", help="path to model", type=pathlib.Path)
+    parser.add_argument(
+        "--dry_run", help="don't run anything", action="store_true"
+    )
+    parser.add_argument(
+        "--hash_method",
+        help="hash method to use (default: sha256)",
+        choices=["sha256", "blake2"],
+        default="sha256",
+    )
+    parser.add_argument(
+        "--max_workers", help="number of parallel workers to use", type=int
+    )
+
+    param_groups = parser.add_argument_group("Internal parameters to fine-tune")
+    param_groups.add_argument(
+        "--chunk", help="chunk size (default: 8192)", type=int, default=8192
+    )
+    param_groups.add_argument(
+        "--shard",
+        help="shard size (default: 1000000)",
+        type=int,
+        default=1000000,
+    )
+
+    shard_group = parser.add_argument_group("Serialization modes")
+    shard_group.add_argument(
+        "--use_shards", help="serialize by shards", action="store_true"
+    )
+    shard_group.add_argument(
+        "--skip_manifest",
+        help="serialize to a single digest, skip manifest creation",
+        action="store_true",
+    )
+    shard_group.add_argument(
+        "--merge_hasher",
+        help="hasher to use to merge individual hashes "
+        "when skipping manifest creation (default: sha256)",
+        choices=["sha256", "blake2"],
+        default="sha256",
+    )
+
+    intoto_group = parser.add_argument_group(
+        "Manifest to in-toto serialization formats"
+    )
+    intoto_group.add_argument(
+        "--single_digest",
+        help="serialize to a single digest, use manifest with one entry",
+        action="store_true",
+    )
+    intoto_group.add_argument(
+        "--digest_of_digests",
+        help="generate an in-toto statement with a single subject",
+        action="store_true",
+    )
+
+    return parser
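+
+
+# Example invocations against a previously generated model (paths are
+# hypothetical; the hyperfine pattern follows the commit message):
+#
+#   python benchmarks/serialize.py /tmp/file
+#   python benchmarks/serialize.py /tmp/matrix --use_shards --max_workers 4
+#   python benchmarks/serialize.py /tmp/file --skip_manifest --merge_hasher blake2
+#   hyperfine -w 3 "python benchmarks/serialize.py /tmp/file"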
+
+
+if __name__ == "__main__":
+    args = build_parser().parse_args()
+    if args.skip_manifest and (args.single_digest or args.digest_of_digests):
+        raise ValueError(
+            "Cannot combine --skip_manifest with manifest to in-toto options"
+        )
+    if args.single_digest and args.digest_of_digests:
+        raise ValueError(
+            "At most one of --single_digest and --digest_of_digests can be used"
+        )
+    run(args)
diff --git a/pyproject.toml b/pyproject.toml
index 443031f4..b2c772b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,6 +64,22 @@ randomize = true
 [[tool.hatch.envs.hatch-test.matrix]]
 python = ["3.10", "3.11", "3.12"]
 
+[tool.hatch.envs.bench]
+description = """Custom environment for running benchmarks.
+Use `hatch run +py=3... bench:generate ${args}` to generate test models.
+Use `hatch run +py=3... bench:serialize ${args}` to benchmark serialization code.
+"""
+extra-dependencies = [
+  "numpy",
+]
+
+[[tool.hatch.envs.bench.matrix]]
+python = ["3.10", "3.11", "3.12"]
+
+[tool.hatch.envs.bench.scripts]
+generate = "python benchmarks/generate.py {args}"
+serialize = "python benchmarks/serialize.py {args}"
+
 [tool.hatch.envs.docs]
 description = """Custom environment for pdoc.
 Use `hatch run docs:serve` to view documentation.
@@ -88,7 +104,7 @@ installer = "pip"
 python = "3.11"
 
 [tool.hatch.envs.type.scripts]
-check = "pytype -k -j auto src tests"
+check = "pytype -k -j auto src tests benchmarks"
 
 [tool.coverage.report]
 exclude_also = [