From 74dedf98efb5ee3aac8c8f07851e6f0b147fbe57 Mon Sep 17 00:00:00 2001
From: Mihai Maruseac
Date: Wed, 11 Sep 2024 15:46:21 -0400
Subject: [PATCH] Add scripts for benchmarks using the current API (#306)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add environment for running and generating benchmarks

Signed-off-by: Mihai Maruseac

* Add generator for models

Signed-off-by: Mihai Maruseac

* Write in chunks

Signed-off-by: Mihai Maruseac

* Add matrix, expand description

Signed-off-by: Mihai Maruseac

* Add script for serialization benchmark

Signed-off-by: Mihai Maruseac

* Proper capitalization of help messages

Signed-off-by: Mihai Maruseac

* Add benchmark runner

Signed-off-by: Mihai Maruseac

* Use numpy to generate random data.

We go from

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000"
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     10.290 s ±  0.140 s    [User: 10.197 s, System: 0.092 s]
  Range (min … max):   10.149 s … 10.541 s    10 runs
```

to

```
[...]$ hyperfine -w 3 "python benchmarks/generate.py file --root /tmp/file 100000000" --show-output
Benchmark 1: python benchmarks/generate.py file --root /tmp/file 100000000
  Time (mean ± σ):     381.1 ms ±  13.9 ms    [User: 512.9 ms, System: 633.1 ms]
  Range (min … max):   365.5 ms … 412.1 ms    10 runs
```

Signed-off-by: Mihai Maruseac

* Fix typos

Signed-off-by: Mihai Maruseac

* Document all functions

Signed-off-by: Mihai Maruseac

* Handle review

Signed-off-by: Mihai Maruseac

* Handle review

Signed-off-by: Mihai Maruseac

* Use id but with comment

Signed-off-by: Mihai Maruseac

* Undo de-indent added by editor

Signed-off-by: Mihai Maruseac

---------

Signed-off-by: Mihai Maruseac
---
 benchmarks/generate.py  | 227 ++++++++++++++++++++++++++++++++++++
 benchmarks/serialize.py | 247 ++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |  18 ++-
 3 files changed, 491 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/generate.py
 create mode 100644 benchmarks/serialize.py

diff --git a/benchmarks/generate.py b/benchmarks/generate.py
new file mode 100644
index 00000000..043bfd5f
--- /dev/null
+++ b/benchmarks/generate.py
@@ -0,0 +1,227 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script for generating benchmark data."""
+
+import argparse
+import itertools
+import pathlib
+
+import numpy as np
+
+
+def create_file_of_given_size(path: str, size: int) -> None:
+    """Writes a random file at the given path with given size.
+
+    Args:
+        path: Path to a file to write to. Parents are created if needed.
+        size: Number of bytes to generate and write to file.
+    """
+    file_path = pathlib.Path(path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    chunk_size = 8192
+    num_chunks = size // chunk_size
+
+    with file_path.open("wb") as f:
+        for _ in range(num_chunks):
+            s = np.random.randint(0, 256, chunk_size, dtype=np.uint8).tobytes()
+            f.write(s)
+
+        if size % chunk_size != 0:
+            chunk_size = size % chunk_size
+            s = np.random.randint(0, 256, chunk_size, dtype=np.uint8).tobytes()
+            f.write(s)
+
+
+def generate_file_sizes(
+    total_size: int, count: int, weights: list[int] | None = None
+) -> list[int]:
+    """Generates file sizes by splitting a total size into multiple files.
+
+    If weights is missing (or made of equal elements), the resulting files have
+    equal sizes. Otherwise, the sizes are proportional to the weights.
+
+    The weights are used in a cycle until all files are accounted for.
+
+    Args:
+        total_size: Total size to split into files.
+        count: Number of files to generate.
+        weights: Optional weights to use when splitting.
+
+    Returns:
+        The list of file sizes to generate.
+    """
+    if weights is None:
+        weights = [1]
+
+    weights = list(itertools.islice(itertools.cycle(weights), count))
+    total_weight = sum(weights)
+    file_sizes = [int(total_size * w / total_weight) for w in weights]
+    file_sizes[-1] = total_size - sum(file_sizes[:-1])
+    return file_sizes
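+
+
+# A worked example of the cycling logic above, with hypothetical numbers:
+# generate_file_sizes(100, 4, [1, 3]) cycles the weights to [1, 3, 1, 3]
+# (total weight 8) and yields [12, 37, 12, 39]; the last file absorbs the
+# rounding remainder so that the sizes always sum to total_size.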
+
+
+def generate_file(args: argparse.Namespace):
+    """Generates a random model as a single file.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    create_file_of_given_size(args.root, args.size)
+
+
+def generate_dir(args: argparse.Namespace):
+    """Generates a random model as N files in a directory.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    for i, sz in enumerate(generate_file_sizes(args.size, args.n, args.w)):
+        create_file_of_given_size(f"{args.root}/f{i}", sz)
+
+
+def generate_matrix(args: argparse.Namespace):
+    """Generates a random model as M directories with N files each.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    sizes = generate_file_sizes(args.size // args.m, args.n, args.w)
+    exact = args.size % args.m == 0
+    last = args.m if exact else (args.m - 1)
+
+    for i in range(last):
+        for j, sz in enumerate(sizes):
+            create_file_of_given_size(f"{args.root}/d{i}/f{j}", sz)
+
+    if not exact:
+        leftover = (args.size // args.m) + (args.size % args.m)
+        i = i + 1
+        for j, sz in enumerate(generate_file_sizes(leftover, args.n, args.w)):
+            create_file_of_given_size(f"{args.root}/d{i}/f{j}", sz)
+
+
+def generate_nested(args: argparse.Namespace):
+    """Generates a random model as N files in a directory with M ancestors.
+
+    Args:
+        args: The arguments specifying the request.
+    """
+    path = args.root
+    for i in range(args.m):
+        path = f"{path}/d{i}"
+
+    for j, sz in enumerate(generate_file_sizes(args.size, args.n, args.w)):
+        create_file_of_given_size(f"{path}/f{j}", sz)
+
+
+def add_size_arguments(
+    parser: argparse.ArgumentParser, multiple_files: bool = True
+) -> None:
+    """Adds the size related arguments to a subparser.
+
+    We need to pass in the size of the model to generate. If the model has
+    multiple files, we support an additional repeated argument to specify
+    what sizes these files should have (instead of being all equal).
+
+    Args:
+        parser: The parser to enhance.
+        multiple_files: Whether the generator generates multiple files.
+    """
+    parser.add_argument("size", help="size of the model", type=int)
+
+    if multiple_files:
+        parser.add_argument(
+            "-w",
+            help="optional weights for model file sizes to generate",
+            nargs="+",
+            type=int,
+        )
+
+
+def add_count_arguments(
+    parser: argparse.ArgumentParser, with_dirs: bool = True
+) -> None:
+    """Adds the count related arguments to a subparser.
+
+    We have N files. In some cases, we also have M directories.
+
+    Args:
+        parser: The parser to enhance.
+        with_dirs: Also add argument to generate the directories.
+    """
+    parser.add_argument("-n", help="number of files", type=int, required=True)
+
+    if with_dirs:
+        parser.add_argument(
+            "-m", help="number of directories", type=int, required=True
+        )
+
+
+def add_root_argument(parser: argparse.ArgumentParser) -> None:
+    """Adds the argument for the name of the root of the model.
+
+    Args:
+        parser: The parser to enhance.
+    """
+    parser.add_argument("--root", help="model root path", required=True)
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Builds the command line parser for the generator."""
+    parser = argparse.ArgumentParser(
+        description="generate benchmark data for model signing"
+    )
+    parser.set_defaults(func=generate_file)
+    subparsers = parser.add_subparsers(title="Model shapes")
+
+    parser_file = subparsers.add_parser(
+        "file", help="generate all data in a single file (default)"
+    )
+    add_root_argument(parser_file)
+    add_size_arguments(parser_file, multiple_files=False)
+    parser_file.set_defaults(func=generate_file)
+
+    parser_dir = subparsers.add_parser(
+        "dir", help="generate data split into N files in a single directory"
+    )
+    add_root_argument(parser_dir)
+    add_size_arguments(parser_dir)
+    add_count_arguments(parser_dir, with_dirs=False)
+    parser_dir.set_defaults(func=generate_dir)
+
+    parser_matrix = subparsers.add_parser(
+        "matrix", help="generate data split into N files in M directories"
+    )
+    add_root_argument(parser_matrix)
+    add_size_arguments(parser_matrix)
+    add_count_arguments(parser_matrix)
+    parser_matrix.set_defaults(func=generate_matrix)
+
+    parser_nested = subparsers.add_parser(
+        "nested",
+        help="generate data split into N files in a directory nested M levels",
+    )
+    add_root_argument(parser_nested)
+    add_size_arguments(parser_nested)
+    add_count_arguments(parser_nested)
+    parser_nested.set_defaults(func=generate_nested)
+
+    return parser
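+
+
+# Example invocations (paths and sizes are hypothetical, except the `file`
+# one, which matches the benchmark runs quoted in the commit message):
+#
+#   python benchmarks/generate.py file --root /tmp/file 100000000
+#   python benchmarks/generate.py dir --root /tmp/dir 100000000 -n 100
+#   python benchmarks/generate.py matrix --root /tmp/matrix 100000000 -n 10 -m 10
+#   python benchmarks/generate.py nested --root /tmp/nested 100000000 -n 10 -m 5 -w 1 3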
+
+
+if __name__ == "__main__":
+    np.random.seed(42)
+    args = build_parser().parse_args()
+    args.func(args)
diff --git a/benchmarks/serialize.py b/benchmarks/serialize.py
new file mode 100644
index 00000000..165fefa8
--- /dev/null
+++ b/benchmarks/serialize.py
@@ -0,0 +1,247 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script for benchmarking full model serialization."""
+
+import argparse
+from collections.abc import Callable
+import pathlib
+
+from model_signing.hashing import file
+from model_signing.hashing import hashing
+from model_signing.hashing import memory
+from model_signing.serialization import serialize_by_file
+from model_signing.serialization import serialize_by_file_shard
+from model_signing.signing import in_toto
+
+
+def get_hash_engine_factory(
+    hash_algorithm: str,
+) -> type[hashing.StreamingHashEngine]:
+    """Returns the class that implements a hashing method.
+
+    Args:
+        hash_algorithm: the hash algorithm to use.
+
+    Returns:
+        The class that corresponds to the algorithm.
+
+    Raises:
+        ValueError: if the algorithm is not implemented/not valid.
+    """
+    match hash_algorithm:
+        case "sha256":
+            return memory.SHA256
+        case "blake2":
+            return memory.BLAKE2
+
+    raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")
+
+
+def get_sharded_file_hasher_factory(
+    hash_algorithm: str, chunk_size: int, shard_size: int
+) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]:
+    """Returns a hasher factory for sharded serialization.
+
+    Args:
+        hash_algorithm: the hash algorithm to use for each shard.
+        chunk_size: the chunk size to use when reading shards.
+        shard_size: the shard size used in generating the shards.
+
+    Returns:
+        A callable for the hashing factory.
+    """
+    hash_engine = get_hash_engine_factory(hash_algorithm)
+
+    def _hasher_factory(
+        path: pathlib.Path, start: int, end: int
+    ) -> file.ShardedFileHasher:
+        return file.ShardedFileHasher(
+            path,
+            hash_engine(),  # pytype: disable=not-instantiable
+            start=start,
+            end=end,
+            chunk_size=chunk_size,
+            shard_size=shard_size,
+        )
+
+    return _hasher_factory
+
+
+def get_file_hasher_factory(
+    hash_algorithm: str, chunk_size: int
+) -> Callable[[pathlib.Path], file.FileHasher]:
+    """Returns a hasher factory for file serialization.
+
+    Args:
+        hash_algorithm: the hash algorithm to use for each file.
+        chunk_size: the chunk size to use when reading files.
+
+    Returns:
+        A callable for the hashing factory.
+    """
+    hash_engine = get_hash_engine_factory(hash_algorithm)
+
+    def _hasher_factory(path: pathlib.Path) -> file.FileHasher:
+        return file.SimpleFileHasher(
+            path,
+            hash_engine(),  # pytype: disable=not-instantiable
+            chunk_size=chunk_size,
+        )
+
+    return _hasher_factory
+
+
+def run(args: argparse.Namespace) -> None:
+    """Performs the benchmark.
+
+    Args:
+        args: The arguments specifying the benchmark scenario.
+    """
+    # 1. Hashing layer
+    if args.use_shards:
+        hasher = get_sharded_file_hasher_factory(
+            args.hash_method, args.chunk, args.shard
+        )
+    else:
+        hasher = get_file_hasher_factory(args.hash_method, args.chunk)
+
+    # 2. Serialization layer
+    if args.skip_manifest or args.single_digest:
+        merge_hasher_factory = get_hash_engine_factory(args.merge_hasher)
+        if args.use_shards:
+            serializer = serialize_by_file_shard.DigestSerializer(
+                hasher,
+                merge_hasher_factory(),  # pytype: disable=not-instantiable
+                max_workers=args.max_workers,
+            )
+        else:
+            # This gets complicated because the API here does not match the
+            # rest. We should fix this.
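+            # (Compare with the sharded branch above, which passes the hasher
+            # factory and an instantiated merge engine. This API instead wants
+            # an already-built hasher, constructed here with a placeholder
+            # path, plus the merge engine class itself. It also supports only
+            # a single worker, hence the check below.)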
+            if args.max_workers is not None and args.max_workers != 1:
+                raise ValueError("Currently, only 1 worker is supported here")
+            serializer = serialize_by_file.DigestSerializer(
+                # pytype: disable=wrong-arg-count
+                hasher(pathlib.Path("unused")),
+                # pytype: enable=wrong-arg-count
+                merge_hasher_factory,
+            )
+    else:
+        if args.use_shards:
+            serializer_factory = serialize_by_file_shard.ManifestSerializer
+        else:
+            serializer_factory = serialize_by_file.ManifestSerializer
+
+        serializer = serializer_factory(hasher, max_workers=args.max_workers)
+
+    # 3. Signing layer
+    if args.skip_manifest:
+        in_toto_builder = id  # Do nothing, just evaluate the argument
+    else:
+        if args.single_digest:
+            in_toto_builder = in_toto.SingleDigestIntotoPayload
+        else:
+            match (args.digest_of_digests, args.use_shards):
+                case (True, True):
+                    in_toto_builder = in_toto.DigestOfShardDigestsIntotoPayload
+                case (True, False):
+                    in_toto_builder = in_toto.DigestOfDigestsIntotoPayload
+                case (False, True):
+                    in_toto_builder = in_toto.ShardDigestsIntotoPayload
+                case (False, False):
+                    in_toto_builder = in_toto.DigestsIntotoPayload
+
+        in_toto_builder = in_toto_builder.from_manifest
+
+    # Put everything together
+    if not args.dry_run:
+        in_toto_builder(serializer.serialize(args.path))
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Builds the command line parser for the benchmark runner."""
+    parser = argparse.ArgumentParser(
+        description="Benchmark full serialization of a model for model signing"
+    )
+
+    parser.add_argument("path", help="path to model", type=pathlib.Path)
+    parser.add_argument(
+        "--dry_run", help="don't run anything", action="store_true"
+    )
+    parser.add_argument(
+        "--hash_method",
+        help="hash method to use (default: sha256)",
+        choices=["sha256", "blake2"],
+        default="sha256",
+    )
+    parser.add_argument(
+        "--max_workers", help="number of parallel workers to use", type=int
+    )
+
+    param_groups = parser.add_argument_group("Internal parameters to fine-tune")
+    param_groups.add_argument(
+        "--chunk", help="chunk size (default: 8192)", type=int, default=8192
+    )
+    param_groups.add_argument(
+        "--shard",
+        help="shard size (default: 1000000)",
+        type=int,
+        default=1000000,
+    )
+
+    shard_group = parser.add_argument_group("Serialization modes")
+    shard_group.add_argument(
+        "--use_shards", help="serialize by shards", action="store_true"
+    )
+    shard_group.add_argument(
+        "--skip_manifest",
+        help="serialize to a single digest, skip manifest creation",
+        action="store_true",
+    )
+    shard_group.add_argument(
+        "--merge_hasher",
+        help="hasher to use to merge individual hashes "
+        "when skipping manifest creation (default: sha256)",
+        choices=["sha256", "blake2"],
+        default="sha256",
+    )
+
+    intoto_group = parser.add_argument_group(
+        "Manifest to in-toto serialization formats"
+    )
+    intoto_group.add_argument(
+        "--single_digest",
+        help="serialize to a single digest, use manifest with one entry",
+        action="store_true",
+    )
+    intoto_group.add_argument(
+        "--digest_of_digests",
+        help="generate an in-toto statement with a single subject",
+        action="store_true",
+    )
+
+    return parser
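+
+
+# Example invocations against a previously generated model (paths are
+# hypothetical; the hyperfine pattern follows the commit message):
+#
+#   python benchmarks/serialize.py /tmp/file
+#   python benchmarks/serialize.py /tmp/matrix --use_shards --max_workers 4
+#   python benchmarks/serialize.py /tmp/file --skip_manifest --merge_hasher blake2
+#   hyperfine -w 3 "python benchmarks/serialize.py /tmp/file"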
+
+
+if __name__ == "__main__":
+    args = build_parser().parse_args()
+    if args.skip_manifest and (args.single_digest or args.digest_of_digests):
+        raise ValueError(
+            "Cannot combine --skip_manifest with manifest to in-toto options"
+        )
+    if args.single_digest and args.digest_of_digests:
+        raise ValueError(
+            "At most one of --single_digest and --digest_of_digests can be used"
+        )
+    run(args)
diff --git a/pyproject.toml b/pyproject.toml
index 443031f4..b2c772b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,6 +64,22 @@ randomize = true
 [[tool.hatch.envs.hatch-test.matrix]]
 python = ["3.10", "3.11", "3.12"]
 
+[tool.hatch.envs.bench]
+description = """Custom environment for running benchmarks.
+Use `hatch run +py=3... bench:generate ${args}` to generate test models.
+Use `hatch run +py=3... bench:serialize ${args}` to benchmark serialization code.
+"""
+extra-dependencies = [
+  "numpy",
+]
+
+[[tool.hatch.envs.bench.matrix]]
+python = ["3.10", "3.11", "3.12"]
+
+[tool.hatch.envs.bench.scripts]
+generate = "python benchmarks/generate.py {args}"
+serialize = "python benchmarks/serialize.py {args}"
+
 [tool.hatch.envs.docs]
 description = """Custom environment for pdoc.
 Use `hatch run docs:serve` to view documentation.
@@ -88,7 +104,7 @@ installer = "pip"
 python = "3.11"
 
 [tool.hatch.envs.type.scripts]
-check = "pytype -k -j auto src tests"
+check = "pytype -k -j auto src tests benchmarks"
 
 [tool.coverage.report]
 exclude_also = [