From 446ab0f12efb696a03046095fd3a6dece7299228 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 13 Aug 2024 11:56:17 +0200 Subject: [PATCH 1/9] allow to print generated and enriched SBOM to stdout --- complassist/_helpers.py | 14 +++++++++++--- complassist/_sbom_generate.py | 9 +++++++-- complassist/main.py | 4 ++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/complassist/_helpers.py b/complassist/_helpers.py index 095ea62..2f6d1a0 100644 --- a/complassist/_helpers.py +++ b/complassist/_helpers.py @@ -49,9 +49,17 @@ def read_json_file(path: str) -> dict: def write_json_file(data: dict, path: str) -> None: - """Write a dict into a JSON file""" - with open(path, "w", encoding="UTF-8") as jsonfile: - return json.dump(data, jsonfile, indent=2) + """Write a dict into a JSON file, unless path is `-` for which it will be stdout""" + if path == "-": + print(json.dumps(data, indent=2)) + else: + with open(path, "w", encoding="UTF-8") as jsonfile: + json.dump(data, jsonfile, indent=2) + + +def print_json_file(path: str) -> None: + """Open a JSON file and print it to stdout""" + write_json_file(read_json_file(path), "-") def delete_file(path: str) -> None: diff --git a/complassist/_sbom_generate.py b/complassist/_sbom_generate.py index 9e091f6..5c71224 100644 --- a/complassist/_sbom_generate.py +++ b/complassist/_sbom_generate.py @@ -15,6 +15,8 @@ import docker from docker.errors import APIError, ContainerError, DockerException, ImageNotFound +from ._helpers import print_json_file + def _sanitize_container_name(name: str) -> str: """ @@ -143,8 +145,11 @@ def generate_cdx_sbom(directory: str, output: str = "") -> str: with NamedTemporaryFile() as tmpfile: _run_cdxgen(dclient, directory, cont_name, tmpfile.name) - # Copy to final destination with user permissions - copy2(tmpfile.name, output) + # Copy to final destination with user permissions, or print file if requested + if output == "-": + print_json_file(tmpfile.name) + else: + copy2(tmpfile.name, output) logging.info("SBOM has been saved to %s", output) diff --git a/complassist/main.py b/complassist/main.py index 7a4e222..e41d1b6 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -62,7 +62,7 @@ "--output", help=( "Path where the generated SBOM shall be saved. " - "If unset, it will be stored in a temporary directory." + "If unset, it will be stored in a temporary directory. Use '-' to print it to stdout." ), ) @@ -81,7 +81,7 @@ parser_sbom_enrich.add_argument( "-o", "--output", - help="Path where the enriched SBOM shall be saved", + help="Path where the enriched SBOM shall be saved. Use '-' to print it to stdout.", required=True, ) From 343b2532fdb8a1e9db74c15ea1ba7745080a87df Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 13 Aug 2024 12:15:57 +0200 Subject: [PATCH 2/9] separate enrichment process, fetch clearlydefined data first --- complassist/_sbom_enrich.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index 49f7d22..355de6a 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -6,6 +6,7 @@ import logging from datetime import datetime +from typing import Any from . import __version__ from ._clearlydefined import ( @@ -132,12 +133,17 @@ def _compare_sbom_cd_copyright( return msg, msg_level -def _enrich_component_with_cd_data(component: dict) -> None: +def _enrich_component_with_cd_data( + component: dict, clearlydefined_data: dict[str, dict[str, str]] +) -> None: """ Enriches a single component with data from ClearlyDefined. Args: component (dict): The component data to enrich. + + clearlydefined_data (dict): Previously fetched data for detected PURLs + from ClearlyDefined. """ # Get purl, original licenses, and short/simplified licenses data from component raw_data = extract_items_from_component( @@ -150,10 +156,9 @@ def _enrich_component_with_cd_data(component: dict) -> None: sbom_license = licenses_short_to_string(sbom_licenses_short_item) sbom_copyright = raw_data["copyright"] - # Get licensing/copyright data from ClearlyDefined - cd_license, cd_copyright = get_clearlydefined_license_and_copyright( - coordinates=purl_to_cd_coordinates(purl) - ) + # Get fetched licensing/copyright data from ClearlyDefined + cd_license = clearlydefined_data[purl].get("license") + cd_copyright = clearlydefined_data[purl].get("copyright") # Compare license data of SBOM with ClearlyDefined msg, msg_level = _compare_sbom_cd_license( @@ -246,11 +251,20 @@ def enrich_sbom_with_clearlydefined(sbom_file: str, output_file: str) -> None: output_file (str): Path to save the enriched SBOM. """ - sbom = read_json_file(sbom_file) + sbom: dict[str, list[dict]] = read_json_file(sbom_file) + + # Loop all contained components, and collect ClearlyDefined data + clearlydefined_data: dict[str, dict[str, str]] = {} + for component in sbom.get("components", []): + purl = extract_items_from_component(component, ["purl"], use_flict=False)["purl"] + cd_license, cd_copyright = get_clearlydefined_license_and_copyright( + coordinates=purl_to_cd_coordinates(purl) + ) + clearlydefined_data[purl] = {"license": cd_license, "copyright": cd_copyright} - # Loop all contained components, and collect updates + # Now, update the components with the fetched ClearlyDefined data for component in sbom.get("components", []): - _enrich_component_with_cd_data(component) + _enrich_component_with_cd_data(component, clearlydefined_data) # Update SBOM metadata sbom = _update_sbom_metadata(sbom) From 274709c4c8b098d8f6a4accd8f2d4221d9aa4a4b Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Tue, 13 Aug 2024 14:37:35 +0200 Subject: [PATCH 3/9] get clearlydefined data in chunks from API --- complassist/_clearlydefined.py | 50 ++++++++++++++++++++++++++++++++-- complassist/_sbom_enrich.py | 38 ++++++++++++++++++++------ complassist/main.py | 7 ++++- 3 files changed, 84 insertions(+), 11 deletions(-) diff --git a/complassist/_clearlydefined.py b/complassist/_clearlydefined.py index 33fd69f..0b12b0d 100644 --- a/complassist/_clearlydefined.py +++ b/complassist/_clearlydefined.py @@ -72,7 +72,7 @@ def _cdapi_call( method: str = "GET", api_url: str = "https://api.clearlydefined.io", basepath: str = "definitions", - json_dict: dict | None = None, + json_dict: dict | list | None = None, **params: str, ) -> dict: """ @@ -104,7 +104,7 @@ def _cdapi_call( """ url = urljoin(api_url, pathjoin(basepath, path)) if json_dict: - result = make_request_with_retry(method=method, url=url, json=json_dict) + result = make_request_with_retry(method=method, url=url, json=json_dict, params=params) else: result = make_request_with_retry(method=method, url=url, params=params) @@ -213,6 +213,52 @@ def get_clearlydefined_license_and_copyright(coordinates: str) -> tuple[str, str return declared_license, copyrights +def get_clearlydefined_license_and_copyright_in_batches( + purls: list[str], +) -> dict[str, tuple[str, str]]: + """ + Retrieves the declared license for multiple purls from + ClearlyDefined. + + Queries the ClearlyDefined API to get the declared license for the provided + packages via Package URLs. If no license is found, it initiates a + harvest request. + + Args: + coordinates (str): The ClearlyDefined coordinates or Package URL for + which to retrieve the license. + + Returns: + tuple[str, str]: A tuple containing: + - The declared license as a string, or an empty string if not found. + - The detected copyright attributions as a single string, with each + attribution separated by a newline, or an empty string if not + found. + """ + coordinates_purls = {purl_to_cd_coordinates(purl): purl for purl in purls} + api_return = _cdapi_call( + path="", method="POST", json_dict=list(coordinates_purls.keys()), expand="-files" + ) + + result: dict[str, tuple[str, str]] = {} + for pkg_coordinates, cd_data in api_return.items(): + pkg_purl = coordinates_purls[pkg_coordinates] + declared_license, copyrights = _extract_license_copyright(cd_data) + + # Declared license couldn't be extracted. Add to harvest + if not declared_license: + logging.info( + "Adding %s to be harvest by ClearlyDefined. " + "Make sure the package and this version actually exists, and try again later.", + pkg_coordinates, + ) + _send_cd_harvest_request(pkg_coordinates) + + result[pkg_purl] = (declared_license, copyrights) + + return result + + def print_clearlydefined_result(results: tuple[str, str]) -> None: """ Pretty-print the results for declared license and copyright attributions diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index 355de6a..b60d9d8 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -6,15 +6,16 @@ import logging from datetime import datetime -from typing import Any from . import __version__ from ._clearlydefined import ( get_clearlydefined_license_and_copyright, + get_clearlydefined_license_and_copyright_in_batches, purl_to_cd_coordinates, ) from ._helpers import extract_excerpt, read_json_file, write_json_file from ._sbom_parse import ( + extract_items_from_cdx_sbom, extract_items_from_component, licenses_short_to_string, spdx_expression_to_cdx_licenses, @@ -233,7 +234,9 @@ def _update_sbom_metadata(sbom: dict) -> dict: return sbom -def enrich_sbom_with_clearlydefined(sbom_file: str, output_file: str) -> None: +def enrich_sbom_with_clearlydefined( + sbom_file: str, output_file: str, in_chunks: bool = False +) -> None: """ Parse a SBOM and enrich license/copyright data of each component with ClearlyDefined. Write result to new SBOM file. @@ -249,18 +252,37 @@ def enrich_sbom_with_clearlydefined(sbom_file: str, output_file: str) -> None: Args: sbom_file (str): Path to the input SBOM file. output_file (str): Path to save the enriched SBOM. + in_chunks (bool): Ask ClearlyDefined API for multiple packages at once """ sbom: dict[str, list[dict]] = read_json_file(sbom_file) # Loop all contained components, and collect ClearlyDefined data clearlydefined_data: dict[str, dict[str, str]] = {} - for component in sbom.get("components", []): - purl = extract_items_from_component(component, ["purl"], use_flict=False)["purl"] - cd_license, cd_copyright = get_clearlydefined_license_and_copyright( - coordinates=purl_to_cd_coordinates(purl) - ) - clearlydefined_data[purl] = {"license": cd_license, "copyright": cd_copyright} + all_purls: list[str] = [ + c["purl"] for c in extract_items_from_cdx_sbom(sbom_file, information=["purl"]) + ] + if in_chunks: + + # Split all purls in chunks of `max_components` size + max_components = 10 + purls_chunks: list[list[str]] = [ + all_purls[x : x + max_components] for x in range(0, len(all_purls), max_components) + ] + for chunk in purls_chunks: + logging.info("Getting ClearlyDefined data for %s", ", ".join(chunk)) + result = get_clearlydefined_license_and_copyright_in_batches(chunk) + # Unpack results in chunks, and add to clearlydefined_data + for purl, (cd_license, cd_copyright) in result.items(): + clearlydefined_data[purl] = {"license": cd_license, "copyright": cd_copyright} + + else: + for purl in all_purls: + logging.info("Getting ClearlyDefined data for %s", purl) + cd_license, cd_copyright = get_clearlydefined_license_and_copyright( + coordinates=purl_to_cd_coordinates(purl) + ) + clearlydefined_data[purl] = {"license": cd_license, "copyright": cd_copyright} # Now, update the components with the fetched ClearlyDefined data for component in sbom.get("components", []): diff --git a/complassist/main.py b/complassist/main.py index e41d1b6..0fd1909 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -84,6 +84,11 @@ help="Path where the enriched SBOM shall be saved. Use '-' to print it to stdout.", required=True, ) +parser_sbom_enrich.add_argument( + "--in-chunks", + help="Request information for multiple packages at once from ClearlyDefined API", + action="store_true" +) # SBOM Parser parser_sbom_read = subparser_sbom.add_parser( @@ -256,7 +261,7 @@ def main(): # pylint: disable=too-many-branches, too-many-statements # Enrich SBOM by ClearlyDefined data elif args.sbom_command == "enrich": - enrich_sbom_with_clearlydefined(args.file, args.output) + enrich_sbom_with_clearlydefined(args.file, args.output, args.in_chunks) # Parse info from SBOM elif args.sbom_command == "parse": From 394aba245705b9e5e1e566031394cc47d63c1130 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 09:56:22 +0200 Subject: [PATCH 4/9] add option to do intense HTTP logging --- complassist/_logging.py | 30 ++++++++++++++++++++++++++++++ complassist/main.py | 25 ++++++++++++------------- 2 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 complassist/_logging.py diff --git a/complassist/_logging.py b/complassist/_logging.py new file mode 100644 index 0000000..aa610c1 --- /dev/null +++ b/complassist/_logging.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2024 DB Systel GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Logging functions""" +import logging + + +def configure_logger(args) -> logging.Logger: + """Set logging options""" + # Base logger config + log = logging.getLogger() + logging.basicConfig( + encoding="utf-8", + format="%(levelname)s: %(message)s", + level=logging.INFO, + ) + # Adapt logging level + if args.verbose: + log.setLevel("DEBUG") + # Activate extreme logging for requests to also get POST data + if hasattr(args, "http_debug") and args.http_debug: + requests_log = logging.getLogger("requests.packages.urllib3") + requests_log.setLevel(logging.DEBUG) + requests_log.propagate = True + import http.client as http_client # pylint: disable=import-outside-toplevel + + http_client.HTTPConnection.debuglevel = 1 + + return log diff --git a/complassist/main.py b/complassist/main.py index 0fd1909..fb2b6b4 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -20,6 +20,7 @@ ) from ._helpers import dict_to_json from ._licensing import get_outbound_candidate, list_all_licenses +from ._logging import configure_logger from ._sbom_enrich import enrich_sbom_with_clearlydefined from ._sbom_generate import generate_cdx_sbom from ._sbom_parse import extract_items_from_cdx_sbom @@ -33,7 +34,7 @@ # Common flags, usable for all effective subcommands common_flags = argparse.ArgumentParser(add_help=False) # No automatic help to avoid duplication -common_flags.add_argument("-v", "--verbose", action="store_true", help="Verbose output") +common_flags.add_argument("-v", "--verbose", action="store_true", help="Verbose output (DEBUG)") # SBOM commands parser_sbom = subparsers.add_parser( @@ -89,6 +90,11 @@ help="Request information for multiple packages at once from ClearlyDefined API", action="store_true" ) +parser_sbom_enrich.add_argument( + "--http-debug", + help="Activate extreme HTTP logging", + action="store_true" +) # SBOM Parser parser_sbom_read = subparser_sbom.add_parser( @@ -150,6 +156,11 @@ help="Fetch licensing and copyright information of packages from ClearlyDefined", parents=[common_flags], ) +parser_cd_fetch.add_argument( + "--http-debug", + help="Activate extreme HTTP logging", + action="store_true" +) parser_cd_fetch_exclusive = parser_cd_fetch.add_mutually_exclusive_group(required=True) parser_cd_fetch_exclusive.add_argument( "-p", @@ -230,18 +241,6 @@ ) -def configure_logger(args) -> logging.Logger: - """Set logging options""" - log = logging.getLogger() - logging.basicConfig( - encoding="utf-8", - format="%(levelname)s: %(message)s", - level=(logging.DEBUG if args.verbose else logging.INFO), - ) - - return log - - def main(): # pylint: disable=too-many-branches, too-many-statements """Main function""" From 7148ceb50418a2f7c446585378a28baf42c56321 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 10:10:52 +0200 Subject: [PATCH 5/9] fix mypy issues --- complassist/_sbom_enrich.py | 4 ++-- complassist/_sbom_parse.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index b60d9d8..efa6682 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -24,7 +24,7 @@ def _compare_sbom_cd_license( component: dict, - cd_license: str, + cd_license: str | None, sbom_license: str, sbom_licenses_item: list[dict], sbom_licenses_short_item: list[dict], @@ -88,7 +88,7 @@ def _compare_sbom_cd_license( def _compare_sbom_cd_copyright( - component: dict, cd_copyright: str, sbom_copyright: str + component: dict, cd_copyright: str | None, sbom_copyright: str ) -> tuple[str, str]: """ Compares and potentially updates the SBOM component's copyright information diff --git a/complassist/_sbom_parse.py b/complassist/_sbom_parse.py index 6c1a6c2..ce2e8cd 100644 --- a/complassist/_sbom_parse.py +++ b/complassist/_sbom_parse.py @@ -135,10 +135,12 @@ def licenses_short_to_string(licenses: list) -> str: return "" -def spdx_expression_to_cdx_licenses(spdx_expression: str) -> list: +def spdx_expression_to_cdx_licenses(spdx_expression: str | None) -> list: """ Convert a SPDX expression to a valid CycloneDX licenses item """ + if spdx_expression is None: + return [{"expression": spdx_expression}] return [{"expression": spdx_expression}] From a73b93e46aee57683aae65d937c0309bb23cbd78 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 10:19:10 +0200 Subject: [PATCH 6/9] refactor: split/combine common functions to send harvest --- complassist/_clearlydefined.py | 38 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/complassist/_clearlydefined.py b/complassist/_clearlydefined.py index 0b12b0d..f171ef4 100644 --- a/complassist/_clearlydefined.py +++ b/complassist/_clearlydefined.py @@ -158,17 +158,23 @@ def _extract_license_copyright(cd_api_response: dict) -> tuple[str, str]: return license_declared, "\n".join(copyrights).strip() -def _send_cd_harvest_request(coordinates: str) -> None: +def _handle_missing_license_and_request_harvest(coordinates: str) -> None: """ - Sends a harvest request to ClearlyDefined for the given coordinates. + Handles the case when a declared license is not found and triggers a harvest + request. - Triggers a ClearlyDefined harvest operation for the provided coordinates to - request the collection of metadata and license information. + Logs the event of a missing license and sends a harvest request to + ClearlyDefined for the given coordinates. Args: - coordinates (str): The ClearlyDefined coordinates or Package URL (purl) - for which to request harvesting. + coordinates (str): The ClearlyDefined coordinates or Package URL for + which the license is missing. """ + logging.info( + "Adding %s to be harvested by ClearlyDefined. " + "Make sure the package and this version actually exists, and try again later.", + coordinates, + ) _cdapi_call( path="", method="POST", @@ -179,8 +185,7 @@ def _send_cd_harvest_request(coordinates: str) -> None: def get_clearlydefined_license_and_copyright(coordinates: str) -> tuple[str, str]: """ - Retrieves the declared license for the specified coordinates from - ClearlyDefined. + Retrieves the declared license for the specified coordinates from ClearlyDefined. Queries the ClearlyDefined API to get the declared license for the provided coordinates or Package URL (purl). If no license is found, it initiates a @@ -203,12 +208,7 @@ def get_clearlydefined_license_and_copyright(coordinates: str) -> tuple[str, str # Declared license couldn't be extracted. Add to harvest if not declared_license: - logging.info( - "Adding %s to be harvest by ClearlyDefined. " - "Make sure the package and this version actually exists, and try again later.", - coordinates, - ) - _send_cd_harvest_request(coordinates) + _handle_missing_license_and_request_harvest(coordinates) return declared_license, copyrights @@ -217,8 +217,7 @@ def get_clearlydefined_license_and_copyright_in_batches( purls: list[str], ) -> dict[str, tuple[str, str]]: """ - Retrieves the declared license for multiple purls from - ClearlyDefined. + Retrieves the declared license for multiple purls from ClearlyDefined. Queries the ClearlyDefined API to get the declared license for the provided packages via Package URLs. If no license is found, it initiates a @@ -247,12 +246,7 @@ def get_clearlydefined_license_and_copyright_in_batches( # Declared license couldn't be extracted. Add to harvest if not declared_license: - logging.info( - "Adding %s to be harvest by ClearlyDefined. " - "Make sure the package and this version actually exists, and try again later.", - pkg_coordinates, - ) - _send_cd_harvest_request(pkg_coordinates) + _handle_missing_license_and_request_harvest(pkg_coordinates) result[pkg_purl] = (declared_license, copyrights) From 6f384759a9f36964935e8952338aa5e75df4a4fa Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 11:06:12 +0200 Subject: [PATCH 7/9] wording: rename chunks to batches, more common --- complassist/_sbom_enrich.py | 18 +++++++++--------- complassist/main.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index efa6682..c25d8b9 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -235,7 +235,7 @@ def _update_sbom_metadata(sbom: dict) -> dict: def enrich_sbom_with_clearlydefined( - sbom_file: str, output_file: str, in_chunks: bool = False + sbom_file: str, output_file: str, in_batches: bool = False ) -> None: """ Parse a SBOM and enrich license/copyright data of each component with @@ -252,7 +252,7 @@ def enrich_sbom_with_clearlydefined( Args: sbom_file (str): Path to the input SBOM file. output_file (str): Path to save the enriched SBOM. - in_chunks (bool): Ask ClearlyDefined API for multiple packages at once + in_batches (bool): Ask ClearlyDefined API for multiple packages at once """ sbom: dict[str, list[dict]] = read_json_file(sbom_file) @@ -262,17 +262,17 @@ def enrich_sbom_with_clearlydefined( all_purls: list[str] = [ c["purl"] for c in extract_items_from_cdx_sbom(sbom_file, information=["purl"]) ] - if in_chunks: + if in_batches: - # Split all purls in chunks of `max_components` size + # Split all purls in batches of `max_components` size max_components = 10 - purls_chunks: list[list[str]] = [ + purls_batches: list[list[str]] = [ all_purls[x : x + max_components] for x in range(0, len(all_purls), max_components) ] - for chunk in purls_chunks: - logging.info("Getting ClearlyDefined data for %s", ", ".join(chunk)) - result = get_clearlydefined_license_and_copyright_in_batches(chunk) - # Unpack results in chunks, and add to clearlydefined_data + for batch in purls_batches: + logging.info("Getting ClearlyDefined data for %s", ", ".join(batch)) + result = get_clearlydefined_license_and_copyright_in_batches(batch) + # Unpack result batches, and add to clearlydefined_data for purl, (cd_license, cd_copyright) in result.items(): clearlydefined_data[purl] = {"license": cd_license, "copyright": cd_copyright} diff --git a/complassist/main.py b/complassist/main.py index fb2b6b4..182399e 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -86,7 +86,7 @@ required=True, ) parser_sbom_enrich.add_argument( - "--in-chunks", + "--in-batches", help="Request information for multiple packages at once from ClearlyDefined API", action="store_true" ) @@ -260,7 +260,7 @@ def main(): # pylint: disable=too-many-branches, too-many-statements # Enrich SBOM by ClearlyDefined data elif args.sbom_command == "enrich": - enrich_sbom_with_clearlydefined(args.file, args.output, args.in_chunks) + enrich_sbom_with_clearlydefined(args.file, args.output, args.in_batches) # Parse info from SBOM elif args.sbom_command == "parse": From 9d55bb682d619bf7663d6812e25f3a22360f340c Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 11:07:57 +0200 Subject: [PATCH 8/9] use ClearlyDefined batching by default --- complassist/_sbom_enrich.py | 4 ++-- complassist/main.py | 18 ++++++++---------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index c25d8b9..c58014c 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -235,7 +235,7 @@ def _update_sbom_metadata(sbom: dict) -> dict: def enrich_sbom_with_clearlydefined( - sbom_file: str, output_file: str, in_batches: bool = False + sbom_file: str, output_file: str, in_batches: bool = True ) -> None: """ Parse a SBOM and enrich license/copyright data of each component with @@ -252,7 +252,7 @@ def enrich_sbom_with_clearlydefined( Args: sbom_file (str): Path to the input SBOM file. output_file (str): Path to save the enriched SBOM. - in_batches (bool): Ask ClearlyDefined API for multiple packages at once + in_batches (bool): Ask ClearlyDefined API for multiple packages at once. """ sbom: dict[str, list[dict]] = read_json_file(sbom_file) diff --git a/complassist/main.py b/complassist/main.py index 182399e..046df36 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -86,14 +86,14 @@ required=True, ) parser_sbom_enrich.add_argument( - "--in-batches", - help="Request information for multiple packages at once from ClearlyDefined API", - action="store_true" + "--no-batches", + help=( + "Request information for one package at a time from ClearlyDefined API, and not in batches." + ), + action="store_true", ) parser_sbom_enrich.add_argument( - "--http-debug", - help="Activate extreme HTTP logging", - action="store_true" + "--http-debug", help="Activate extreme HTTP logging", action="store_true" ) # SBOM Parser @@ -157,9 +157,7 @@ parents=[common_flags], ) parser_cd_fetch.add_argument( - "--http-debug", - help="Activate extreme HTTP logging", - action="store_true" + "--http-debug", help="Activate extreme HTTP logging", action="store_true" ) parser_cd_fetch_exclusive = parser_cd_fetch.add_mutually_exclusive_group(required=True) parser_cd_fetch_exclusive.add_argument( @@ -260,7 +258,7 @@ def main(): # pylint: disable=too-many-branches, too-many-statements # Enrich SBOM by ClearlyDefined data elif args.sbom_command == "enrich": - enrich_sbom_with_clearlydefined(args.file, args.output, args.in_batches) + enrich_sbom_with_clearlydefined(args.file, args.output, not args.no_batches) # Parse info from SBOM elif args.sbom_command == "parse": From 5c5ac71e403fb20acf552105c6bcdcd8ac18a215 Mon Sep 17 00:00:00 2001 From: Max Mehl Date: Wed, 14 Aug 2024 11:15:20 +0200 Subject: [PATCH 9/9] make clearlydefined batch size configurable --- complassist/_sbom_enrich.py | 9 ++++----- complassist/main.py | 14 +++++++++++++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/complassist/_sbom_enrich.py b/complassist/_sbom_enrich.py index c58014c..d19441c 100644 --- a/complassist/_sbom_enrich.py +++ b/complassist/_sbom_enrich.py @@ -235,7 +235,7 @@ def _update_sbom_metadata(sbom: dict) -> dict: def enrich_sbom_with_clearlydefined( - sbom_file: str, output_file: str, in_batches: bool = True + sbom_file: str, output_file: str, in_batches: bool = True, batch_size: int = 15 ) -> None: """ Parse a SBOM and enrich license/copyright data of each component with @@ -253,6 +253,7 @@ def enrich_sbom_with_clearlydefined( sbom_file (str): Path to the input SBOM file. output_file (str): Path to save the enriched SBOM. in_batches (bool): Ask ClearlyDefined API for multiple packages at once. + batch_size (int): Number of packages for batch request at ClearlyDefined. """ sbom: dict[str, list[dict]] = read_json_file(sbom_file) @@ -263,11 +264,9 @@ def enrich_sbom_with_clearlydefined( c["purl"] for c in extract_items_from_cdx_sbom(sbom_file, information=["purl"]) ] if in_batches: - - # Split all purls in batches of `max_components` size - max_components = 10 + # Split all purls in batches of `batch_size` size purls_batches: list[list[str]] = [ - all_purls[x : x + max_components] for x in range(0, len(all_purls), max_components) + all_purls[x : x + batch_size] for x in range(0, len(all_purls), batch_size) ] for batch in purls_batches: logging.info("Getting ClearlyDefined data for %s", ", ".join(batch)) diff --git a/complassist/main.py b/complassist/main.py index 046df36..a342b98 100644 --- a/complassist/main.py +++ b/complassist/main.py @@ -71,6 +71,7 @@ parser_sbom_enrich = subparser_sbom.add_parser( "enrich", help="Enrich a CycloneDX SBOM and its licensing/copyright data via ClearlyDefined", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[common_flags], ) parser_sbom_enrich.add_argument( @@ -85,6 +86,12 @@ help="Path where the enriched SBOM shall be saved. Use '-' to print it to stdout.", required=True, ) +parser_sbom_enrich.add_argument( + "--batch-size", + help="The number of packages to request information for from ClearlyDefined at once.", + default=25, + type=int, +) parser_sbom_enrich.add_argument( "--no-batches", help=( @@ -258,7 +265,12 @@ def main(): # pylint: disable=too-many-branches, too-many-statements # Enrich SBOM by ClearlyDefined data elif args.sbom_command == "enrich": - enrich_sbom_with_clearlydefined(args.file, args.output, not args.no_batches) + enrich_sbom_with_clearlydefined( + sbom_file=args.file, + output_file=args.output, + in_batches=not args.no_batches, + batch_size=args.batch_size, + ) # Parse info from SBOM elif args.sbom_command == "parse":