diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index 3f689ec9a..06561c0ef 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -60,6 +60,7 @@ value_and_unit, ) from azul.types import ( + AnyJSON, AnyMutableJSON, JSON, MutableJSON, @@ -643,6 +644,22 @@ def _update_replica_schema(*, path: Sequence[str], key: str, value: AnyMutableJSON): + """ + Update in place a (part of an) existing PFB schema to ensure that it + accommodates a given (part of a) JSON document. The schema will only ever + expand, so after updating it will describe a superset of the documents that + it described pre-update. Starting from an empty schema, repeatedly calling + this function allows us to discover a general schema for a series of + documents of unknown shape. + :param schema: a part of a PFB schema. It may be empty. + :param path: the series of field names that locate `schema` within its + top-level parent schema. The first entry should be the name of + the underlying PFB entity's record type. + :param key: the key within `schema` whose associated value will be updated + to describe `value`. This is the only part of `schema` that may + be mutated. + :param value: a part of a PFB entity. + """ try: old_type = schema[key] except KeyError: @@ -700,11 +717,20 @@ def _update_replica_schema(*, def _new_replica_schema(*, path: Sequence[str], - value: AnyMutableJSON, + value: AnyJSON, ) -> AnyMutableJSON: + """ + Create a part of a PFB schema to describe a part of a PFB entity represented + as a JSON document. + :param path: the location of `value` within the root document as a series + of keys. The first key should be the name of the underlying PFB + entity's type within the schema. + :param value: a part of a PFB entity. + :return: JSON describing the contents of `value` as a part of a PFB schema. 
+ """ if value is None: result = 'null' - elif isinstance(value, list): + elif isinstance(value, (tuple, list)): # Empty list indicates "no type" (emtpy union). This will be replaced # with an actual type unless we never encounter a non-empty array. result = {'type': 'array', 'items': []} diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 82d2af11d..0b256528b 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -78,6 +78,7 @@ ) from azul.json import ( copy_json, + json_hash, ) from azul.logging import ( configure_test_logging, @@ -1298,12 +1299,12 @@ def test_manifest_content_disposition_header(self): 'The format is replica-based') @manifest_test def test_verbatim_jsonl_manifest(self): - bundle = self._load_canned_bundle(one(self.bundles())) expected = [ { 'type': replica_type, 'contents': bundle.metadata_files[key], } + for bundle in map(self._load_canned_bundle, self.bundles()) for replica_type, key in [ ('links', 'links.json'), ('cell_suspension', 'cell_suspension_0.json'), @@ -1320,12 +1321,8 @@ def test_verbatim_jsonl_manifest(self): for row in response.content.decode().splitlines() ] - def sort_key(hca_doc: JSON) -> str: - try: - return hca_doc['contents']['provenance']['document_id'] - except KeyError: - assert hca_doc['contents']['schema_type'] == 'link_bundle' - return '' + def sort_key(row: JSON) -> str: + return json_hash(row).digest() expected.sort(key=sort_key) response.sort(key=sort_key)