Skip to content

Commit

Permalink
fixup! [a] Create PFB-based verbatim manifest format for AnVIL (#6040)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Apr 18, 2024
1 parent b5ef9b0 commit e64a457
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 9 deletions.
30 changes: 28 additions & 2 deletions src/azul/service/avro_pfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
value_and_unit,
)
from azul.types import (
AnyJSON,
AnyMutableJSON,
JSON,
MutableJSON,
Expand Down Expand Up @@ -643,6 +644,22 @@ def _update_replica_schema(*,
path: Sequence[str],
key: str,
value: AnyMutableJSON):
"""
Update in place a (part of an) existing PFB schema to ensure that it
accommodates a given (part of a) JSON document. The schema will only ever
expand, so after updating it will describe a superset of the documents that
it described pre-update. Starting from an empty schema, repeatedly calling
this function allows us to discover a general schema for a series of
documents of unknown shape.
:param schema: a part of a PFB schema. It may be empty.
:param path: the series of field names that locate `schema` within its
top-level parent schema. The first entry should be the name of
the underlying PFB entity's record type.
:param key: the key within `schema` whose associated value will be updated
to describe `value`. This is the only part of `schema` that may
be mutated.
:param value: a part of a PFB entity.
"""
try:
old_type = schema[key]
except KeyError:
Expand Down Expand Up @@ -700,11 +717,20 @@ def _update_replica_schema(*,

def _new_replica_schema(*,
path: Sequence[str],
value: AnyMutableJSON,
value: AnyJSON,
) -> AnyMutableJSON:
"""
Create a part of a PFB schema to describe a part of a PFB entity represented
as a JSON document.
:param path: the location of `value` within the root document as a series
of keys. The first key should be the name of the underlying PFB
entity's type within the schema.
:param value: a part of a PFB entity.
:return: JSON describing the contents of `value` as a part of a PFB schema.
"""
if value is None:
result = 'null'
elif isinstance(value, list):
elif isinstance(value, (tuple, list)):
# Empty list indicates "no type" (empty union). This will be replaced
# with an actual type unless we never encounter a non-empty array.
result = {'type': 'array', 'items': []}
Expand Down
11 changes: 4 additions & 7 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
)
from azul.json import (
copy_json,
json_hash,
)
from azul.logging import (
configure_test_logging,
Expand Down Expand Up @@ -1298,12 +1299,12 @@ def test_manifest_content_disposition_header(self):
'The format is replica-based')
@manifest_test
def test_verbatim_jsonl_manifest(self):
bundle = self._load_canned_bundle(one(self.bundles()))
expected = [
{
'type': replica_type,
'contents': bundle.metadata_files[key],
}
for bundle in map(self._load_canned_bundle, self.bundles())
for replica_type, key in [
('links', 'links.json'),
('cell_suspension', 'cell_suspension_0.json'),
Expand All @@ -1320,12 +1321,8 @@ def test_verbatim_jsonl_manifest(self):
for row in response.content.decode().splitlines()
]

def sort_key(hca_doc: JSON) -> str:
try:
return hca_doc['contents']['provenance']['document_id']
except KeyError:
assert hca_doc['contents']['schema_type'] == 'link_bundle'
return ''
def sort_key(row: JSON) -> str:
return json_hash(row).digest()

expected.sort(key=sort_key)
response.sort(key=sort_key)
Expand Down

0 comments on commit e64a457

Please sign in to comment.