From 55522529516fe2c33fbe96aa575b8046f465d757 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 19 Apr 2024 18:57:36 -0700 Subject: [PATCH 01/11] Change row sorting in JSONL manifest test --- test/service/test_manifest.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index eb6c60278..d4eaafaa5 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -73,6 +73,7 @@ ) from azul.json import ( copy_json, + json_hash, ) from azul.logging import ( configure_test_logging, @@ -1309,12 +1310,8 @@ def test_verbatim_jsonl_manifest(self): self.assertEqual(200, response.status_code) response = list(map(json.loads, response.content.decode().splitlines())) - def sort_key(hca_doc: JSON) -> str: - try: - return hca_doc['provenance']['document_id'] - except KeyError: - assert hca_doc['schema_type'] == 'link_bundle' - return '' + def sort_key(row: JSON) -> bytes: + return json_hash(row).digest() expected.sort(key=sort_key) response.sort(key=sort_key) From 46f5df75e062af339d71d681a89002ab46607368 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 19 Apr 2024 15:58:23 -0700 Subject: [PATCH 02/11] Refactor JSONL manifest test --- test/service/test_manifest.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index d4eaafaa5..cd21ac6e4 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -210,6 +210,28 @@ def _assert_tsv(self, expected: list[tuple[str, ...]], actual: Response): actual[1:], expected[1:] = sorted(actual[1:]), sorted(expected[1:]) self.assertEqual(expected, actual) + def _assert_jsonl(self, expected: list[JSON], actual: Response): + """ + Assert that the body of the given response is the expected JSON array, + disregarding any row ordering differences. + + :param expected: a list of JSON objects. + + :param actual: an HTTP response containing JSON objects separated by + newlines + """ + manifest = [ + json.loads(row) + for row in actual.content.decode().splitlines() + ] + + def sort_key(row: JSON) -> bytes: + return json_hash(row).digest() + + manifest.sort(key=sort_key) + expected.sort(key=sort_key) + self.assertEqual(expected, manifest) + def _file_url(self, file_id, version): return str(self.base_url.set(path='/repository/files/' + file_id, args=dict(catalog=self.catalog, @@ -1294,9 +1316,9 @@ def test_manifest_content_disposition_header(self): 'The format is replica-based') @manifest_test def test_verbatim_jsonl_manifest(self): - bundle = self._load_canned_bundle(one(self.bundles())) expected = [ bundle.metadata_files[d] + for bundle in map(self._load_canned_bundle, self.bundles()) for d in [ 'links.json', 'cell_suspension_0.json', @@ -1308,14 +1330,7 @@ def test_verbatim_jsonl_manifest(self): ] response = self._get_manifest(ManifestFormat.verbatim_jsonl, {}) self.assertEqual(200, response.status_code) - response = list(map(json.loads, response.content.decode().splitlines())) - - def sort_key(row: JSON) -> bytes: - return json_hash(row).digest() - - expected.sort(key=sort_key) - response.sort(key=sort_key) - self.assertEqual(expected, response) + self._assert_jsonl(expected, response) class TestManifestCache(DCP1ManifestTestCase): From 54eb44e218a39c3ba801e25cdca4e840a4aa2267 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 19 Apr 2024 15:58:38 -0700 Subject: [PATCH 03/11] Add unit test for AnVIL JSONL manifest (#6140) --- test/service/test_manifest.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index cd21ac6e4..18ef4928c 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -2050,3 +2050,16 @@ def test_compact_manifest(self): ) ] self._assert_tsv(expected, response) + + @unittest.skipIf(not config.enable_replicas, + 'The format is replica-based') + @manifest_test + def test_verbatim_jsonl_manifest(self): + response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters={}) + self.assertEqual(200, response.status_code) + expected = [ + entity + for bundle in self.bundles() + for entity in self._load_canned_bundle(bundle).entities.values() + ] + self._assert_jsonl(expected, response) From 7ef15e43c6ddaf67a39fbd0a54d98b6c1dc006c8 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 10 Apr 2024 01:58:54 -0700 Subject: [PATCH 04/11] Refactor canned file loading --- test/indexer/__init__.py | 12 +++++++++--- test/pfb_test_case.py | 14 ++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/indexer/__init__.py b/test/indexer/__init__.py index b7e199c1f..ebbc6b4ef 100644 --- a/test/indexer/__init__.py +++ b/test/indexer/__init__.py @@ -3,9 +3,12 @@ abstractmethod, ) import json -import os +from pathlib import ( + Path, +) from typing import ( Generic, + Literal, Optional, Type, Union, @@ -89,6 +92,10 @@ class CannedFileTestCase(AzulUnitTestCase): expected outputs. """ + @classmethod + def _data_path(cls, module: Literal['service', 'indexer']) -> Path: + return Path(config.project_root) / 'test' / module / 'data' + @classmethod def _load_canned_file(cls, bundle: BundleFQID, @@ -111,10 +118,9 @@ def _load_canned_file_version(cls, version: Optional[str], extension: str ) -> Union[MutableJSONs, MutableJSON]: - data_prefix = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') suffix = '' if version is None else '.' + version file_name = f'{uuid}{suffix}.{extension}.json' - with open(os.path.join(data_prefix, file_name), 'r') as infile: + with open(cls._data_path('indexer') / file_name, 'r') as infile: return json.load(infile) diff --git a/test/pfb_test_case.py b/test/pfb_test_case.py index 262d7d2d4..57d064143 100644 --- a/test/pfb_test_case.py +++ b/test/pfb_test_case.py @@ -1,17 +1,13 @@ import json -from pathlib import ( - Path, -) -import sys import fastavro -from azul_test_case import ( - AzulUnitTestCase, +from indexer import ( + CannedFileTestCase, ) -class PFBTestCase(AzulUnitTestCase): +class PFBTestCase(CannedFileTestCase): def _assert_pfb_schema(self, schema): fastavro.parse_schema(schema) @@ -22,9 +18,7 @@ def _assert_pfb_schema(self, schema): def to_json(records): return json.dumps(records, indent=4, sort_keys=True) - cls = type(self) - module = sys.modules[cls.__module__] - results_file = Path(module.__file__).parent / 'data' / 'pfb_manifest.schema.json' + results_file = self._data_path('service') / 'pfb_manifest.schema.json' if results_file.exists(): with open(results_file, 'r') as f: expected_records = json.load(f) From d3bfa68cbbf459706d5efe5f534722a04dd3680b Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 10 Apr 2024 01:07:57 -0700 Subject: [PATCH 05/11] Refactor HCA supported manifest formats to match AnVIL --- src/azul/plugins/metadata/hca/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index cc583e88f..7ccd7ad9a 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -10,6 +10,7 @@ from azul import ( config, + iif, ) from azul.indexer.document import ( Aggregate, @@ -167,7 +168,7 @@ def manifest_formats(self) -> Sequence[ManifestFormat]: ManifestFormat.terra_bdbag, ManifestFormat.terra_pfb, ManifestFormat.curl, - *([ManifestFormat.verbatim_jsonl] if config.enable_replicas else []) + *iif(config.enable_replicas, [ManifestFormat.verbatim_jsonl]) ] @property From 7bd2fc0bba13a9e3456420b2ff5d96da8e65282f Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 10 Apr 2024 00:57:20 -0700 Subject: [PATCH 06/11] Refactor avro_pfb.py --- src/azul/service/avro_pfb.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index cae1c0a2d..7d6e941db 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -15,6 +15,7 @@ from typing import ( ClassVar, MutableSet, + Self, ) from uuid import ( UUID, @@ -174,7 +175,7 @@ def from_json(cls, name: str, object_: MutableJSON, schema: JSON - ) -> 'PFBEntity': + ) -> Self: """ Derive ID from object in a reproducible way so that we can distinguish entities by comparing their IDs. @@ -207,7 +208,7 @@ def _add_missing_fields(cls, name: str, object_: MutableJSON, schema): if isinstance(field_type, list): assert 'null' in field_type, field default_value = None - elif field_type['type'] == 'array': + elif isinstance(field_type, dict) and field_type['type'] == 'array': if isinstance(field_type['items'], dict): assert field_type['items']['type'] in ('record', 'array'), field default_value = [] @@ -246,11 +247,11 @@ class PFBRelation: dst_name: str @classmethod - def to_entity(cls, entity: PFBEntity) -> 'PFBRelation': + def to_entity(cls, entity: PFBEntity) -> Self: return cls(dst_id=entity.id, dst_name=entity.name) -def pfb_metadata_entity(field_types: FieldTypes): +def pfb_metadata_entity(entity_types: Iterable[str]) -> MutableJSON: """ The Metadata entity encodes the possible relationships between tables. @@ -271,7 +272,7 @@ def pfb_metadata_entity(field_types: FieldTypes): 'name': 'files' }], 'properties': [] - } for entity_type in field_types.keys() + } for entity_type in entity_types ], 'misc': {} } From cd113dca9612b24632ec1563c884f786971d932a Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 22 Mar 2024 19:31:10 -0700 Subject: [PATCH 07/11] Extract ABC from verbatim manifest generator --- src/azul/service/manifest_service.py | 29 +++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index f55b4546e..4962932b2 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2012,19 +2012,7 @@ def qualify(qualifier, column_name, index=None): bundle_tsv_writer.writerow(row) -class VerbatimManifestGenerator(FileBasedManifestGenerator): - - @property - def content_type(self) -> str: - return 'application/jsonl' - - @classmethod - def file_name_extension(cls) -> str: - return 'jsonl' - - @classmethod - def format(cls) -> ManifestFormat: - return ManifestFormat.verbatim_jsonl +class VerbatimManifestGenerator(FileBasedManifestGenerator, metaclass=ABCMeta): @property def entity_type(self) -> str: @@ -2105,6 +2093,21 @@ def _join_replicas(self, keys: Iterable[ReplicaKeys]) -> Iterable[Hit]: ])) return request.scan() + +class JSONLVerbatimManifestGenerator(VerbatimManifestGenerator): + + @property + def content_type(self) -> str: + return 'application/jsonl' + + @classmethod + def file_name_extension(cls) -> str: + return 'jsonl' + + @classmethod + def format(cls) -> ManifestFormat: + return ManifestFormat.verbatim_jsonl + def create_file(self) -> tuple[str, Optional[str]]: fd, path = mkstemp(suffix=f'.{self.file_name_extension()}') os.close(fd) From cef22234c0d7e177e66fc98dec207d0b725b3d52 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 10 Apr 2024 00:50:41 -0700 Subject: [PATCH 08/11] Include `replica_type` in JSONL manifest --- src/azul/service/manifest_service.py | 8 ++++++-- test/service/test_manifest.py | 26 ++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 4962932b2..9d432be00 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2067,7 +2067,7 @@ def _all_replicas(self) -> Iterable[JSON]: replica_id = replica.meta.id if replica_id not in emitted_replica_ids: num_new_replicas += 1 - yield replica.contents.to_dict() + yield replica.to_dict() # Note that this will be zero for replicas that use implicit # hubs, in which case there are actually many hubs explicit_hub_count = len(replica.hub_ids) @@ -2113,6 +2113,10 @@ def create_file(self) -> tuple[str, Optional[str]]: os.close(fd) with open(path, 'w') as f: for replica in self._all_replicas(): - json.dump(replica, f) + entry = { + 'contents': replica['contents'], + 'type': replica['replica_type'] + } + json.dump(entry, f) f.write('\n') return path, None diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 18ef4928c..8c5ac5d5f 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1317,15 +1317,18 @@ def test_manifest_content_disposition_header(self): @manifest_test def test_verbatim_jsonl_manifest(self): expected = [ - bundle.metadata_files[d] + { + 'type': replica_type, + 'contents': bundle.metadata_files[key], + } for bundle in map(self._load_canned_bundle, self.bundles()) - for d in [ - 'links.json', - 'cell_suspension_0.json', - 'project_0.json', - 'sequence_file_0.json', - 'sequence_file_1.json', - 'specimen_from_organism_0.json' + for replica_type, key in [ + ('links', 'links.json'), + ('cell_suspension', 'cell_suspension_0.json'), + ('project', 'project_0.json'), + ('file', 'sequence_file_0.json'), + ('file', 'sequence_file_1.json'), + ('sample', 'specimen_from_organism_0.json') ] ] response = self._get_manifest(ManifestFormat.verbatim_jsonl, {}) @@ -2058,8 +2061,11 @@ def test_verbatim_jsonl_manifest(self): response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters={}) self.assertEqual(200, response.status_code) expected = [ - entity + { + 'type': 'anvil_' + entity_ref.entity_type, + 'contents': entity, + } for bundle in self.bundles() - for entity in self._load_canned_bundle(bundle).entities.values() + for entity_ref, entity in self._load_canned_bundle(bundle).entities.items() ] self._assert_jsonl(expected, response) From 85fdaeb1888f81f15aca6518ded3f0ff12f10a5f Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 22 Mar 2024 19:34:28 -0700 Subject: [PATCH 09/11] [a] Create PFB-based verbatim manifest format for AnVIL (#6040) --- lambdas/service/app.py | 6 +- lambdas/service/openapi.json | 6 +- src/azul/plugins/__init__.py | 1 + src/azul/plugins/metadata/anvil/__init__.py | 5 +- src/azul/service/avro_pfb.py | 175 +++++- src/azul/service/manifest_service.py | 30 + test/integration_test.py | 3 +- test/service/data/verbatim/pfb_entities.json | 318 ++++++++++ test/service/data/verbatim/pfb_schema.json | 609 +++++++++++++++++++ test/service/test_manifest.py | 24 + 10 files changed, 1165 insertions(+), 12 deletions(-) create mode 100644 test/service/data/verbatim/pfb_entities.json create mode 100644 test/service/data/verbatim/pfb_schema.json diff --git a/lambdas/service/app.py b/lambdas/service/app.py index 142454955..d96cb744c 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -228,7 +228,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '7.2' + 'version': '7.3' }, 'tags': [ { @@ -1414,6 +1414,10 @@ def manifest_route(*, fetch: bool, initiate: bool): manifest in [JSONL][5] format. Each line contains an unaltered metadata entity from the underlying repository. + - `{ManifestFormat.verbatim_pfb.value}` for a verbatim + manifest in the [PFB format][3]. This format is mainly + used for exporting data to Terra. + [1]: https://bd2k.ini.usc.edu/tools/bdbag/ [2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954 diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index be7e9606b..6edb98e96 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -3,7 +3,7 @@ "info": { "title": "azul_service", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n", - "version": "7.2" + "version": "7.3" }, "tags": [ { @@ -9482,7 +9482,7 @@ "verbatim.jsonl" ] }, - "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" + "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n- `verbatim.pfb` for a verbatim\n manifest in the [PFB format][3]. This format is mainly\n used for exporting data to Terra.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" } ], "responses": { @@ -10890,7 +10890,7 @@ "verbatim.jsonl" ] }, - "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" + "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n- `verbatim.pfb` for a verbatim\n manifest in the [PFB format][3]. This format is mainly\n used for exporting data to Terra.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" } ], "responses": { diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index c703aa127..444c85264 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -142,6 +142,7 @@ class ManifestFormat(Enum): terra_pfb = 'terra.pfb' curl = 'curl' verbatim_jsonl = 'verbatim.jsonl' + verbatim_pfb = 'verbatim.pfb' T = TypeVar('T', bound='Plugin') diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 3f96a1987..3d95afed6 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -75,7 +75,10 @@ def manifest_formats(self) -> Sequence[ManifestFormat]: return [ ManifestFormat.compact, ManifestFormat.terra_pfb, - *iif(config.enable_replicas, [ManifestFormat.verbatim_jsonl]) + *iif(config.enable_replicas, [ + ManifestFormat.verbatim_jsonl, + ManifestFormat.verbatim_pfb + ]) ] def transformer_types(self) -> Iterable[Type[BaseTransformer]]: diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index 7d6e941db..1e64ef8c8 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -1,3 +1,4 @@ +import bisect from collections import ( defaultdict, ) @@ -16,6 +17,7 @@ ClassVar, MutableSet, Self, + Sequence, ) from uuid import ( UUID, @@ -58,6 +60,8 @@ value_and_unit, ) from azul.types import ( + AnyJSON, + AnyMutableJSON, JSON, MutableJSON, ) @@ -189,6 +193,17 @@ def from_json(cls, id_ = _reversible_join('.', map(str, (name, id_, len(ids)))) return cls(id=id_, name=name, object=object_) + @classmethod + def for_replica(cls, replica: MutableJSON, schema: JSON) -> Self: + name, object_ = replica['replica_type'], replica['contents'] + cls._add_missing_fields(name, object_, schema) + # Note that it is possible for two distinct replicas to have the same + # entity ID. For example, replicas representing the DUOS registration + # of AnVIL datasets have the same ID as the replica for the dataset + # itself. Terra appears to combine PFB entities with the same ID + # into a single row. + return cls(id=replica['entity_id'], name=name, object=object_) + @classmethod def _add_missing_fields(cls, name: str, object_: MutableJSON, schema): """ @@ -215,6 +230,8 @@ def _add_missing_fields(cls, name: str, object_: MutableJSON, schema): else: assert 'null' in field_type['items'], field default_value = [None] + elif field_type == 'null': + default_value = None else: assert False, field object_[field_name] = default_value @@ -251,7 +268,9 @@ def to_entity(cls, entity: PFBEntity) -> Self: return cls(dst_id=entity.id, dst_name=entity.name) -def pfb_metadata_entity(entity_types: Iterable[str]) -> MutableJSON: +def pfb_metadata_entity(entity_types: Iterable[str], + links: bool = True + ) -> MutableJSON: """ The Metadata entity encodes the possible relationships between tables. @@ -266,7 +285,7 @@ def pfb_metadata_entity(entity_types: Iterable[str]) -> MutableJSON: 'name': entity_type, 'ontology_reference': '', 'values': {}, - 'links': [] if entity_type == 'files' else [{ + 'links': [] if not links or entity_type == 'files' else [{ 'multiplicity': 'MANY_TO_MANY', 'dst': 'files', 'name': 'files' @@ -294,6 +313,20 @@ def pfb_schema_from_field_types(field_types: FieldTypes) -> JSON: return _avro_pfb_schema(entity_schemas) +def pfb_schema_from_replicas(replicas: Iterable[JSON] + ) -> tuple[Sequence[str], JSON]: + schemas_by_replica_type = {} + for replica in replicas: + replica_type, replica_contents = replica['replica_type'], replica['contents'] + _update_replica_schema(schema=schemas_by_replica_type, + path=(replica_type,), + key=replica_type, + value=replica_contents) + schemas_by_replica_type = sorted(schemas_by_replica_type.items()) + keys, values = zip(*schemas_by_replica_type) + return keys, _avro_pfb_schema(values) + + def _avro_pfb_schema(azul_avro_schema: Iterable[JSON]) -> JSON: """ The boilerplate Avro schema that comprises a PFB's schema is returned in @@ -474,6 +507,13 @@ def _inject_reference_handover_values(entity: MutableJSON, doc: JSON): # that all of the primitive field types types are nullable # https://github.com/DataBiosphere/azul/issues/4094 +_json_to_pfb_types = { + bool: 'boolean', + float: 'double', + int: 'long', + str: 'string' +} + _nullable_to_pfb_types = { null_bool: ['null', 'boolean'], null_float: ['null', 'double'], @@ -561,10 +601,7 @@ def _entity_schema_recursive(field_types: FieldTypes, 'type': 'array', 'items': { 'type': 'array', - 'items': { - int: 'long', - float: 'double' - }[field_type.ends_type.native_type] + 'items': _json_to_pfb_types[field_type.ends_type.native_type] } } } @@ -603,3 +640,129 @@ def _entity_schema_recursive(field_types: FieldTypes, pass else: assert False, field_type + + +def _update_replica_schema(*, + schema: MutableJSON, + path: tuple[str, ...], + key: str, + value: AnyMutableJSON): + """ + Update in place a (part of an) existing PFB schema to ensure that it + accommodates a given (part of a) JSON document. The schema will only ever + expand, so after updating it will describe a superset of the documents that + it described pre-update. Starting from an empty schema, repeatedly calling + this function allows us to discover a general schema for a series of + documents of unknown shape. + + :param schema: a part of a PFB schema. It may be empty. + + :param path: the series of field names that locate `schema` within its + top-level parent schema. The first entry should be the name of + the underlying PFB entity's record type. + + :param key: the key within `schema` whose associated value will be updated + to describe `value`. This is the only part of `schema` that may + be mutated. + + :param value: a part of a PFB entity. + """ + try: + old_type = schema[key] + except KeyError: + schema[key] = _new_replica_schema(path=path, value=value) + else: + if old_type == []: + schema[key] = _new_replica_schema(path=path, value=value) + elif value is None: + if old_type == 'null' or isinstance(old_type, list): + pass + else: + schema[key] = ['null', old_type] + elif old_type == 'null': + schema[key] = [ + 'null', + _new_replica_schema(path=path, value=value) + ] + elif isinstance(value, list): + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type['type'] == 'array', old_type + for v in value: + _update_replica_schema(schema=old_type, + path=path, + key='items', + value=v) + elif isinstance(value, dict): + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type['type'] == 'record', old_type + old_fields = {field['name']: field for field in old_type['fields']} + for k in value.keys() | old_fields.keys(): + try: + field = old_fields[k] + except KeyError: + field = { + 'name': k, + 'namespace': '.'.join(path), + 'type': 'null' + } + bisect.insort(old_type['fields'], field, key=itemgetter('name')) + new_value = value[k] + else: + new_value = value.get(k) + _update_replica_schema(schema=field, + path=(*path, k), + key='type', + value=new_value) + else: + new_type = _json_to_pfb_types[type(value)] + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type == new_type, (old_type, value) + + +def _new_replica_schema(*, + path: tuple[str, ...], + value: AnyJSON, + ) -> AnyMutableJSON: + """ + Create a part of a PFB schema to describe a part of a PFB entity represented + as a JSON document. + + :param path: the location of `value` within the root document as a series + of keys. The first key should be the name of the underlying PFB + entity's type within the schema. + + :param value: a part of a PFB entity. + + :return: JSON describing the contents of `value` as a part of PFB schema. + """ + if value is None: + result = 'null' + elif isinstance(value, list): + # Empty list indicates "no type" (emtpy union). This will be replaced + # with an actual type unless we never encounter a non-empty array. + result = {'type': 'array', 'items': []} + for v in value: + _update_replica_schema(schema=result, + path=path, + key='items', + value=v) + elif isinstance(value, dict): + name = '.'.join(path) + result = { + 'name': name, + 'type': 'record', + 'fields': [ + { + 'name': k, + 'namespace': name, + 'type': _new_replica_schema(path=(*path, k), value=v) + } + for k, v in sorted(value.items()) + ] + } + else: + result = _json_to_pfb_types[type(value)] + return result diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 9d432be00..3be00012c 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2120,3 +2120,33 @@ def create_file(self) -> tuple[str, Optional[str]]: json.dump(entry, f) f.write('\n') return path, None + + +class PFBVerbatimManifestGenerator(VerbatimManifestGenerator): + + @property + def content_type(self) -> str: + return 'application/octet-stream' + + @classmethod + def file_name_extension(cls): + return 'avro' + + @classmethod + def format(cls) -> ManifestFormat: + return ManifestFormat.verbatim_pfb + + def create_file(self) -> tuple[str, Optional[str]]: + replicas = list(self._all_replicas()) + replica_types, pfb_schema = avro_pfb.pfb_schema_from_replicas(replicas) + pfb_metadata_entity = avro_pfb.pfb_metadata_entity(replica_types, links=False) + + def pfb_entities(): + yield pfb_metadata_entity + for replica in replicas: + yield avro_pfb.PFBEntity.for_replica(dict(replica), pfb_schema).to_json(()) + + fd, path = mkstemp(suffix=f'.{self.file_name_extension()}') + os.close(fd) + avro_pfb.write_pfb_entities(pfb_entities(), pfb_schema, path) + return path, None diff --git a/test/integration_test.py b/test/integration_test.py index 1a0f039aa..e9da70ae7 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -585,7 +585,8 @@ def _test_manifest(self, catalog: CatalogName): ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest, ManifestFormat.terra_pfb: self._check_terra_pfb_manifest, ManifestFormat.curl: self._check_curl_manifest, - ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest + ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest, + ManifestFormat.verbatim_pfb: self._check_terra_pfb_manifest } for format in [None, *supported_formats]: # IT catalogs with just one public source are always indexed diff --git a/test/service/data/verbatim/pfb_entities.json b/test/service/data/verbatim/pfb_entities.json new file mode 100644 index 000000000..69b48ca44 --- /dev/null +++ b/test/service/data/verbatim/pfb_entities.json @@ -0,0 +1,318 @@ +[ + { + "id": null, + "name": "Metadata", + "object": { + "misc": {}, + "nodes": [ + { + "links": [], + "name": "anvil_biosample", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_dataset", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_diagnosis", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_donor", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_file", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_sequencingactivity", + "ontology_reference": "", + "properties": [], + "values": {} + } + ] + }, + "relations": [] + }, + { + "id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "name": "anvil_dataset", + "object": { + "consent_group": null, + "data_modality": null, + "data_use_permission": null, + "datarepo_row_id": null, + "dataset_id": null, + "description": "Study description from DUOS", + "owner": null, + "principal_investigator": null, + "registered_identifier": null, + "source_datarepo_row_ids": null, + "title": null, + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "816e364e-1193-4e5b-a91a-14e4b009157c", + "name": "anvil_sequencingactivity", + "object": { + "activity_type": "Sequencing", + "assay_type": [], + "data_modality": [], + "datarepo_row_id": "816e364e-1193-4e5b-a91a-14e4b009157c", + "sequencingactivity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", + "source_datarepo_row_ids": [ + "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "677dd55c-3fa3-4b07-8c98-985d94d7577e", + "name": "anvil_dataset", + "object": { + "consent_group": [], + "data_modality": [], + "data_use_permission": [], + "datarepo_row_id": "677dd55c-3fa3-4b07-8c98-985d94d7577e", + "dataset_id": "385290c3-dff5-fb6d-2501-fa0ba3ad1c35", + "description": null, + "owner": [], + "principal_investigator": [], + "registered_identifier": [], + "source_datarepo_row_ids": [ + "workspace_attributes:95684a9c-e0a1-4c05-9f1f-de628a38420c" + ], + "title": "ANVIL_1000G_2019_Dev", + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "826dea02-e274-4ffe-aabc-eb3db63ad068", + "name": "anvil_biosample", + "object": { + "anatomical_site": null, + "apriori_cell_type": [], + "biosample_id": "f9d40cf6-37b8-22f3-ce35-0dc614d2452b", + "biosample_type": null, + "datarepo_row_id": "826dea02-e274-4ffe-aabc-eb3db63ad068", + "disease": null, + "donor_age_at_collection_lower_bound": null, + "donor_age_at_collection_unit": null, + "donor_age_at_collection_upper_bound": null, + "source_datarepo_row_ids": [ + "sample:98048c3b-2525-4090-94fd-477de31f2608" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "6b0f6c0f-5d80-4242-accb-840921351cd5", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5", + "drs_uri": "drs://mock_tdr.lan/v1_790795c4-49b1-4ac8-a060-207b92ea08c5_1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_format": ".txt", + "file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_md5sum": "S/GBrRjzZAQYqh3rdiPYzA==", + "file_name": "CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt", + "file_ref": "drs://mock_tdr.lan/v1_790795c4-49b1-4ac8-a060-207b92ea08c5_1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_size": 15079345, + "is_supplementary": true, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:04ff3af2-0543-4ea6-830a-d31b957fa2ee" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", + "name": "anvil_diagnosis", + "object": { + "datarepo_row_id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", + "diagnosis_age_lower_bound": null, + "diagnosis_age_unit": null, + "diagnosis_age_upper_bound": null, + "diagnosis_id": "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6", + "disease": [ + "redacted-A61iJlLx" + ], + "onset_age_lower_bound": null, + "onset_age_unit": null, + "onset_age_upper_bound": null, + "phenopacket": [], + "phenotype": [ + "redacted-acSYHZUr" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", + "name": "anvil_sequencingactivity", + "object": { + "activity_type": "Sequencing", + "assay_type": [], + "data_modality": [], + "datarepo_row_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", + "sequencingactivity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", + "source_datarepo_row_ids": [ + "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", + "name": "anvil_diagnosis", + "object": { + "datarepo_row_id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", + "diagnosis_age_lower_bound": null, + "diagnosis_age_unit": null, + "diagnosis_age_upper_bound": null, + "diagnosis_id": "25ff8d32-18c9-fc3e-020a-5de20d35d906", + "disease": [ + "redacted-g50ublm/" + ], + "onset_age_lower_bound": null, + "onset_age_unit": null, + "onset_age_upper_bound": null, + "phenopacket": [], + "phenotype": [ + "redacted-acSYHZUr" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "name": "anvil_dataset", + "object": { + "consent_group": [ + "DS-BDIS" + ], + "data_modality": [], + "data_use_permission": [ + "DS-BDIS" + ], + "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", + "description": null, + "owner": [ + "Debbie Nickerson" + ], + "principal_investigator": [], + "registered_identifier": [ + "phs000693" + ], + "source_datarepo_row_ids": [ + "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" + ], + "title": "ANVIL_CMG_UWASH_DS_BDIS", + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "drs_uri": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_1e269f04-4347-4188-b060-1dcc69e71d67", + "file_format": ".vcf.gz", + "file_id": "1e269f04-4347-4188-b060-1dcc69e71d67", + "file_md5sum": "vuxgbuCqKZ/fkT9CWTFmIg==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "file_ref": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_1e269f04-4347-4188-b060-1dcc69e71d67", + "file_size": 213021639, + "is_supplementary": false, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", + "name": "anvil_donor", + "object": { + "datarepo_row_id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", + "donor_id": "1e2bd7e5-f45e-a391-daea-7c060be76acd", + "genetic_ancestry": [], + "organism_type": "redacted-ACw+6ecI", + "phenotypic_sex": "redacted-JfQ0b3xG", + "reported_ethnicity": [ + "redacted-NSkwDycK" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "3b17377b-16b1-431c-9967-e5d01fc5923f", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f", + "drs_uri": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_format": ".bam", + "file_id": "8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_md5sum": "fNn9e1SovzgOROk3BvH6LQ==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "file_ref": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_size": 3306845592, + "is_supplementary": false, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + } +] \ No newline at end of file diff --git a/test/service/data/verbatim/pfb_schema.json b/test/service/data/verbatim/pfb_schema.json new file mode 100644 index 000000000..a858db5f4 --- /dev/null +++ b/test/service/data/verbatim/pfb_schema.json @@ -0,0 +1,609 @@ +{ + "fields": [ + { + "default": null, + "name": "id", + "type": [ + "null", + "string" + ] + }, + { + "name": "name", + "type": "string" + }, + { + "name": "object", + "type": [ + { + "fields": [ + { + "name": "nodes", + "type": { + "items": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "ontology_reference", + "type": "string" + }, + { + "name": "values", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "links", + "type": { + "items": { + "fields": [ + { + "name": "multiplicity", + "type": { + "name": "Multiplicity", + "symbols": [ + "ONE_TO_ONE", + "ONE_TO_MANY", + "MANY_TO_ONE", + "MANY_TO_MANY" + ], + "type": "enum" + } + }, + { + "name": "dst", + "type": "string" + }, + { + "name": "name", + "type": "string" + } + ], + "name": "Link", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "properties", + "type": { + "items": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "ontology_reference", + "type": "string" + }, + { + "name": "values", + "type": { + "type": "map", + "values": "string" + } + } + ], + "name": "Property", + "type": "record" + }, + "type": "array" + } + } + ], + "name": "Node", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "misc", + "type": { + "type": "map", + "values": "string" + } + } + ], + "name": "Metadata", + "type": "record" + }, + { + "fields": [ + { + "name": "anatomical_site", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "apriori_cell_type", + "namespace": "anvil_biosample", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "biosample_id", + "namespace": "anvil_biosample", + "type": "string" + }, + { + "name": "biosample_type", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_biosample", + "type": "string" + }, + { + "name": "disease", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_lower_bound", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_unit", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_upper_bound", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_biosample", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_biosample", + "type": "string" + } + ], + "name": "anvil_biosample", + "type": "record" + }, + { + "fields": [ + { + "name": "consent_group", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "data_modality", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": [], + "type": "array" + } + ] + }, + { + "name": "data_use_permission", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "dataset_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "description", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "owner", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "principal_investigator", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": [], + "type": "array" + } + ] + }, + { + "name": "registered_identifier", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "title", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "version", + "namespace": "anvil_dataset", + "type": "string" + } + ], + "name": "anvil_dataset", + "type": "record" + }, + { + "fields": [ + { + "name": "datarepo_row_id", + "namespace": "anvil_diagnosis", + "type": "string" + }, + { + "name": "diagnosis_age_lower_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_age_unit", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_age_upper_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_id", + "namespace": "anvil_diagnosis", + "type": "string" + }, + { + "name": "disease", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "onset_age_lower_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "onset_age_unit", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "onset_age_upper_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "phenopacket", + "namespace": "anvil_diagnosis", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "phenotype", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_diagnosis", + "type": "string" + } + ], + "name": "anvil_diagnosis", + "type": "record" + }, + { + "fields": [ + { + "name": "datarepo_row_id", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "donor_id", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "genetic_ancestry", + "namespace": "anvil_donor", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "organism_type", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "phenotypic_sex", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "reported_ethnicity", + "namespace": "anvil_donor", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_donor", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_donor", + "type": "string" + } + ], + "name": "anvil_donor", + "type": "record" + }, + { + "fields": [ + { + "name": "crc32", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "data_modality", + "namespace": "anvil_file", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "drs_uri", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_format", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_id", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_md5sum", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_name", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_ref", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_size", + "namespace": "anvil_file", + "type": "long" + }, + { + "name": "is_supplementary", + "namespace": "anvil_file", + "type": "boolean" + }, + { + "name": "reference_assembly", + "namespace": "anvil_file", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "sha256", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_file", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_file", + "type": "string" + } + ], + "name": "anvil_file", + "type": "record" + }, + { + "fields": [ + { + "name": "activity_type", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "assay_type", + "namespace": "anvil_sequencingactivity", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "data_modality", + "namespace": "anvil_sequencingactivity", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "sequencingactivity_id", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_sequencingactivity", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_sequencingactivity", + "type": "string" + } + ], + "name": "anvil_sequencingactivity", + "type": "record" + } + ] + }, + { + "default": [], + "name": "relations", + "type": { + "items": { + "fields": [ + { + "name": "dst_id", + "type": "string" + }, + { + "name": "dst_name", + "type": "string" + } + ], + "name": "Relation", + "type": "record" + }, + "type": "array" + } + } + ], + "name": "Entity", + "type": "record" +} \ No newline at end of file diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 8c5ac5d5f..655c85eed 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -20,6 +20,9 @@ BytesIO, ) import json +from operator import ( + itemgetter, +) import os from pathlib import ( Path, @@ -67,6 +70,8 @@ ) from azul.collections import ( adict, + compose_keys, + none_safe_key, ) from azul.indexer import ( SourcedBundleFQID, @@ -2069,3 +2074,22 @@ def test_verbatim_jsonl_manifest(self): for entity_ref, entity in self._load_canned_bundle(bundle).entities.items() ] self._assert_jsonl(expected, response) + + @unittest.skipIf(not config.enable_replicas, + 'The format is replica-based') + @manifest_test + def test_verbatim_pfb_manifest(self): + response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={}) + self.assertEqual(200, response.status_code) + manifest = fastavro.reader(BytesIO(response.content)) + schema = manifest.writer_schema + entities = list(manifest) + with open(self._data_path('service') / 'verbatim/pfb_schema.json') as f: + expected_schema = json.load(f) + with open(self._data_path('service') / 'verbatim/pfb_entities.json') as f: + expected_entities = json.load(f) + sort_key = compose_keys(none_safe_key(), itemgetter('id')) + entities.sort(key=sort_key) + expected_entities.sort(key=sort_key) + self.assertEqual(expected_schema, schema) + self.assertEqual(expected_entities, entities) From 827d610e69793240963f4c071ec868709b9aad22 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 12 Apr 2024 17:24:39 -0700 Subject: [PATCH 10/11] Add FIXME (#6139) --- src/azul/service/avro_pfb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index 1e64ef8c8..c421f09aa 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -202,6 +202,8 @@ def for_replica(cls, replica: MutableJSON, schema: JSON) -> Self: # of AnVIL datasets have the same ID as the replica for the dataset # itself. Terra appears to combine PFB entities with the same ID # into a single row. + # FIXME: Improve handling of DUOS replicas + # https://github.com/DataBiosphere/azul/issues/6139 return cls(id=replica['entity_id'], name=name, object=object_) @classmethod From 8cb87e9ded6cc08afff69c5f65e96e2715b5b268 Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Wed, 17 Apr 2024 19:30:50 -0700 Subject: [PATCH 11/11] Refactor and document PFB integration test --- test/integration_test.py | 68 ++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index e9da70ae7..cb2834f0b 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -21,6 +21,7 @@ TextIOWrapper, ) from itertools import ( + count, starmap, ) import json @@ -954,17 +955,66 @@ def _check_terra_bdbag_manifest(self, catalog: CatalogName, response: bytes): self.assertEqual(size, int(response.headers['Content-Length'])) def _check_terra_pfb_manifest(self, _catalog: CatalogName, response: bytes): + # A PFB is an Avro Object Container File, i.e., a stream of Avro objects + # preceded by a schema describing these objects. The internals of the + # format are slightly more complicated and are described in + # + # https://avro.apache.org/docs/1.11.1/specification/#object-container-files + # reader = fastavro.reader(BytesIO(response)) + # The schema is also an Avro object, specifically a Avro record which + # FastAVRO exposes to us as a JSON object, i.e., a `dict` with string + # keys + record_schema = reader.writer_schema + # Each object in a PFB is also of type 'record' + self.assertEqual('record', record_schema['type']) + # PFB calls the records *entities*. Unfortunately, the PFB standard is + # afflicted with confusing terminology, so bear with us. + self.assertEqual('Entity', record_schema['name']) + # Each entity record has four fields: `id`, `name`, `object` and + # `relations`. The `object` field holds the actual entity. The `name` + # field, is a string denoting the type of entity. Entities records with + # the same value in the `name` field are expected to contain entities of + # the same shape. Here we extract the declaration of the `object` field + # from the schema: + object_field = one(f for f in record_schema['fields'] if f['name'] == 'object') + # The different shapes, i.e., entity types are defined as members of a + # union type, which manifests in Avro simply as an array of schemas. + # Here we extract each union member and index it into a dictionary for + # easy access by name. + entity_types = {e['name']: e for e in object_field['type']} + self.assertEqual(len(entity_types), len(object_field['type'])) + # The `id` field is a string uniquely identifying an entity among all + # entities of the same shape, i.e., with the same value in the `name` + # field of the containing record. The `relations` field holds references + # to other entities, as an array of nested Avro records, each record + # containing the `name` and `id` of the referenced entity. + num_records = count() for record in reader: - fastavro.validate(record, reader.writer_schema) - object_schema = one(f for f in reader.writer_schema['fields'] - if f['name'] == 'object') - entity_schema = one(e for e in object_schema['type'] - if e['name'] == record['name']) - fields = entity_schema['fields'] - rows_present = set(record['object'].keys()) - rows_expected = set(f['name'] for f in fields) - self.assertEqual(rows_present, rows_expected) + # Every record must follow the schema. Since each record's `object` + # field contains an entity, the schema check therefore extends to + # the various entity types. + fastavro.validate(record, record_schema) + if 0 == next(num_records): + # PFB requires a special `Metadata` entity to occur first. It is + # used to declare the relations between entity types, thereby + # expressing additional constraints on the `relations` field. + # + # FIXME: We don't currently declare relations + # https://github.com/DataBiosphere/azul/issues/6066 + # + # For now, we just check the `name` and the absence of an `id`. + self.assertEqual('Metadata', record['name']) + self.assertIsNone(record['id']) + # The following is redundant given the schema validation above but + # we'll leave it in for illustration. + fields = entity_types[record['name']]['fields'] + fields_present = set(record['object'].keys()) + fields_expected = set(f['name'] for f in fields) + self.assertEqual(fields_present, fields_expected) + # We expect to observe the special `Metadata` entity record and at least + # one additional entity record + self.assertGreater(next(num_records), 1) def _read_csv_manifest(self, file: IO[bytes]) -> csv.DictReader: text = TextIOWrapper(file)