diff --git a/lambdas/service/app.py b/lambdas/service/app.py index 142454955c..d96cb744c0 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -228,7 +228,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '7.2' + 'version': '7.3' }, 'tags': [ { @@ -1414,6 +1414,10 @@ def manifest_route(*, fetch: bool, initiate: bool): manifest in [JSONL][5] format. Each line contains an unaltered metadata entity from the underlying repository. + - `{ManifestFormat.verbatim_pfb.value}` for a verbatim + manifest in the [PFB format][3]. This format is mainly + used for exporting data to Terra. + [1]: https://bd2k.ini.usc.edu/tools/bdbag/ [2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954 diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index be7e9606b4..6edb98e96d 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -3,7 +3,7 @@ "info": { "title": "azul_service", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n", - "version": "7.2" + "version": "7.3" }, "tags": [ { @@ -9482,7 +9482,7 @@ "verbatim.jsonl" ] }, - "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" + "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n- `verbatim.pfb` for a verbatim\n manifest in the [PFB format][3]. This format is mainly\n used for exporting data to Terra.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" } ], "responses": { @@ -10890,7 +10890,7 @@ "verbatim.jsonl" ] }, - "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" + "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n- `verbatim.pfb` for a verbatim\n manifest in the [PFB format][3]. This format is mainly\n used for exporting data to Terra.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n" } ], "responses": { diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index c703aa1273..444c852641 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -142,6 +142,7 @@ class ManifestFormat(Enum): terra_pfb = 'terra.pfb' curl = 'curl' verbatim_jsonl = 'verbatim.jsonl' + verbatim_pfb = 'verbatim.pfb' T = TypeVar('T', bound='Plugin') diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 3f96a19873..3d95afed6f 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -75,7 +75,10 @@ def manifest_formats(self) -> Sequence[ManifestFormat]: return [ ManifestFormat.compact, ManifestFormat.terra_pfb, - *iif(config.enable_replicas, [ManifestFormat.verbatim_jsonl]) + *iif(config.enable_replicas, [ + ManifestFormat.verbatim_jsonl, + ManifestFormat.verbatim_pfb + ]) ] def transformer_types(self) -> Iterable[Type[BaseTransformer]]: diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index 7d6e941db8..3f689ec9a7 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -1,3 +1,4 @@ +import bisect from collections import ( defaultdict, ) @@ -16,6 +17,7 @@ ClassVar, MutableSet, Self, + Sequence, ) from uuid import ( UUID, @@ -58,6 +60,7 @@ value_and_unit, ) from azul.types import ( + AnyMutableJSON, JSON, MutableJSON, ) @@ -189,6 +192,17 @@ def from_json(cls, id_ = _reversible_join('.', map(str, (name, id_, len(ids)))) return cls(id=id_, name=name, object=object_) + @classmethod + def for_replica(cls, replica: MutableJSON, schema: JSON) -> Self: + name, object_ = replica['replica_type'], replica['contents'] + cls._add_missing_fields(name, object_, schema) + # Note that it is possible for two distinct replicas to have the same + # entity ID. For example, replicas representing the DUOS registration + # of AnVIL datasets have the same ID as the replica for the dataset + # itself. Terra appears to combine PFB entities with the same ID + # into a single row. + return cls(id=replica['entity_id'], name=name, object=object_) + @classmethod def _add_missing_fields(cls, name: str, object_: MutableJSON, schema): """ @@ -215,6 +229,8 @@ def _add_missing_fields(cls, name: str, object_: MutableJSON, schema): else: assert 'null' in field_type['items'], field default_value = [None] + elif field_type == 'null': + default_value = None else: assert False, field object_[field_name] = default_value @@ -251,7 +267,7 @@ def to_entity(cls, entity: PFBEntity) -> Self: return cls(dst_id=entity.id, dst_name=entity.name) -def pfb_metadata_entity(entity_types: Iterable[str]) -> MutableJSON: +def pfb_metadata_entity(entity_types: Iterable[str], links: bool = True) -> MutableJSON: """ The Metadata entity encodes the possible relationships between tables. @@ -266,7 +282,7 @@ def pfb_metadata_entity(entity_types: Iterable[str]) -> MutableJSON: 'name': entity_type, 'ontology_reference': '', 'values': {}, - 'links': [] if entity_type == 'files' else [{ + 'links': [] if not links or entity_type == 'files' else [{ 'multiplicity': 'MANY_TO_MANY', 'dst': 'files', 'name': 'files' @@ -294,6 +310,19 @@ def pfb_schema_from_field_types(field_types: FieldTypes) -> JSON: return _avro_pfb_schema(entity_schemas) +def pfb_schema_from_replicas(replicas: Iterable[JSON]) -> tuple[Sequence[str], JSON]: + schemas_by_replica_type = {} + for replica in replicas: + replica_type, replica_contents = replica['replica_type'], replica['contents'] + _update_replica_schema(schema=schemas_by_replica_type, + path=(replica_type,), + key=replica_type, + value=replica_contents) + schemas_by_replica_type = sorted(schemas_by_replica_type.items()) + keys, values = zip(*schemas_by_replica_type) + return keys, _avro_pfb_schema(values) + + def _avro_pfb_schema(azul_avro_schema: Iterable[JSON]) -> JSON: """ The boilerplate Avro schema that comprises a PFB's schema is returned in @@ -474,6 +503,13 @@ def _inject_reference_handover_values(entity: MutableJSON, doc: JSON): # that all of the primitive field types types are nullable # https://github.com/DataBiosphere/azul/issues/4094 +_json_to_pfb_types = { + bool: 'boolean', + float: 'double', + int: 'long', + str: 'string' +} + _nullable_to_pfb_types = { null_bool: ['null', 'boolean'], null_float: ['null', 'double'], @@ -561,10 +597,7 @@ def _entity_schema_recursive(field_types: FieldTypes, 'type': 'array', 'items': { 'type': 'array', - 'items': { - int: 'long', - float: 'double' - }[field_type.ends_type.native_type] + 'items': _json_to_pfb_types[field_type.ends_type.native_type] } } } @@ -603,3 +636,97 @@ def _entity_schema_recursive(field_types: FieldTypes, pass else: assert False, field_type + + +def _update_replica_schema(*, + schema: MutableJSON, + path: Sequence[str], + key: str, + value: AnyMutableJSON): + try: + old_type = schema[key] + except KeyError: + schema[key] = _new_replica_schema(path=path, value=value) + else: + if old_type == []: + schema[key] = _new_replica_schema(path=path, value=value) + elif value is None: + if old_type == 'null' or isinstance(old_type, list): + pass + else: + schema[key] = ['null', old_type] + elif old_type == 'null': + schema[key] = [ + 'null', + _new_replica_schema(path=path, value=value) + ] + elif isinstance(value, list): + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type['type'] == 'array', old_type + for v in value: + _update_replica_schema(schema=old_type, + path=path, + key='items', + value=v) + elif isinstance(value, dict): + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type['type'] == 'record', old_type + old_fields = {field['name']: field for field in old_type['fields']} + for k in value.keys() | old_fields.keys(): + try: + field = old_fields[k] + except KeyError: + field = { + 'name': k, + 'namespace': '.'.join(path), + 'type': 'null' + } + bisect.insort(old_type['fields'], field, key=itemgetter('name')) + new_value = value[k] + else: + new_value = value.get(k) + _update_replica_schema(schema=field, + path=(*path, k), + key='type', + value=new_value) + else: + new_type = _json_to_pfb_types[type(value)] + if isinstance(old_type, list): + old_type = old_type[1] + assert old_type == new_type, (old_type, value) + + +def _new_replica_schema(*, + path: Sequence[str], + value: AnyMutableJSON, + ) -> AnyMutableJSON: + if value is None: + result = 'null' + elif isinstance(value, list): + # Empty list indicates "no type" (emtpy union). This will be replaced + # with an actual type unless we never encounter a non-empty array. + result = {'type': 'array', 'items': []} + for v in value: + _update_replica_schema(schema=result, + path=path, + key='items', + value=v) + elif isinstance(value, dict): + name = '.'.join(path) + result = { + 'name': name, + 'type': 'record', + 'fields': [ + { + 'name': k, + 'namespace': name, + 'type': _new_replica_schema(path=(*path, k), value=v) + } + for k, v in sorted(value.items()) + ] + } + else: + result = _json_to_pfb_types[type(value)] + return result diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 9d432be00f..3be00012c0 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2120,3 +2120,33 @@ def create_file(self) -> tuple[str, Optional[str]]: json.dump(entry, f) f.write('\n') return path, None + + +class PFBVerbatimManifestGenerator(VerbatimManifestGenerator): + + @property + def content_type(self) -> str: + return 'application/octet-stream' + + @classmethod + def file_name_extension(cls): + return 'avro' + + @classmethod + def format(cls) -> ManifestFormat: + return ManifestFormat.verbatim_pfb + + def create_file(self) -> tuple[str, Optional[str]]: + replicas = list(self._all_replicas()) + replica_types, pfb_schema = avro_pfb.pfb_schema_from_replicas(replicas) + pfb_metadata_entity = avro_pfb.pfb_metadata_entity(replica_types, links=False) + + def pfb_entities(): + yield pfb_metadata_entity + for replica in replicas: + yield avro_pfb.PFBEntity.for_replica(dict(replica), pfb_schema).to_json(()) + + fd, path = mkstemp(suffix=f'.{self.file_name_extension()}') + os.close(fd) + avro_pfb.write_pfb_entities(pfb_entities(), pfb_schema, path) + return path, None diff --git a/test/integration_test.py b/test/integration_test.py index 1a0f039aa2..e9da70ae75 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -585,7 +585,8 @@ def _test_manifest(self, catalog: CatalogName): ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest, ManifestFormat.terra_pfb: self._check_terra_pfb_manifest, ManifestFormat.curl: self._check_curl_manifest, - ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest + ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest, + ManifestFormat.verbatim_pfb: self._check_terra_pfb_manifest } for format in [None, *supported_formats]: # IT catalogs with just one public source are always indexed diff --git a/test/service/data/verbatim/pfb_entities.json b/test/service/data/verbatim/pfb_entities.json new file mode 100644 index 0000000000..69b48ca446 --- /dev/null +++ b/test/service/data/verbatim/pfb_entities.json @@ -0,0 +1,318 @@ +[ + { + "id": null, + "name": "Metadata", + "object": { + "misc": {}, + "nodes": [ + { + "links": [], + "name": "anvil_biosample", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_dataset", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_diagnosis", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_donor", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_file", + "ontology_reference": "", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "anvil_sequencingactivity", + "ontology_reference": "", + "properties": [], + "values": {} + } + ] + }, + "relations": [] + }, + { + "id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "name": "anvil_dataset", + "object": { + "consent_group": null, + "data_modality": null, + "data_use_permission": null, + "datarepo_row_id": null, + "dataset_id": null, + "description": "Study description from DUOS", + "owner": null, + "principal_investigator": null, + "registered_identifier": null, + "source_datarepo_row_ids": null, + "title": null, + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "816e364e-1193-4e5b-a91a-14e4b009157c", + "name": "anvil_sequencingactivity", + "object": { + "activity_type": "Sequencing", + "assay_type": [], + "data_modality": [], + "datarepo_row_id": "816e364e-1193-4e5b-a91a-14e4b009157c", + "sequencingactivity_id": "a60c5138-3749-f7cb-8714-52d389ad5231", + "source_datarepo_row_ids": [ + "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "677dd55c-3fa3-4b07-8c98-985d94d7577e", + "name": "anvil_dataset", + "object": { + "consent_group": [], + "data_modality": [], + "data_use_permission": [], + "datarepo_row_id": "677dd55c-3fa3-4b07-8c98-985d94d7577e", + "dataset_id": "385290c3-dff5-fb6d-2501-fa0ba3ad1c35", + "description": null, + "owner": [], + "principal_investigator": [], + "registered_identifier": [], + "source_datarepo_row_ids": [ + "workspace_attributes:95684a9c-e0a1-4c05-9f1f-de628a38420c" + ], + "title": "ANVIL_1000G_2019_Dev", + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "826dea02-e274-4ffe-aabc-eb3db63ad068", + "name": "anvil_biosample", + "object": { + "anatomical_site": null, + "apriori_cell_type": [], + "biosample_id": "f9d40cf6-37b8-22f3-ce35-0dc614d2452b", + "biosample_type": null, + "datarepo_row_id": "826dea02-e274-4ffe-aabc-eb3db63ad068", + "disease": null, + "donor_age_at_collection_lower_bound": null, + "donor_age_at_collection_unit": null, + "donor_age_at_collection_upper_bound": null, + "source_datarepo_row_ids": [ + "sample:98048c3b-2525-4090-94fd-477de31f2608" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "6b0f6c0f-5d80-4242-accb-840921351cd5", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5", + "drs_uri": "drs://mock_tdr.lan/v1_790795c4-49b1-4ac8-a060-207b92ea08c5_1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_format": ".txt", + "file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_md5sum": "S/GBrRjzZAQYqh3rdiPYzA==", + "file_name": "CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt", + "file_ref": "drs://mock_tdr.lan/v1_790795c4-49b1-4ac8-a060-207b92ea08c5_1fab11f5-7eab-4318-9a58-68d8d06e0715", + "file_size": 15079345, + "is_supplementary": true, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:04ff3af2-0543-4ea6-830a-d31b957fa2ee" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", + "name": "anvil_diagnosis", + "object": { + "datarepo_row_id": "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", + "diagnosis_age_lower_bound": null, + "diagnosis_age_unit": null, + "diagnosis_age_upper_bound": null, + "diagnosis_id": "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6", + "disease": [ + "redacted-A61iJlLx" + ], + "onset_age_lower_bound": null, + "onset_age_unit": null, + "onset_age_upper_bound": null, + "phenopacket": [], + "phenotype": [ + "redacted-acSYHZUr" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", + "name": "anvil_sequencingactivity", + "object": { + "activity_type": "Sequencing", + "assay_type": [], + "data_modality": [], + "datarepo_row_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", + "sequencingactivity_id": "18b3be87-e26b-4376-0d8d-c1e370e90e07", + "source_datarepo_row_ids": [ + "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", + "name": "anvil_diagnosis", + "object": { + "datarepo_row_id": "939a4bd3-86ed-4a8a-81f4-fbe0ee673461", + "diagnosis_age_lower_bound": null, + "diagnosis_age_unit": null, + "diagnosis_age_upper_bound": null, + "diagnosis_id": "25ff8d32-18c9-fc3e-020a-5de20d35d906", + "disease": [ + "redacted-g50ublm/" + ], + "onset_age_lower_bound": null, + "onset_age_unit": null, + "onset_age_upper_bound": null, + "phenopacket": [], + "phenotype": [ + "redacted-acSYHZUr" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "name": "anvil_dataset", + "object": { + "consent_group": [ + "DS-BDIS" + ], + "data_modality": [], + "data_use_permission": [ + "DS-BDIS" + ], + "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", + "description": null, + "owner": [ + "Debbie Nickerson" + ], + "principal_investigator": [], + "registered_identifier": [ + "phs000693" + ], + "source_datarepo_row_ids": [ + "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" + ], + "title": "ANVIL_CMG_UWASH_DS_BDIS", + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "drs_uri": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_1e269f04-4347-4188-b060-1dcc69e71d67", + "file_format": ".vcf.gz", + "file_id": "1e269f04-4347-4188-b060-1dcc69e71d67", + "file_md5sum": "vuxgbuCqKZ/fkT9CWTFmIg==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "file_ref": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_1e269f04-4347-4188-b060-1dcc69e71d67", + "file_size": 213021639, + "is_supplementary": false, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", + "name": "anvil_donor", + "object": { + "datarepo_row_id": "bfd991f2-2797-4083-972a-da7c6d7f1b2e", + "donor_id": "1e2bd7e5-f45e-a391-daea-7c060be76acd", + "genetic_ancestry": [], + "organism_type": "redacted-ACw+6ecI", + "phenotypic_sex": "redacted-JfQ0b3xG", + "reported_ethnicity": [ + "redacted-NSkwDycK" + ], + "source_datarepo_row_ids": [ + "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + }, + { + "id": "3b17377b-16b1-431c-9967-e5d01fc5923f", + "name": "anvil_file", + "object": { + "crc32": "", + "data_modality": [], + "datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f", + "drs_uri": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_format": ".bam", + "file_id": "8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_md5sum": "fNn9e1SovzgOROk3BvH6LQ==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "file_ref": "drs://mock_tdr.lan/v1_2ae00e5c-4aef-4a1e-9eca-d8d0747b5348_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_size": 3306845592, + "is_supplementary": false, + "reference_assembly": [], + "sha256": "", + "source_datarepo_row_ids": [ + "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" + ], + "version": "2022-06-01T00:00:00.000000Z" + }, + "relations": [] + } +] \ No newline at end of file diff --git a/test/service/data/verbatim/pfb_schema.json b/test/service/data/verbatim/pfb_schema.json new file mode 100644 index 0000000000..a858db5f44 --- /dev/null +++ b/test/service/data/verbatim/pfb_schema.json @@ -0,0 +1,609 @@ +{ + "fields": [ + { + "default": null, + "name": "id", + "type": [ + "null", + "string" + ] + }, + { + "name": "name", + "type": "string" + }, + { + "name": "object", + "type": [ + { + "fields": [ + { + "name": "nodes", + "type": { + "items": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "ontology_reference", + "type": "string" + }, + { + "name": "values", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "links", + "type": { + "items": { + "fields": [ + { + "name": "multiplicity", + "type": { + "name": "Multiplicity", + "symbols": [ + "ONE_TO_ONE", + "ONE_TO_MANY", + "MANY_TO_ONE", + "MANY_TO_MANY" + ], + "type": "enum" + } + }, + { + "name": "dst", + "type": "string" + }, + { + "name": "name", + "type": "string" + } + ], + "name": "Link", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "properties", + "type": { + "items": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "ontology_reference", + "type": "string" + }, + { + "name": "values", + "type": { + "type": "map", + "values": "string" + } + } + ], + "name": "Property", + "type": "record" + }, + "type": "array" + } + } + ], + "name": "Node", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "misc", + "type": { + "type": "map", + "values": "string" + } + } + ], + "name": "Metadata", + "type": "record" + }, + { + "fields": [ + { + "name": "anatomical_site", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "apriori_cell_type", + "namespace": "anvil_biosample", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "biosample_id", + "namespace": "anvil_biosample", + "type": "string" + }, + { + "name": "biosample_type", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_biosample", + "type": "string" + }, + { + "name": "disease", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_lower_bound", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_unit", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "donor_age_at_collection_upper_bound", + "namespace": "anvil_biosample", + "type": "null" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_biosample", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_biosample", + "type": "string" + } + ], + "name": "anvil_biosample", + "type": "record" + }, + { + "fields": [ + { + "name": "consent_group", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "data_modality", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": [], + "type": "array" + } + ] + }, + { + "name": "data_use_permission", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "dataset_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "description", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "owner", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "principal_investigator", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": [], + "type": "array" + } + ] + }, + { + "name": "registered_identifier", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_dataset", + "type": [ + "null", + { + "items": "string", + "type": "array" + } + ] + }, + { + "name": "title", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, + { + "name": "version", + "namespace": "anvil_dataset", + "type": "string" + } + ], + "name": "anvil_dataset", + "type": "record" + }, + { + "fields": [ + { + "name": "datarepo_row_id", + "namespace": "anvil_diagnosis", + "type": "string" + }, + { + "name": "diagnosis_age_lower_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_age_unit", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_age_upper_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "diagnosis_id", + "namespace": "anvil_diagnosis", + "type": "string" + }, + { + "name": "disease", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "onset_age_lower_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "onset_age_unit", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "onset_age_upper_bound", + "namespace": "anvil_diagnosis", + "type": "null" + }, + { + "name": "phenopacket", + "namespace": "anvil_diagnosis", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "phenotype", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_diagnosis", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_diagnosis", + "type": "string" + } + ], + "name": "anvil_diagnosis", + "type": "record" + }, + { + "fields": [ + { + "name": "datarepo_row_id", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "donor_id", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "genetic_ancestry", + "namespace": "anvil_donor", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "organism_type", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "phenotypic_sex", + "namespace": "anvil_donor", + "type": "string" + }, + { + "name": "reported_ethnicity", + "namespace": "anvil_donor", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_donor", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_donor", + "type": "string" + } + ], + "name": "anvil_donor", + "type": "record" + }, + { + "fields": [ + { + "name": "crc32", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "data_modality", + "namespace": "anvil_file", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "drs_uri", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_format", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_id", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_md5sum", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_name", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_ref", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "file_size", + "namespace": "anvil_file", + "type": "long" + }, + { + "name": "is_supplementary", + "namespace": "anvil_file", + "type": "boolean" + }, + { + "name": "reference_assembly", + "namespace": "anvil_file", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "sha256", + "namespace": "anvil_file", + "type": "string" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_file", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_file", + "type": "string" + } + ], + "name": "anvil_file", + "type": "record" + }, + { + "fields": [ + { + "name": "activity_type", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "assay_type", + "namespace": "anvil_sequencingactivity", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "data_modality", + "namespace": "anvil_sequencingactivity", + "type": { + "items": [], + "type": "array" + } + }, + { + "name": "datarepo_row_id", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "sequencingactivity_id", + "namespace": "anvil_sequencingactivity", + "type": "string" + }, + { + "name": "source_datarepo_row_ids", + "namespace": "anvil_sequencingactivity", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "version", + "namespace": "anvil_sequencingactivity", + "type": "string" + } + ], + "name": "anvil_sequencingactivity", + "type": "record" + } + ] + }, + { + "default": [], + "name": "relations", + "type": { + "items": { + "fields": [ + { + "name": "dst_id", + "type": "string" + }, + { + "name": "dst_name", + "type": "string" + } + ], + "name": "Relation", + "type": "record" + }, + "type": "array" + } + } + ], + "name": "Entity", + "type": "record" +} \ No newline at end of file diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 52c6c239c5..df06218234 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -20,6 +20,9 @@ BytesIO, ) import json +from operator import ( + itemgetter, +) import os from pathlib import ( Path, @@ -67,6 +70,8 @@ ) from azul.collections import ( adict, + compose_keys, + none_safe_key, ) from azul.indexer import ( SourcedBundleFQID, @@ -2065,3 +2070,22 @@ def sort_key(anvil_doc): manifest.sort(key=sort_key) expected.sort(key=sort_key) self.assertEqual(expected, manifest) + + @unittest.skipIf(not config.enable_replicas, + 'The format is replica-based') + @manifest_test + def test_pfb_manifest(self): + response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={}) + self.assertEqual(200, response.status_code) + manifest = fastavro.reader(BytesIO(response.content)) + schema = manifest.writer_schema + entities = list(manifest) + with open(self._data_path('service') / 'verbatim/pfb_schema.json') as f: + expected_schema = json.load(f) + with open(self._data_path('service') / 'verbatim/pfb_entities.json') as f: + expected_entities = json.load(f) + sort_key = compose_keys(none_safe_key(), itemgetter('id')) + entities.sort(key=sort_key) + expected_entities.sort(key=sort_key) + self.assertEqual(expected_schema, schema) + self.assertEqual(expected_entities, entities)