From db4a9b62318b7208cc7990d19124a7e653bf8774 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 30 Jul 2024 13:33:31 -0700 Subject: [PATCH 1/2] Rename `contents` to `value` in JSONL manifest (#6453) --- src/azul/service/manifest_service.py | 2 +- test/service/test_manifest.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 83b82c842..953c58ece 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2081,7 +2081,7 @@ def create_file(self) -> tuple[str, Optional[str]]: with open(path, 'w') as f: for replica in self._all_replicas(): entry = { - 'contents': replica['contents'], + 'value': replica['contents'], 'type': replica['replica_type'] } json.dump(entry, f) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index ff10c774b..f1f955f18 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1333,7 +1333,7 @@ def test_verbatim_jsonl_manifest(self): expected = [ { 'type': replica_type, - 'contents': bundle.metadata_files[key], + 'value': bundle.metadata_files[key], } for bundle in map(self._load_canned_bundle, self.bundles()) for replica_type, key in [ @@ -2082,7 +2082,7 @@ def test_verbatim_jsonl_manifest(self): # Consolidate entities with the same replica (i.e. datasets) json_hash(entity).digest(): { 'type': 'anvil_' + entity_ref.entity_type, - 'contents': entity, + 'value': entity, } for bundle in self.bundles() for entity_ref, entity in self._load_canned_bundle(bundle).entities.items() From 7766c1d44bd51a8604d35405d79ca59470884d52 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 31 Jul 2024 12:46:27 -0700 Subject: [PATCH 2/2] [r] Use DCP2 schema name for HCA replica_type (#6453) --- .../plugins/metadata/hca/indexer/transform.py | 4 +- ...d.2018-11-02T11:33:44.698028Z.results.json | 6 +- .../data/verbatim/hca/pfb_entities.json | 14 +- .../service/data/verbatim/hca/pfb_schema.json | 222 +++++++++--------- test/service/test_manifest.py | 6 +- 5 files changed, 126 insertions(+), 126 deletions(-) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 62803e299..8c6a3829f 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -462,8 +462,8 @@ class BaseTransformer(Transformer, metaclass=ABCMeta): api_bundle: api.Bundle def replica_type(self, entity: EntityReference) -> str: - assert entity.entity_type == self.entity_type(), entity - return entity.entity_type.removesuffix('s') + api_entity = self.api_bundle.entities[UUID(entity.entity_id)] + return api_entity.schema_name @classmethod def aggregator(cls, entity_type: EntityType) -> Optional[EntityAggregator]: diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index c170e2088..febff19e8 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -3471,7 +3471,7 @@ } }, "entity_id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "replica_type": "file", + "replica_type": "sequence_file", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb" ] @@ -3541,7 +3541,7 @@ } }, "entity_id": "70d1af4a-82c8-478a-8960-e9028b3616ca", - "replica_type": "file", + "replica_type": "sequence_file", "hub_ids": [ "70d1af4a-82c8-478a-8960-e9028b3616ca" ] @@ -3593,7 +3593,7 @@ } }, "entity_id": "a21dc760-a500-4236-bcff-da34a0e873d2", - "replica_type": "sample", + "replica_type": "specimen_from_organism", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", "70d1af4a-82c8-478a-8960-e9028b3616ca" diff --git a/test/service/data/verbatim/hca/pfb_entities.json b/test/service/data/verbatim/hca/pfb_entities.json index dd65d389f..cdb1266ad 100644 --- a/test/service/data/verbatim/hca/pfb_entities.json +++ b/test/service/data/verbatim/hca/pfb_entities.json @@ -14,28 +14,28 @@ }, { "links": [], - "name": "file", + "name": "links", "ontology_reference": "", "properties": [], "values": {} }, { "links": [], - "name": "links", + "name": "project", "ontology_reference": "", "properties": [], "values": {} }, { "links": [], - "name": "project", + "name": "sequence_file", "ontology_reference": "", "properties": [], "values": {} }, { "links": [], - "name": "sample", + "name": "specimen_from_organism", "ontology_reference": "", "properties": [], "values": {} @@ -46,7 +46,7 @@ }, { "id": "70d1af4a-82c8-478a-8960-e9028b3616ca", - "name": "file", + "name": "sequence_file", "object": { "describedBy": "https://schema.humancellatlas.org/type/file/6.5.2/sequence_file", "file_core": { @@ -69,7 +69,7 @@ }, { "id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "name": "file", + "name": "sequence_file", "object": { "describedBy": "https://schema.humancellatlas.org/type/file/6.5.2/sequence_file", "file_core": { @@ -122,7 +122,7 @@ }, { "id": "a21dc760-a500-4236-bcff-da34a0e873d2", - "name": "sample", + "name": "specimen_from_organism", "object": { "biomaterial_core": { "biomaterial_id": "DID_scRSq06_pancreas", diff --git a/test/service/data/verbatim/hca/pfb_schema.json b/test/service/data/verbatim/hca/pfb_schema.json index 07228a90f..af50aa32f 100644 --- a/test/service/data/verbatim/hca/pfb_schema.json +++ b/test/service/data/verbatim/hca/pfb_schema.json @@ -222,85 +222,6 @@ "name": "cell_suspension", "type": "record" }, - { - "fields": [ - { - "name": "describedBy", - "namespace": "file", - "type": "string" - }, - { - "name": "file_core", - "namespace": "file", - "type": { - "fields": [ - { - "name": "file_format", - "namespace": "file.file_core", - "type": "string" - }, - { - "name": "file_name", - "namespace": "file.file_core", - "type": "string" - } - ], - "name": "file.file_core", - "type": "record" - } - }, - { - "name": "insdc_run", - "namespace": "file", - "type": { - "items": "string", - "type": "array" - } - }, - { - "name": "provenance", - "namespace": "file", - "type": { - "fields": [ - { - "name": "document_id", - "namespace": "file.provenance", - "type": "string" - }, - { - "name": "submission_date", - "namespace": "file.provenance", - "type": "string" - }, - { - "name": "update_date", - "namespace": "file.provenance", - "type": "string" - } - ], - "name": "file.provenance", - "type": "record" - } - }, - { - "name": "read_index", - "namespace": "file", - "type": "string" - }, - { - "name": "read_length", - "namespace": "file", - "type": "long" - }, - { - "name": "schema_type", - "namespace": "file", - "type": "string" - } - ], - "name": "file", - "type": "record" - }, { "fields": [ { @@ -592,59 +513,138 @@ "name": "project", "type": "record" }, + { + "fields": [ + { + "name": "describedBy", + "namespace": "sequence_file", + "type": "string" + }, + { + "name": "file_core", + "namespace": "sequence_file", + "type": { + "fields": [ + { + "name": "file_format", + "namespace": "sequence_file.file_core", + "type": "string" + }, + { + "name": "file_name", + "namespace": "sequence_file.file_core", + "type": "string" + } + ], + "name": "sequence_file.file_core", + "type": "record" + } + }, + { + "name": "insdc_run", + "namespace": "sequence_file", + "type": { + "items": "string", + "type": "array" + } + }, + { + "name": "provenance", + "namespace": "sequence_file", + "type": { + "fields": [ + { + "name": "document_id", + "namespace": "sequence_file.provenance", + "type": "string" + }, + { + "name": "submission_date", + "namespace": "sequence_file.provenance", + "type": "string" + }, + { + "name": "update_date", + "namespace": "sequence_file.provenance", + "type": "string" + } + ], + "name": "sequence_file.provenance", + "type": "record" + } + }, + { + "name": "read_index", + "namespace": "sequence_file", + "type": "string" + }, + { + "name": "read_length", + "namespace": "sequence_file", + "type": "long" + }, + { + "name": "schema_type", + "namespace": "sequence_file", + "type": "string" + } + ], + "name": "sequence_file", + "type": "record" + }, { "fields": [ { "name": "biomaterial_core", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "fields": [ { "name": "biomaterial_id", - "namespace": "sample.biomaterial_core", + "namespace": "specimen_from_organism.biomaterial_core", "type": "string" }, { "name": "ncbi_taxon_id", - "namespace": "sample.biomaterial_core", + "namespace": "specimen_from_organism.biomaterial_core", "type": { "items": "long", "type": "array" } } ], - "name": "sample.biomaterial_core", + "name": "specimen_from_organism.biomaterial_core", "type": "record" } }, { "name": "describedBy", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": "string" }, { "name": "diseases", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "items": { "fields": [ { "name": "ontology", - "namespace": "sample.diseases", + "namespace": "specimen_from_organism.diseases", "type": "string" }, { "name": "ontology_label", - "namespace": "sample.diseases", + "namespace": "specimen_from_organism.diseases", "type": "string" }, { "name": "text", - "namespace": "sample.diseases", + "namespace": "specimen_from_organism.diseases", "type": "string" } ], - "name": "sample.diseases", + "name": "specimen_from_organism.diseases", "type": "record" }, "type": "array" @@ -652,27 +652,27 @@ }, { "name": "genus_species", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "items": { "fields": [ { "name": "ontology", - "namespace": "sample.genus_species", + "namespace": "specimen_from_organism.genus_species", "type": "string" }, { "name": "ontology_label", - "namespace": "sample.genus_species", + "namespace": "specimen_from_organism.genus_species", "type": "string" }, { "name": "text", - "namespace": "sample.genus_species", + "namespace": "specimen_from_organism.genus_species", "type": "string" } ], - "name": "sample.genus_species", + "name": "specimen_from_organism.genus_species", "type": "record" }, "type": "array" @@ -680,86 +680,86 @@ }, { "name": "organ", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "fields": [ { "name": "ontology", - "namespace": "sample.organ", + "namespace": "specimen_from_organism.organ", "type": "string" }, { "name": "ontology_label", - "namespace": "sample.organ", + "namespace": "specimen_from_organism.organ", "type": "string" }, { "name": "text", - "namespace": "sample.organ", + "namespace": "specimen_from_organism.organ", "type": "string" } ], - "name": "sample.organ", + "name": "specimen_from_organism.organ", "type": "record" } }, { "name": "organ_part", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "fields": [ { "name": "ontology", - "namespace": "sample.organ_part", + "namespace": "specimen_from_organism.organ_part", "type": "string" }, { "name": "ontology_label", - "namespace": "sample.organ_part", + "namespace": "specimen_from_organism.organ_part", "type": "string" }, { "name": "text", - "namespace": "sample.organ_part", + "namespace": "specimen_from_organism.organ_part", "type": "string" } ], - "name": "sample.organ_part", + "name": "specimen_from_organism.organ_part", "type": "record" } }, { "name": "provenance", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": { "fields": [ { "name": "document_id", - "namespace": "sample.provenance", + "namespace": "specimen_from_organism.provenance", "type": "string" }, { "name": "submission_date", - "namespace": "sample.provenance", + "namespace": "specimen_from_organism.provenance", "type": "string" }, { "name": "update_date", - "namespace": "sample.provenance", + "namespace": "specimen_from_organism.provenance", "type": "string" } ], - "name": "sample.provenance", + "name": "specimen_from_organism.provenance", "type": "record" } }, { "name": "schema_type", - "namespace": "sample", + "namespace": "specimen_from_organism", "type": "string" } ], - "name": "sample", + "name": "specimen_from_organism", "type": "record" } ] diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index f1f955f18..0bd177bfb 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1340,9 +1340,9 @@ def test_verbatim_jsonl_manifest(self): ('links', 'links.json'), ('cell_suspension', 'cell_suspension_0.json'), ('project', 'project_0.json'), - ('file', 'sequence_file_0.json'), - ('file', 'sequence_file_1.json'), - ('sample', 'specimen_from_organism_0.json') + ('sequence_file', 'sequence_file_0.json'), + ('sequence_file', 'sequence_file_1.json'), + ('specimen_from_organism', 'specimen_from_organism_0.json') ] ] response = self._get_manifest(ManifestFormat.verbatim_jsonl, {})