diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index fe0c0203a..b3fe44532 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -136,10 +136,14 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: return { 'entity_id': 'entryId', 'bundles': { + # These field paths have a brittle coupling that must be + # maintained to the field lookups in `self.manifest_config`. 'uuid': self.special_fields.bundle_uuid, 'version': self.special_fields.bundle_version }, 'sources': { + # These field paths have a brittle coupling that must be + # maintained to the field lookups in `self.manifest_config`. 'id': self.special_fields.source_id, 'spec': self.special_fields.source_spec }, @@ -196,6 +200,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: f: f'activities.{f}' for f in [ *common_fields, 'activity_id', + # This field path has a brittle coupling that must be + # maintained to the field lookup in + # `self.manifest_config`. 'activity_table', 'activity_type', 'assay_type', @@ -222,7 +229,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: ] }, # These field names are hard-coded in the implementation of - # the repository service/controller. + # the repository service/controller. Also, these field paths + # have a brittle coupling that must be maintained to the + # field lookups in `self.manifest_config`. **{ # Not in schema 'version': 'fileVersion', @@ -273,6 +282,23 @@ def facets(self) -> Sequence[str]: def manifest_config(self) -> ManifestConfig: result = defaultdict(dict) + # Note that there is a brittle coupling that must be maintained between + # the fields listed here and those used in `self._field_mapping`. + fields_to_omit_from_manifest = [ + ('contents', 'activities', 'activity_table'), + ('contents', 'files', 'uuid'), + ('contents', 'files', 'version'), + ] + + # Furthermore, renamed values should match the field's path in a + # response hit from the `/index/files` endpoint. + fields_to_rename_in_manifest = { + ('bundles', 'uuid'): 'bundles.bundle_uuid', + ('bundles', 'version'): 'bundles.bundle_version', + ('sources', 'id'): 'sources.source_id', + ('sources', 'spec'): 'sources.source_spec', + } + def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): for path_element, name_or_type in mapping.items(): new_path = (*path, path_element) @@ -281,20 +307,23 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): elif isinstance(name_or_type, str): if new_path == ('entity_id',): pass - elif new_path == ('contents', 'files', 'uuid'): - # Request the injection of a file URL … - result[path]['file_url'] = 'files.file_url' - # … but suppress the columns for the fields … - result[path][path_element] = None - elif new_path == ('contents', 'files', 'version'): - # … only used by that injection. + elif new_path in fields_to_omit_from_manifest: result[path][path_element] = None + fields_to_omit_from_manifest.remove(new_path) + elif new_path in fields_to_rename_in_manifest: + result[path][path_element] = fields_to_rename_in_manifest.pop(new_path) else: result[path][path_element] = name_or_type else: assert False, (path, path_element, name_or_type) recurse(self._field_mapping, ()) + assert len(fields_to_omit_from_manifest) == 0, fields_to_omit_from_manifest + assert len(fields_to_rename_in_manifest) == 0, fields_to_rename_in_manifest + # The file URL is synthesized from the `uuid` and `version` fields. + # Above, we already configured these two fields to be omitted from the + # manifest since they are not informative to the user. + result[('contents', 'files')]['file_url'] = 'files.azul_file_url' return result def verbatim_pfb_schema(self, diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 0d938ced6..8d0be3c12 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -139,6 +139,9 @@ def choose_entry(_term): def _make_hit(self, es_hit: JSON) -> MutableJSON: return { 'entryId': es_hit['entity_id'], + # Note that there is a brittle coupling that must be maintained + # between the `sources` and `bundles` field paths here and the + # renamed fields in `Plugin.manifest_config`. 'sources': list(map(self._make_source, es_hit['sources'])), 'bundles': list(map(self._make_bundle, es_hit['bundles'])), **self._make_contents(es_hit['contents']) diff --git a/test/integration_test.py b/test/integration_test.py index ee66f595c..a1c6b9941 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -749,6 +749,14 @@ def _project_type(self, catalog: CatalogName) -> EntityType: else: assert False, catalog + def _uuid_column_name(self, catalog: CatalogName) -> str: + if config.is_hca_enabled(catalog): + return 'bundle_uuid' + elif config.is_anvil_enabled(catalog): + return 'bundles.bundle_uuid' + else: + assert False, catalog + def _test_dos_and_drs(self, catalog: CatalogName): if config.is_dss_enabled(catalog) and config.dss_direct_access: _, file = self._get_one_inner_file(catalog) @@ -927,8 +935,8 @@ def _assertResponseStatus(self, ) ) - def _check_compact_manifest(self, _catalog: CatalogName, response: bytes): - self.__check_csv_manifest(BytesIO(response), 'bundle_uuid') + def _check_compact_manifest(self, catalog: CatalogName, response: bytes): + self.__check_csv_manifest(BytesIO(response), self._uuid_column_name(catalog)) def _check_terra_bdbag_manifest(self, catalog: CatalogName, response: bytes): with ZipFile(BytesIO(response)) as zip_fh: @@ -1047,14 +1055,14 @@ def _read_csv_manifest(self, file: IO[bytes]) -> csv.DictReader: def __check_csv_manifest(self, file: IO[bytes], - uuid_field_name: str + uuid_column_name: str ) -> list[Mapping[str, str]]: reader = self._read_csv_manifest(file) rows = list(reader) log.info(f'Manifest contains {len(rows)} rows.') self.assertGreater(len(rows), 0) - self.assertIn(uuid_field_name, reader.fieldnames) - bundle_uuids = rows[0][uuid_field_name].split(ManifestGenerator.padded_joiner) + self.assertIn(uuid_column_name, reader.fieldnames) + bundle_uuids = rows[0][uuid_column_name].split(ManifestGenerator.padded_joiner) self.assertGreater(len(bundle_uuids), 0) for bundle_uuid in bundle_uuids: self.assertEqual(bundle_uuid, str(uuid.UUID(bundle_uuid))) @@ -1605,9 +1613,10 @@ def bundle_uuids(hit: JSON) -> set[str]: def test_compact_manifest(expected_bundles): manifest = BytesIO(self._get_url_content(PUT, manifest_url)) manifest_rows = self._read_csv_manifest(manifest) + uuid_column_name = self._uuid_column_name(catalog) all_found_bundles = set() for row in manifest_rows: - row_bundles = set(row['bundle_uuid'].split(ManifestGenerator.padded_joiner)) + row_bundles = set(row[uuid_column_name].split(ManifestGenerator.padded_joiner)) # It's possible for one file to be present in multiple # bundles (e.g. due to stitching), so each row may include # additional bundles besides those included in the filters. diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 92d0b5546..da3bf4b3a 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1716,25 +1716,25 @@ def test_compact_manifest(self): self.assertEqual(200, response.status_code) expected = [ ( - 'bundle_uuid', + 'bundles.bundle_uuid', '6b0f6c0f-5d80-a242-accb-840921351cd5', '826dea02-e274-affe-aabc-eb3db63ad068', '826dea02-e274-affe-aabc-eb3db63ad068' ), ( - 'bundle_version', + 'bundles.bundle_version', '2022-06-01T00:00:00.000000Z', '2022-06-01T00:00:00.000000Z', '2022-06-01T00:00:00.000000Z' ), ( - 'source_id', + 'sources.source_id', '6c87f0e1-509d-46a4-b845-7584df39263b', '6c87f0e1-509d-46a4-b845-7584df39263b', '6c87f0e1-509d-46a4-b845-7584df39263b' ), ( - 'source_spec', + 'sources.source_spec', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2', 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2' @@ -1973,12 +1973,6 @@ def test_compact_manifest(self): '18b3be87-e26b-4376-0d8d-c1e370e90e07', 'a60c5138-3749-f7cb-8714-52d389ad5231' ), - ( - 'activities.activity_table', - '', - 'sequencingactivity', - 'sequencingactivity' - ), ( 'activities.activity_type', '', @@ -2082,7 +2076,7 @@ def test_compact_manifest(self): self._drs_uri('v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37') ), ( - 'files.file_url', + 'files.azul_file_url', self._file_url('6b0f6c0f-5d80-4242-accb-840921351cd5', self.version), self._file_url('15b76f9c-6b46-433f-851d-34e89f1b9ba6', self.version), self._file_url('3b17377b-16b1-431c-9967-e5d01fc5923f', self.version)