Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: Invalid columns in compact manifest for AnVIL (#6110) #6566

Merged
merged 2 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 37 additions & 8 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,14 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
return {
'entity_id': 'entryId',
'bundles': {
# These field paths have a brittle coupling that must be
# maintained to the field lookups in `self.manifest_config`.
'uuid': self.special_fields.bundle_uuid,
'version': self.special_fields.bundle_version
},
'sources': {
# These field paths have a brittle coupling that must be
# maintained to the field lookups in `self.manifest_config`.
'id': self.special_fields.source_id,
'spec': self.special_fields.source_spec
},
Expand Down Expand Up @@ -196,6 +200,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
f: f'activities.{f}' for f in [
*common_fields,
'activity_id',
# This field path has a brittle coupling that must be
# maintained to the field lookup in
# `self.manifest_config`.
'activity_table',
'activity_type',
'assay_type',
Expand All @@ -222,7 +229,9 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
]
},
# These field names are hard-coded in the implementation of
# the repository service/controller.
# the repository service/controller. Also, these field paths
# have a brittle coupling that must be maintained to the
# field lookups in `self.manifest_config`.
**{
# Not in schema
'version': 'fileVersion',
Expand Down Expand Up @@ -273,6 +282,23 @@ def facets(self) -> Sequence[str]:
def manifest_config(self) -> ManifestConfig:
result = defaultdict(dict)

# Note that there is a brittle coupling that must be maintained between
# the fields listed here and those used in `self._field_mapping`.
fields_to_omit_from_manifest = [
('contents', 'activities', 'activity_table'),
('contents', 'files', 'uuid'),
('contents', 'files', 'version'),
]

# Furthermore, renamed values should match the field's path in a
# response hit from the `/index/files` endpoint.
fields_to_rename_in_manifest = {
('bundles', 'uuid'): 'bundles.bundle_uuid',
achave11-ucsc marked this conversation as resolved.
Show resolved Hide resolved
('bundles', 'version'): 'bundles.bundle_version',
('sources', 'id'): 'sources.source_id',
('sources', 'spec'): 'sources.source_spec',
}

def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath):
for path_element, name_or_type in mapping.items():
new_path = (*path, path_element)
Expand All @@ -281,20 +307,23 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath):
elif isinstance(name_or_type, str):
if new_path == ('entity_id',):
pass
elif new_path == ('contents', 'files', 'uuid'):
# Request the injection of a file URL …
result[path]['file_url'] = 'files.file_url'
# … but suppress the columns for the fields …
result[path][path_element] = None
elif new_path == ('contents', 'files', 'version'):
# … only used by that injection.
elif new_path in fields_to_omit_from_manifest:
result[path][path_element] = None
fields_to_omit_from_manifest.remove(new_path)
elif new_path in fields_to_rename_in_manifest:
result[path][path_element] = fields_to_rename_in_manifest.pop(new_path)
else:
result[path][path_element] = name_or_type
else:
assert False, (path, path_element, name_or_type)

recurse(self._field_mapping, ())
assert len(fields_to_omit_from_manifest) == 0, fields_to_omit_from_manifest
assert len(fields_to_rename_in_manifest) == 0, fields_to_rename_in_manifest
# The file URL is synthesized from the `uuid` and `version` fields.
# Above, we already configured these two fields to be omitted from the
# manifest since they are not informative to the user.
result[('contents', 'files')]['file_url'] = 'files.azul_file_url'
return result

def verbatim_pfb_schema(self,
Expand Down
3 changes: 3 additions & 0 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ def choose_entry(_term):
def _make_hit(self, es_hit: JSON) -> MutableJSON:
return {
'entryId': es_hit['entity_id'],
# Note that there is a brittle coupling that must be maintained
# between the `sources` and `bundles` field paths here and the
# renamed fields in `Plugin.manifest_config`.
'sources': list(map(self._make_source, es_hit['sources'])),
'bundles': list(map(self._make_bundle, es_hit['bundles'])),
**self._make_contents(es_hit['contents'])
Expand Down
21 changes: 15 additions & 6 deletions test/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,14 @@ def _project_type(self, catalog: CatalogName) -> EntityType:
else:
assert False, catalog

def _uuid_column_name(self, catalog: CatalogName) -> str:
if config.is_hca_enabled(catalog):
return 'bundle_uuid'
elif config.is_anvil_enabled(catalog):
return 'bundles.bundle_uuid'
else:
assert False, catalog

def _test_dos_and_drs(self, catalog: CatalogName):
if config.is_dss_enabled(catalog) and config.dss_direct_access:
_, file = self._get_one_inner_file(catalog)
Expand Down Expand Up @@ -927,8 +935,8 @@ def _assertResponseStatus(self,
)
)

def _check_compact_manifest(self, _catalog: CatalogName, response: bytes):
self.__check_csv_manifest(BytesIO(response), 'bundle_uuid')
def _check_compact_manifest(self, catalog: CatalogName, response: bytes):
self.__check_csv_manifest(BytesIO(response), self._uuid_column_name(catalog))

def _check_terra_bdbag_manifest(self, catalog: CatalogName, response: bytes):
with ZipFile(BytesIO(response)) as zip_fh:
Expand Down Expand Up @@ -1047,14 +1055,14 @@ def _read_csv_manifest(self, file: IO[bytes]) -> csv.DictReader:

def __check_csv_manifest(self,
file: IO[bytes],
uuid_field_name: str
uuid_column_name: str
) -> list[Mapping[str, str]]:
reader = self._read_csv_manifest(file)
rows = list(reader)
log.info(f'Manifest contains {len(rows)} rows.')
self.assertGreater(len(rows), 0)
self.assertIn(uuid_field_name, reader.fieldnames)
bundle_uuids = rows[0][uuid_field_name].split(ManifestGenerator.padded_joiner)
self.assertIn(uuid_column_name, reader.fieldnames)
bundle_uuids = rows[0][uuid_column_name].split(ManifestGenerator.padded_joiner)
self.assertGreater(len(bundle_uuids), 0)
for bundle_uuid in bundle_uuids:
self.assertEqual(bundle_uuid, str(uuid.UUID(bundle_uuid)))
Expand Down Expand Up @@ -1605,9 +1613,10 @@ def bundle_uuids(hit: JSON) -> set[str]:
def test_compact_manifest(expected_bundles):
manifest = BytesIO(self._get_url_content(PUT, manifest_url))
manifest_rows = self._read_csv_manifest(manifest)
uuid_column_name = self._uuid_column_name(catalog)
all_found_bundles = set()
for row in manifest_rows:
row_bundles = set(row['bundle_uuid'].split(ManifestGenerator.padded_joiner))
row_bundles = set(row[uuid_column_name].split(ManifestGenerator.padded_joiner))
# It's possible for one file to be present in multiple
# bundles (e.g. due to stitching), so each row may include
# additional bundles besides those included in the filters.
Expand Down
16 changes: 5 additions & 11 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,25 +1716,25 @@ def test_compact_manifest(self):
self.assertEqual(200, response.status_code)
expected = [
(
'bundle_uuid',
'bundles.bundle_uuid',
'6b0f6c0f-5d80-a242-accb-840921351cd5',
'826dea02-e274-affe-aabc-eb3db63ad068',
'826dea02-e274-affe-aabc-eb3db63ad068'
),
(
'bundle_version',
'bundles.bundle_version',
'2022-06-01T00:00:00.000000Z',
'2022-06-01T00:00:00.000000Z',
'2022-06-01T00:00:00.000000Z'
),
(
'source_id',
'sources.source_id',
'6c87f0e1-509d-46a4-b845-7584df39263b',
'6c87f0e1-509d-46a4-b845-7584df39263b',
'6c87f0e1-509d-46a4-b845-7584df39263b'
),
(
'source_spec',
'sources.source_spec',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2',
'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2'
Expand Down Expand Up @@ -1973,12 +1973,6 @@ def test_compact_manifest(self):
'18b3be87-e26b-4376-0d8d-c1e370e90e07',
'a60c5138-3749-f7cb-8714-52d389ad5231'
),
(
'activities.activity_table',
'',
'sequencingactivity',
'sequencingactivity'
),
(
'activities.activity_type',
'',
Expand Down Expand Up @@ -2082,7 +2076,7 @@ def test_compact_manifest(self):
self._drs_uri('v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37')
),
(
'files.file_url',
'files.azul_file_url',
self._file_url('6b0f6c0f-5d80-4242-accb-840921351cd5', self.version),
self._file_url('15b76f9c-6b46-433f-851d-34e89f1b9ba6', self.version),
self._file_url('3b17377b-16b1-431c-9967-e5d01fc5923f', self.version)
Expand Down
Loading