Skip to content

Commit

Permalink
[r] Fix: Compact manifest rows for related files have main file's DRS…
Browse files Browse the repository at this point in the history
… URI (#2846)
  • Loading branch information
achave11-ucsc committed Apr 23, 2021
1 parent 359029e commit aa977e7
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 14 deletions.
10 changes: 10 additions & 0 deletions src/azul/plugins/metadata/hca/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,7 @@ def _file(self, file: api.File, related_files: Iterable[api.File] = ()) -> Mutab
@classmethod
def _related_file_types(cls) -> FieldTypes:
return {
'content-type': null_str,
'name': null_str,
'crc32c': null_str,
'sha256': null_str,
Expand All @@ -773,6 +774,7 @@ def _related_file_types(cls) -> FieldTypes:

def _related_file(self, file: api.File) -> MutableJSON:
return {
'content-type': file.manifest_entry.content_type,
'name': file.manifest_entry.name,
'crc32c': file.manifest_entry.crc32c,
'sha256': file.manifest_entry.sha256,
Expand Down Expand Up @@ -1086,6 +1088,14 @@ def _protocols(self, visitor) -> Mapping[str, JSONs]:
)
}

@classmethod
def validate_class(cls):
    """
    Assert that every field declared for related files is also declared
    for regular files.

    Raises AssertionError if the related-file field types contain a key
    that the file field types do not.
    """
    related_fields = cls._related_file_types().keys()
    file_fields = cls._file_types().keys()
    # Manifest generation depends on this:
    assert related_fields <= file_fields


# Check that the related-file field types are a subset of the file field types
# on the base transformer.
# NOTE(review): presumably this runs at module import time so a mismatch fails
# fast — confirm placement in the full file.
BaseTransformer.validate_class()


def _parse_zarr_file_name(file_name: str) -> Tuple[bool, Optional[str], Optional[str]]:
file_name = file_name.split('.zarr/')
Expand Down
25 changes: 12 additions & 13 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
from typing import (
Any,
IO,
Iterable,
List,
Mapping,
MutableMapping,
Expand Down Expand Up @@ -110,7 +109,6 @@
from azul.types import (
JSON,
JSONs,
MutableJSON,
)
from azul.vendored.frozendict import (
frozendict,
Expand Down Expand Up @@ -911,26 +909,27 @@ def write_to(self, output: IO[str]) -> Optional[str]:
for bundle in list(doc['bundles'])[0:100]: # iterate over copy …
doc['bundles'] = [bundle] # … to facilitate this in-place modification
row = {}
related_rows = []
for doc_path, column_mapping in self.manifest_config.items():
entities = [
dict(e, file_url=file_url)
for e in self._get_entities(doc_path, doc)
]
self._extract_fields(entities, column_mapping, row)
if doc_path == 'contents.files':
entity = one(entities)
if 'related_files' in entity:
for file in entity['related_files']:
related_row = {}
entity.update(file)
self._extract_fields([entity], column_mapping, related_row)
related_rows.append(related_row)
writer.writerow(row)
writer.writerows(self._get_related_rows(doc, row))
for related in related_rows:
row.update(related)
writer.writerow(row)
return None

def _get_related_rows(self, doc: JSON, row: MutableJSON) -> Iterable[MutableJSON]:
    """
    Yield one manifest row for each related file of the single file in `doc`.

    Each yielded row is a shallow copy of `row`, overlaid with the related
    file's fields, where every key is prefixed with `file_`. The related
    file's `drs_path` entry is left out of the row.

    Neither `doc` nor `row` is modified. The key is filtered out while
    building the new row instead of being deleted from `doc`, so calling
    this method repeatedly on the same document is safe.

    :param doc: a document whose `contents.files` holds exactly one file

    :param row: the row already produced for the main file

    :return: a lazily evaluated iterable of new rows; empty if the file
             has no `related_files` entry
    """
    file_ = one(doc['contents']['files'])
    # `related_files` is treated as optional here; a file without it simply
    # contributes no additional rows.
    for related in file_.get('related_files', ()):
        # FIXME: Properly provision related_files in row
        #        https://github.com/DataBiosphere/azul/issues/2846
        new_row = row.copy()
        new_row.update({
            'file_' + k: v
            for k, v in related.items()
            if k != 'drs_path'
        })
        yield new_row


class FullManifestGenerator(StreamingManifestGenerator):

Expand Down
19 changes: 18 additions & 1 deletion test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,83 +342,99 @@ def test_manifest_zarr(self):
'file_crc32c': '4e75003e',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/.zattrs',
'file_uuid': 'c1c4a2bc-b5fb-4083-af64-f5dec70d7f9d',
'file_drs_uri': '/c1c4a2bc-b5fb-4083-af64-f5dec70d7f9d?version=2018-10-10T031037.983672Z',
'specimen_from_organism_organ': 'brain'
},
# Related files from zarray store
{
'file_crc32c': '444a7707',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/.zgroup',
'file_uuid': '54541cc5-9010-425b-9037-22e43948c97c',
'file_drs_uri': '/54541cc5-9010-425b-9037-22e43948c97c?version=2018-10-10T031038.239541Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': '444a7707',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/.zgroup',
'file_uuid': '66b8f976-6f1e-45b3-bd97-069658c3c847',
'file_drs_uri': '/66b8f976-6f1e-45b3-bd97-069658c3c847?version=2018-10-10T031038.474167Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'c6ab0701',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/cell_id/.zarray',
'file_uuid': 'ac05d7fb-d6b9-4ab1-8c04-6211450dbb62',
'file_drs_uri': '/ac05d7fb-d6b9-4ab1-8c04-6211450dbb62?version=2018-10-10T031038.714461Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'cd2fd51f',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/cell_id/0.0',
'file_uuid': '0c518a52-f315-4ea2-beed-1c9d8f2d802b',
'file_drs_uri': '/0c518a52-f315-4ea2-beed-1c9d8f2d802b?version=2018-10-10T031039.039270Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'b89e6723',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/expression/.zarray',
'file_uuid': '136108ab-277e-47a4-acc3-1feed8fb2f25',
'file_drs_uri': '/136108ab-277e-47a4-acc3-1feed8fb2f25?version=2018-10-10T031039.426609Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'caaefa77',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/expression/0.0',
'file_uuid': '0bef5419-739c-4a2c-aedb-43754d55d51c',
'file_drs_uri': '/0bef5419-739c-4a2c-aedb-43754d55d51c?version=2018-10-10T031039.642846Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'f629ec34',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/gene_id/.zarray',
'file_uuid': '3a5f7299-1aa1-4060-9631-212c29b4d807',
'file_drs_uri': '/3a5f7299-1aa1-4060-9631-212c29b4d807?version=2018-10-10T031039.899615Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': '59d86b68',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/gene_id/0.0',
'file_uuid': 'a8f0dc39-6019-4fc7-899d-4e34a48d03e5',
'file_drs_uri': '/a8f0dc39-6019-4fc7-899d-4e34a48d03e5?version=2018-10-10T031040.113268Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': '25d193cf',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/qc_metric/.zarray',
'file_uuid': '68ba4711-1447-42ac-aa40-9c0e4cda1666',
'file_drs_uri': '/68ba4711-1447-42ac-aa40-9c0e4cda1666?version=2018-10-10T031040.583439Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': '17a84191',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/qc_metric/0.0',
'file_uuid': '27e66328-e337-4bcd-ba15-7893ecaf841f',
'file_drs_uri': '/27e66328-e337-4bcd-ba15-7893ecaf841f?version=2018-10-10T031040.801631Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': '25d193cf',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/qc_values/.zarray',
'file_uuid': '2ab1a516-ef36-41b6-a78f-513361658feb',
'file_drs_uri': '/2ab1a516-ef36-41b6-a78f-513361658feb?version=2018-10-10T031040.958708Z',
'specimen_from_organism_organ': 'brain'
},
{
'file_crc32c': 'bdc30523',
'file_name': '377f2f5a-4a45-4c62-8fb0-db9ef33f5cf0.zarr/expression_matrix/qc_values/0.0',
'file_uuid': '351970aa-bc4c-405e-a274-be9e08e42e98',
'file_drs_uri': '/351970aa-bc4c-405e-a274-be9e08e42e98?version=2018-10-10T031041.135992Z',
'specimen_from_organism_organ': 'brain'
}
]

for entry in expected:
url = furl(entry['file_drs_uri'])
entry['file_drs_uri'] = url.set(scheme='drs',
host=config.drs_domain).url
bundle_fqid = self.bundle_fqid(uuid='587d74b4-1075-4bbf-b96a-4d1ede0481b2',
version='2018-10-10T022343.182000Z')
self._index_canned_bundle(bundle_fqid)
Expand All @@ -440,6 +456,7 @@ def test_manifest_zarr(self):
rows = [dict(file_crc32c=row['file_crc32c'],
file_name=row['file_name'],
file_uuid=row['file_uuid'],
file_drs_uri=row['file_drs_uri'],
specimen_from_organism_organ=row['specimen_from_organism.organ']) for row in rows]
self.assertEqual(expected, rows)

Expand Down

0 comments on commit aa977e7

Please sign in to comment.