Skip to content

Commit

Permalink
fixup! Add JSONL-based verbatim manifest format (#6028)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Mar 15, 2024
1 parent d4841b1 commit c4544d8
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 21 deletions.
7 changes: 7 additions & 0 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1398,13 +1398,20 @@ def manifest_route(*, fetch: bool, initiate: bool):
file][4] manifest. This manifest can be used with the curl
program to download all the files listed in the manifest.
- `{ManifestFormat.verbatim_jsonl.value}` for a verbatim
manifest in [JSONL][5] format. Each line contains an
untransformed metadata entity from the underlying
repository.
[1]: https://bd2k.ini.usc.edu/tools/bdbag/
[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954
[3]: https://github.com/uc-cdis/pypfb
[4]: https://curl.haxx.se/docs/manpage.html#-K
[5]: https://jsonlines.org/
'''
)
] if initiate else [],
Expand Down
4 changes: 2 additions & 2 deletions lambdas/service/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -9482,7 +9482,7 @@
"verbatim.jsonl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n untransformed metadata entity from the underlying\n repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
}
],
"responses": {
Expand Down Expand Up @@ -10890,7 +10890,7 @@
"verbatim.jsonl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n untransformed metadata entity from the underlying\n repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
}
],
"responses": {
Expand Down
49 changes: 30 additions & 19 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2011,6 +2011,28 @@ def qualify(qualifier, column_name, index=None):
bundle_tsv_writer.writerow(row)


@attrs.frozen(kw_only=True)
class ReplicaKeys:
    """
    The pair of identifiers used to look up the replicas that belong to one
    hub.

    Most replicas contain a list of the replica's hubs. However, some entities
    (e.g. projects) have too many hubs to track within the replica document.
    Replicas of such entities are instead retrieved by their entity ID.
    """
    # Matched against the `hub_ids.keyword` field by `prepare_query`
    hub_id: str
    # Matched against the `entity_id.keyword` field by `prepare_query`
    implicit_hub_entity_id: str

    @classmethod
    def prepare_query(cls, keys: Iterable[Self]) -> Q:
        """
        Combine any number of keys into a single `bool` query with two
        `should` clauses: a `terms` clause on `hub_ids.keyword` listing the
        distinct hub IDs, followed by a `terms` clause on `entity_id.keyword`
        listing the distinct implicit hub entity IDs.

        :param keys: the keys to fold into the query; iterated exactly once

        :return: the combined query
        """
        hub_ids, entity_ids = set(), set()
        for key in keys:
            hub_ids.add(key.hub_id)
            entity_ids.add(key.implicit_hub_entity_id)
        # The insertion order of this mapping fixes the order of the clauses
        values_by_field = {'hub_ids': hub_ids, 'entity_id': entity_ids}
        clauses = []
        for field, values in values_by_field.items():
            clauses.append({'terms': {f'{field}.keyword': list(values)}})
        return Q('bool', should=clauses)


class VerbatimManifestGenerator(FileBasedManifestGenerator):

@property
Expand Down Expand Up @@ -2043,17 +2065,13 @@ def included_fields(self) -> list[FieldPath]:
def implicit_hub_type(self) -> str:
return self.service.metadata_plugin(self.catalog).implicit_hub_type

def _replica_keys(self) -> Iterable[dict[str, str]]:
def _replica_keys(self) -> Iterable[ReplicaKeys]:
request = self._create_request()
for hit in request.scan():
yield {
# Most replicas track their hubs explicitly, however...
'hub_ids': hit['entity_id'],
# ... for projects and datasets, there are too many hubs to
# track them all in the replica, so they are instead retrieved
# by entity ID.
'entity_id': one(one(hit['contents'][self.implicit_hub_type])['document_id'])
}
yield ReplicaKeys(
hub_id=hit['entity_id'],
implicit_hub_entity_id=one(one(hit['contents'][self.implicit_hub_type])['document_id'])
)

def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids = set()
Expand All @@ -2076,20 +2094,13 @@ def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids.add(replica_id)

def _join_replicas(self,
keys_page: Iterable[dict[str, str]]
keys_page: Iterable[ReplicaKeys]
) -> Iterable[Hit]:
terms_by_field = defaultdict(set)
for keys in keys_page:
for field, term in keys.items():
terms_by_field[field].add(term)

request = self.service.create_request(catalog=self.catalog,
entity_type='replica',
doc_type=DocumentType.replica)
request = request.query(Q('bool', should=[
{'terms': {f'{field}.keyword': list(terms)}}
for field, terms in terms_by_field.items()
]
))
request = request.query(ReplicaKeys.prepare_query(keys_page))
return request.scan()

def create_file(self) -> tuple[str, Optional[str]]:
Expand Down

0 comments on commit c4544d8

Please sign in to comment.