Skip to content

Commit

Permalink
fixup! Add JSONL-based verbatim manifest format (#6028)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Mar 21, 2024
1 parent 63b7fe3 commit 7855018
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 37 deletions.
3 changes: 1 addition & 2 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,8 +1400,7 @@ def manifest_route(*, fetch: bool, initiate: bool):
- `{ManifestFormat.verbatim_jsonl.value}` for a verbatim
manifest in [JSONL][5] format. Each line contains an
untransformed metadata entity from the underlying
repository.
unaltered metadata entity from the underlying repository.
[1]: https://bd2k.ini.usc.edu/tools/bdbag/
Expand Down
4 changes: 2 additions & 2 deletions lambdas/service/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -9482,7 +9482,7 @@
"verbatim.jsonl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n untransformed metadata entity from the underlying\n repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
}
],
"responses": {
Expand Down Expand Up @@ -10890,7 +10890,7 @@
"verbatim.jsonl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n untransformed metadata entity from the underlying\n repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact,\n tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two\n manifests: one for Participants (aka Donors) and one for\n Samples (aka Specimens). For more on the format of the\n manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB\n format][3]. This format is mainly used for exporting data to\n Terra.\n\n- `curl` for a [curl configuration\n file][4] manifest. This manifest can be used with the curl\n program to download all the files listed in the manifest.\n\n- `verbatim.jsonl` for a verbatim\n manifest in [JSONL][5] format. Each line contains an\n unaltered metadata entity from the underlying repository.\n\n[1]: https://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n\n[5]: https://jsonlines.org/\n"
}
],
"responses": {
Expand Down
4 changes: 4 additions & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,10 @@ def source_id_field(self) -> str:
@property
@abstractmethod
def implicit_hub_type(self) -> str:
"""
The type of entities that do not explicitly track their hubs in replica
documents.
"""
raise NotImplementedError

@property
Expand Down
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from azul import (
config,
iif,
)
from azul.indexer.document import (
DocumentType,
Expand Down Expand Up @@ -69,7 +70,7 @@ def manifest_formats(self) -> Sequence[ManifestFormat]:
return [
ManifestFormat.compact,
ManifestFormat.terra_pfb,
*([ManifestFormat.verbatim_jsonl] if config.enable_replicas else [])
*iif(config.enable_replicas, [ManifestFormat.verbatim_jsonl])
]

def transformer_types(self) -> Iterable[Type[BaseTransformer]]:
Expand Down
2 changes: 1 addition & 1 deletion src/azul/service/elasticsearch_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ def create_request(self,
) -> Search:
"""
Create an Elasticsearch request against the index containing documents
for the given entity+document type in the given catalog.
of the given entity and document types, in the given catalog.
"""
return Search(using=self._es_client,
index=str(IndexName.create(catalog=catalog,
Expand Down
57 changes: 26 additions & 31 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2008,28 +2008,6 @@ def qualify(qualifier, column_name, index=None):
bundle_tsv_writer.writerow(row)


@attrs.frozen(kw_only=True)
class ReplicaKeys:
"""
Most replicas contain a list of the replica's hubs. However, some entities
(e.g. projects) have too many hubs to track within the replica document.
Replicas of such entities are instead retrieved by their entity ID.
"""
hub_id: str
implicit_hub_entity_id: str

@classmethod
def prepare_query(cls, keys: Iterable[Self]) -> Q:
terms = {'hub_ids': set(), 'entity_id': set()}
for self in keys:
terms['hub_ids'].add(self.hub_id)
terms['entity_id'].add(self.implicit_hub_entity_id)
return Q('bool', should=[
{'terms': {f'{field}.keyword': list(values)}}
for field, values in terms.items()
])


class VerbatimManifestGenerator(FileBasedManifestGenerator):

@property
Expand Down Expand Up @@ -2062,13 +2040,26 @@ def included_fields(self) -> list[FieldPath]:
def implicit_hub_type(self) -> str:
return self.service.metadata_plugin(self.catalog).implicit_hub_type

@attrs.frozen(kw_only=True)
class ReplicaKeys:
"""
Most replicas contain a list of the entity IDs of their hubs, usually
file entities. However, some low-cardinality entities like HCA projects
have too many hubs to track within their replica document.
This class captures the information needed to locate all replicas
associated with a given hub entity, either using the hub's entity ID
or the replica's entity ID.
"""
hub_id: str
replica_id: str

def _replica_keys(self) -> Iterable[ReplicaKeys]:
hub_type = self.implicit_hub_type
request = self._create_request()
for hit in request.scan():
yield ReplicaKeys(
hub_id=hit['entity_id'],
implicit_hub_entity_id=one(one(hit['contents'][self.implicit_hub_type])['document_id'])
)
yield self.ReplicaKeys(hub_id=hit['entity_id'],
replica_id=one(one(hit['contents'][hub_type])['document_id']))

def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids = set()
Expand All @@ -2090,14 +2081,18 @@ def _all_replicas(self) -> Iterable[JSON]:
if explicit_hub_count != 1:
emitted_replica_ids.add(replica_id)

def _join_replicas(self,
keys_page: Iterable[ReplicaKeys]
) -> Iterable[Hit]:

def _join_replicas(self, keys: Iterable[ReplicaKeys]) -> Iterable[Hit]:
request = self.service.create_request(catalog=self.catalog,
entity_type='replica',
doc_type=DocumentType.replica)
request = request.query(ReplicaKeys.prepare_query(keys_page))
hub_ids, replica_ids = set(), set()
for key in keys:
hub_ids.add(key.hub_id)
replica_ids.add(key.replica_id)
request = request.query(Q('bool', should=[
{'terms': {'hub_ids.keyword': list(hub_ids)}},
{'terms': {'entity_id.keyword': list(replica_ids)}}
]))
return request.scan()

def create_file(self) -> tuple[str, Optional[str]]:
Expand Down

0 comments on commit 7855018

Please sign in to comment.