Skip to content

Commit

Permalink
[r] Remove support for full manifest (#3527, #2693)
Browse files Browse the repository at this point in the history
  • Loading branch information
hannes-ucsc authored and achave11-ucsc committed Oct 14, 2021
1 parent 02ab607 commit bc26ab8
Show file tree
Hide file tree
Showing 10 changed files with 12 additions and 1,199 deletions.
2 changes: 0 additions & 2 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,8 +1318,6 @@ def manifest_path_spec(*, fetch: bool):
- `{ManifestFormat.compact.value}` (the default) for a compact, tab-separated
manifest
- `{ManifestFormat.full.value}` for a full tab-separated manifest
- `{ManifestFormat.terra_bdbag.value}` for a manifest in the
[BDBag format][1]. This provides a ZIP file containing two manifests: one for
Participants (aka Donors) and one for Samples (aka Specimens). For more on the
Expand Down
6 changes: 2 additions & 4 deletions lambdas/service/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -34785,13 +34785,12 @@
"type": "string",
"enum": [
"compact",
"full",
"terra.bdbag",
"terra.pfb",
"curl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n manifest\n\n- `full` for a full tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
},
{
"name": "objectKey",
Expand Down Expand Up @@ -37760,13 +37759,12 @@
"type": "string",
"enum": [
"compact",
"full",
"terra.bdbag",
"terra.pfb",
"curl"
]
},
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n manifest\n\n- `full` for a full tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
"description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n manifest\n\n- `terra.bdbag` for a manifest in the\n [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
},
{
"name": "token",
Expand Down
26 changes: 1 addition & 25 deletions src/azul/plugins/metadata/hca/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,6 @@
from azul.plugins.metadata.hca.contributor_matrices import (
parse_strata,
)
from azul.plugins.metadata.hca.full_metadata import (
FullMetadata,
)
from azul.time import (
format_dcp2_datetime,
)
Expand Down Expand Up @@ -1564,37 +1561,16 @@ def entity_type(cls) -> str:

# Transformer whose entity type is 'bundles' (see entity_type() below).
#
# NOTE(review): this hunk is rendered without +/- markers and without
# indentation. The file header for this hunk reports 1 addition and 25
# deletions, so the lines below presumably interleave the pre-commit and
# post-commit versions of the class -- confirm against the raw diff before
# treating any line as current code.
class BundleTransformer(BundleProjectTransformer):

# NOTE(review): presumably deleted by this commit (it is the only user of
# FullMetadata, whose import is dropped in the hunk above) -- confirm.
# Precomputes self.metadata for the bundle: an empty list when the bundle
# contains a 'project.json' metadata file, otherwise the dump of a
# FullMetadata instance that the bundle was added to.
def __init__(self, bundle: Bundle, deleted: bool) -> None:
super().__init__(bundle, deleted)
if 'project.json' in bundle.metadata_files:
# we can't handle v5 bundles
self.metadata = []
else:
full_metadata = FullMetadata()
full_metadata.add_bundle(bundle)
self.metadata = full_metadata.dump()

# The entity ID of a bundle contribution is the UUID of the bundle itself;
# the `project` argument is not used.
def _get_entity_id(self, project: api.Project) -> api.UUID4:
return self.api_bundle.uuid

# Returns None (no aggregator) for the listed inner entity types and defers
# to the parent class for every other entity type.
@classmethod
def get_aggregator(cls, entity_type):
# NOTE(review): the next two `if` lines look like the removed and the added
# version of the same condition (the single addition in this hunk);
# presumably only the second one, without 'metadata', survives -- confirm.
if entity_type in ('files', 'metadata'):
if entity_type == 'files':
return None
else:
return super().get_aggregator(entity_type)

@classmethod
def entity_type(cls) -> str:
return 'bundles'

# NOTE(review): presumably deleted by this commit -- confirm.
# Stores the precomputed self.metadata under the 'metadata' key of the
# contents before delegating to the parent implementation.
def _contribution(self, contents: MutableJSON, entity_id: api.UUID4) -> Contribution:
contents['metadata'] = self.metadata
return super()._contribution(contents, entity_id)

# NOTE(review): presumably deleted by this commit -- confirm.
# Extends the parent's field types with a pass-through entry for 'metadata'.
@classmethod
def field_types(cls) -> FieldTypes:
return {
**super().field_types(),
'metadata': [pass_thru_json] # Exclude full metadata from translation
}
118 changes: 0 additions & 118 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import os
import re
import shlex
import string
from tempfile import (
TemporaryDirectory,
mkstemp,
Expand All @@ -51,7 +50,6 @@
MutableMapping,
Optional,
Protocol,
Set,
Tuple,
Type,
Union,
Expand All @@ -65,7 +63,6 @@
bdbag_api,
)
from elasticsearch_dsl import (
Q,
Search,
)
from elasticsearch_dsl.response import (
Expand Down Expand Up @@ -134,7 +131,6 @@

class ManifestFormat(Enum):
compact = 'compact'
full = 'full'
terra_bdbag = 'terra.bdbag'
terra_pfb = 'terra.pfb'
curl = 'curl'
Expand Down Expand Up @@ -1355,120 +1351,6 @@ def write_page_to(self,
return partition.last_page()


class FullManifestGenerator(PagedManifestGenerator):
    """
    Paged generator for the `full` manifest format: a tab-separated file
    with one row per entry in the `contents.metadata` list of every matching
    `bundles` document and one column per metadata field name found in the
    index.
    """

    @classmethod
    def format(cls) -> ManifestFormat:
        """
        The manifest format implemented by this generator.
        """
        return ManifestFormat.full

    @property
    def content_type(self) -> str:
        """
        The MIME type of the generated manifest.
        """
        return 'text/tab-separated-values'

    @property
    def file_name_extension(self) -> str:
        """
        The file name extension of the generated manifest.
        """
        return 'tsv'

    @property
    def entity_type(self) -> str:
        """
        The type of entity whose documents are queried for this manifest.
        """
        return 'bundles'

    @property
    def source_filter(self) -> SourceFilters:
        """
        Restrict the returned document source to the metadata rows.
        """
        return ['contents.metadata.*']

    def write_page_to(self,
                      partition: ManifestPartition,
                      output: IO[str]
                      ) -> ManifestPartition:
        """
        Write one page of manifest rows to the given output.

        The header row is only written for the first page of a partition.

        :param partition: the partition and page to write

        :param output: the text stream to write the tab-separated rows to

        :return: the partition to continue with: `partition.next_page(…)` if
                 the response contained hits, `partition.last_page()` if it
                 did not
        """
        columns = list(self.manifest_config['contents'])
        writer = csv.DictWriter(output, columns, dialect='excel-tab')

        if partition.page_index == 0:
            writer.writeheader()

        response = self._create_paged_request(partition).execute()
        if not response.hits:
            return partition.last_page()

        # At most two distinct names are collected. That is enough to tell
        # "exactly one project" apart from "more than one project".
        project_short_names = set()
        hit = None
        for hit in response.hits:
            # If source filters select a field that is an empty value in any
            # document, Elasticsearch will return an empty hit instead of a
            # hit containing the field. We use .get() to work around this.
            contents = hit.to_dict().get('contents', {})
            for metadata in contents.get('metadata', []):
                if len(project_short_names) < 2:
                    short_name = metadata['project.project_core.project_short_name']
                    project_short_names.add(short_name)
                # Start with every column set to None so that each row has
                # the full set of columns, then fill in the ones present.
                row = dict.fromkeys(columns)
                row.update(metadata)
                writer.writerow(row)
        assert hit is not None
        # The sort values of the last hit are where the next page resumes.
        search_after = tuple(hit.meta.sort)
        # Only name the file after the project if the page covers exactly one.
        file_name = project_short_names.pop() if len(project_short_names) == 1 else None
        return partition.next_page(file_name=file_name,
                                   search_after=search_after)

    @cached_property
    def manifest_config(self) -> ManifestConfig:
        """
        The column configuration of the manifest, derived from the index.

        A scripted metric aggregation collects the keys of every metadata row
        of the matching documents. Each key becomes a column; the mapping is
        sorted by key and maps every key to its last dot-separated component.
        """
        map_script = (
            '\n'
            'for (row in params._source.contents.metadata) {\n'
            'for (f in row.keySet()) {\n'
            'params._agg.fields.add(f);\n'
            '}\n'
            '}\n'
        )
        reduce_script = (
            '\n'
            'Set fields = new HashSet();\n'
            'for (agg in params._aggs) {\n'
            'fields.addAll(agg);\n'
            '}\n'
            'return new ArrayList(fields);\n'
        )
        es_search = self._create_request()
        es_search.aggs.metric('fields', 'scripted_metric',
                              init_script='params._agg.fields = new HashSet()',
                              map_script=map_script,
                              combine_script='return new ArrayList(params._agg.fields)',
                              reduce_script=reduce_script)
        # Only the aggregation is of interest, not the hits
        es_search = es_search.extra(size=0)
        fields = self._partitioned_search(es_search)
        return {
            'contents': {
                field: field.split('.')[-1]
                for field in sorted(fields)
            }
        }

    def _partitioned_search(self, es_search: Search) -> Set[str]:
        """
        Partition ES request by prefix and execute sequentially to avoid timeouts

        :param es_search: the aggregation request to partition; it is
                          executed once per leading hexadecimal digit of the
                          `bundles.uuid.keyword` field

        :return: the union of the field names returned by all partitions
        """

        def execute(search: Search) -> List[str]:
            response = search.execute()
            # Script failures could still come back as a successful response
            # with one or more failed shards.
            # noinspection PyProtectedMember
            assert response._shards.failed == 0, response._shards.failures
            assert len(response.hits) == 0, response.hits
            return response.aggregations.fields.value

        # A monotonic clock is unaffected by wall-clock adjustments and is
        # therefore the right choice for measuring a duration.
        start = time.monotonic()
        fields: Set[str] = set()
        # The first 16 characters of string.hexdigits are '0' through 'f'
        for prefix in string.hexdigits[:16]:
            prefix_query = Q('bool',
                             must=Q('prefix',
                                    **{'bundles.uuid.keyword': prefix}))
            fields.update(execute(es_search.query(prefix_query)))
        logger.info('Elasticsearch partitioned requests completed after %.003fs',
                    time.monotonic() - start)
        return fields


FQID = Tuple[str, str]
Qualifier = str

Expand Down
Loading

0 comments on commit bc26ab8

Please sign in to comment.