[r] Remove support for full manifest (#3527, #2693)

DataBiosphere · Oct 14, 2021 · bc26ab8 · bc26ab8
1 parent 02ab607
commit bc26ab8
Show file tree

Hide file tree

Showing 10 changed files with 12 additions and 1,199 deletions.
diff --git a/...zul/plugins/metadata/hca/full_metadata.py → ...zul/plugins/metadata/hca/full_metadata.py b/...zul/plugins/metadata/hca/full_metadata.py → ...zul/plugins/metadata/hca/full_metadata.py
diff --git a/lambdas/service/app.py b/lambdas/service/app.py
@@ -1318,8 +1318,6 @@ def manifest_path_spec(*, fetch: bool):
                 - `{ManifestFormat.compact.value}` (the default) for a compact, tab-separated
                   manifest
 
-                - `{ManifestFormat.full.value}` for a full tab-separated manifest
-
                 - `{ManifestFormat.terra_bdbag.value}` for a manifest in the
                   [BDBag format][1]. This provides a ZIP file containing two manifests: one for
                   Participants (aka Donors) and one for Samples (aka Specimens). For more on the

diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json
@@ -34785,13 +34785,12 @@
                         "type": "string",
                         "enum": [
                             "compact",
-                            "full",
                             "terra.bdbag",
                             "terra.pfb",
                             "curl"
                         ]
                     },
-                    "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n  manifest\n\n- `full` for a full tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n  [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n  Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n  format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n  format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
+                    "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n  manifest\n\n- `terra.bdbag` for a manifest in the\n  [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n  Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n  format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n  format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
                 },
                 {
                     "name": "objectKey",
@@ -37760,13 +37759,12 @@
                         "type": "string",
                         "enum": [
                             "compact",
-                            "full",
                             "terra.bdbag",
                             "terra.pfb",
                             "curl"
                         ]
                     },
-                    "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n  manifest\n\n- `full` for a full tab-separated manifest\n\n- `terra.bdbag` for a manifest in the\n  [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n  Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n  format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n  format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
+                    "description": "\nThe desired format of the output.\n\n- `compact` (the default) for a compact, tab-separated\n  manifest\n\n- `terra.bdbag` for a manifest in the\n  [BDBag format][1]. This provides a ZIP file containing two manifests: one for\n  Participants (aka Donors) and one for Samples (aka Specimens). For more on the\n  format of the manifests see [documentation here][2].\n\n- `terra.pfb` for a manifest in the [PFB format][3]. This\n  format is mainly used for exporting data to Terra.\n\n- `curl` for a [curl configuration file][4] manifest.\nThis manifest can be used with the curl program to download all the files listed\nin the manifest.\n\n[1]: http://bd2k.ini.usc.edu/tools/bdbag/\n\n[2]: https://software.broadinstitute.org/firecloud/documentation/article?id=10954\n\n[3]: https://github.com/uc-cdis/pypfb\n\n[4]: https://curl.haxx.se/docs/manpage.html#-K\n"
                 },
                 {
                     "name": "token",

diff --git a/src/azul/plugins/metadata/hca/transform.py b/src/azul/plugins/metadata/hca/transform.py
@@ -94,9 +94,6 @@
 from azul.plugins.metadata.hca.contributor_matrices import (
     parse_strata,
 )
-from azul.plugins.metadata.hca.full_metadata import (
-    FullMetadata,
-)
 from azul.time import (
     format_dcp2_datetime,
 )
@@ -1564,37 +1561,16 @@ def entity_type(cls) -> str:
 
 class BundleTransformer(BundleProjectTransformer):
 
-    def __init__(self, bundle: Bundle, deleted: bool) -> None:
-        super().__init__(bundle, deleted)
-        if 'project.json' in bundle.metadata_files:
-            # we can't handle v5 bundles
-            self.metadata = []
-        else:
-            full_metadata = FullMetadata()
-            full_metadata.add_bundle(bundle)
-            self.metadata = full_metadata.dump()
-
     def _get_entity_id(self, project: api.Project) -> api.UUID4:
         return self.api_bundle.uuid
 
     @classmethod
     def get_aggregator(cls, entity_type):
-        if entity_type in ('files', 'metadata'):
+        if entity_type == 'files':
             return None
         else:
             return super().get_aggregator(entity_type)
 
     @classmethod
     def entity_type(cls) -> str:
         return 'bundles'
-
-    def _contribution(self, contents: MutableJSON, entity_id: api.UUID4) -> Contribution:
-        contents['metadata'] = self.metadata
-        return super()._contribution(contents, entity_id)
-
-    @classmethod
-    def field_types(cls) -> FieldTypes:
-        return {
-            **super().field_types(),
-            'metadata': [pass_thru_json]  # Exclude full metadata from translation
-        }
diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py
@@ -36,7 +36,6 @@
 import os
 import re
 import shlex
-import string
 from tempfile import (
     TemporaryDirectory,
     mkstemp,
@@ -51,7 +50,6 @@
     MutableMapping,
     Optional,
     Protocol,
-    Set,
     Tuple,
     Type,
     Union,
@@ -65,7 +63,6 @@
     bdbag_api,
 )
 from elasticsearch_dsl import (
-    Q,
     Search,
 )
 from elasticsearch_dsl.response import (
@@ -134,7 +131,6 @@
 
 class ManifestFormat(Enum):
     compact = 'compact'
-    full = 'full'
     terra_bdbag = 'terra.bdbag'
     terra_pfb = 'terra.pfb'
     curl = 'curl'
@@ -1355,120 +1351,6 @@ def write_page_to(self,
             return partition.last_page()
 
 
-class FullManifestGenerator(PagedManifestGenerator):
-
-    @classmethod
-    def format(cls) -> ManifestFormat:
-        return ManifestFormat.full
-
-    @property
-    def content_type(self) -> str:
-        return 'text/tab-separated-values'
-
-    @property
-    def file_name_extension(self):
-        return 'tsv'
-
-    @property
-    def entity_type(self) -> str:
-        return 'bundles'
-
-    @property
-    def source_filter(self) -> SourceFilters:
-        return ['contents.metadata.*']
-
-    def write_page_to(self,
-                      partition: ManifestPartition,
-                      output: IO[str]
-                      ) -> ManifestPartition:
-        sources = list(self.manifest_config['contents'].keys())
-        writer = csv.DictWriter(output, sources, dialect='excel-tab')
-
-        if partition.page_index == 0:
-            writer.writeheader()
-
-        request = self._create_paged_request(partition)
-        response = request.execute()
-        if response.hits:
-            project_short_names = set()
-            hit = None
-            for hit in response.hits:
-                # If source filters select a field that is an empty value in any
-                # document, Elasticsearch will return an empty hit instead of a
-                # hit containing the field. We use .get() to work around this.
-                to_dict = hit.to_dict()
-                contents = to_dict.get('contents', {})
-                for metadata in list(contents.get('metadata', [])):
-                    if len(project_short_names) < 2:
-                        project_short_names.add(metadata['project.project_core.project_short_name'])
-                    row = dict.fromkeys(sources)
-                    row.update(metadata)
-                    writer.writerow(row)
-            assert hit is not None
-            search_after = tuple(hit.meta.sort)
-            file_name = project_short_names.pop() if len(project_short_names) == 1 else None
-            return partition.next_page(file_name=file_name,
-                                       search_after=search_after)
-        else:
-            return partition.last_page()
-
-    @cached_property
-    def manifest_config(self) -> ManifestConfig:
-        es_search = self._create_request()
-        map_script = '''
-                for (row in params._source.contents.metadata) {
-                    for (f in row.keySet()) {
-                        params._agg.fields.add(f);
-                    }
-                }
-            '''
-        reduce_script = '''
-                Set fields = new HashSet();
-                for (agg in params._aggs) {
-                    fields.addAll(agg);
-                }
-                return new ArrayList(fields);
-            '''
-        es_search.aggs.metric('fields', 'scripted_metric',
-                              init_script='params._agg.fields = new HashSet()',
-                              map_script=map_script,
-                              combine_script='return new ArrayList(params._agg.fields)',
-                              reduce_script=reduce_script)
-        es_search = es_search.extra(size=0)
-        fields = self._partitioned_search(es_search)
-        return {
-            'contents': {
-                value: value.split('.')[-1]
-                for value in sorted(fields)
-            }
-        }
-
-    def _partitioned_search(self, es_search: Search) -> Set[str]:
-        """
-        Partition ES request by prefix and execute sequentially to avoid timeouts
-        """
-
-        def execute(es_search: Search) -> List[str]:
-            response = es_search.execute()
-            # Script failures could still come back as a successful response
-            # with one or more failed shards.
-            # noinspection PyProtectedMember
-            assert response._shards.failed == 0, response._shards.failures
-            assert len(response.hits) == 0, response.hits
-            return response.aggregations.fields.value
-
-        start = time.time()
-        fields = []
-        for prefix in string.hexdigits[:16]:
-            es_query = es_search.query(Q('bool',
-                                         must=Q('prefix',
-                                                **{'bundles.uuid.keyword': prefix})))
-            fields.append(execute(es_query))
-        logger.info('Elasticsearch partitioned requests completed after %.003fs',
-                    time.time() - start)
-        return set(chain(*fields))
-
-
 FQID = Tuple[str, str]
 Qualifier = str