diff --git a/Dockerfile b/Dockerfile index 080c0eb..5504d47 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ARG micromamba_version ARG micromamba_version=${micromamba_version:-1.5.3} ############# base image ################## -FROM --platform=$BUILDPLATFORM google/cloud-sdk:489.0.0-stable AS base +FROM --platform=$BUILDPLATFORM google/cloud-sdk:489.0.0-stable as base # local apt mirror support # start every stage with updated apt sources @@ -13,13 +13,13 @@ RUN apt-get update --allow-releaseinfo-change --fix-missing ############# micromamba image ################## -FROM --platform=$BUILDPLATFORM mambaorg/micromamba:${micromamba_version} AS micromamba +FROM --platform=$BUILDPLATFORM mambaorg/micromamba:${micromamba_version} as micromamba RUN echo "Getting micromamba image" ############# Build Stage: Final ################## # Build the final image -FROM base AS final +FROM base as final # if image defaults to a non-root user, then we may want to make the # next 3 ARG commands match the values in our image. diff --git a/export_large_tsv.py b/export_large_tsv.py new file mode 100644 index 0000000..eaea209 --- /dev/null +++ b/export_large_tsv.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +"""Download a remote tsv from a Terra workspace data model when it is too large to export from Terra UI.""" +from firecloud import api as fapi +from tqdm import tqdm +import argparse +import math + +DEFAULT_PAGE_SIZE = 1000 + + +def get_entity_by_page(project, workspace, entity_type, page, page_size=DEFAULT_PAGE_SIZE, sort_direction='asc', filter_terms=None): + """Get entities from workspace by page given a page_size(number of entities/rows in entity table).""" + # API = https://api.firecloud.org/#!/Entities/entityQuery + response = fapi.get_entities_query(project, workspace, entity_type, page=page, + page_size=page_size, sort_direction=sort_direction, + filter_terms=filter_terms) + + if response.status_code != 200: + print(response.text) + exit(1) + + return(response.json()) + + +def download_tsv_from_workspace(project, workspace, entity_type, tsv_name, page_size=DEFAULT_PAGE_SIZE, attr_list=None): + """Download large TSV file from Terra workspace by designated number of rows.""" + # get all entity types in workspace using API call + # API = https://api.firecloud.org/#!/Entities/getEntityTypes + response = fapi.list_entity_types(project, workspace) + if response.status_code != 200: + print(response.text) + exit(1) + + # get/report # of entities + associated attributes(column names) of input entity type + entity_types_json = response.json() + entity_count = entity_types_json[entity_type]["count"] + entity_id = entity_types_json[entity_type]["idName"] + # if user provided list of specific attributes to return, else return all attributes + if attr_list: + all_attribute_names = entity_types_json[entity_type]["attributeNames"] + attribute_names = [attr for attr in all_attribute_names if attr in attr_list] + else: + attribute_names = entity_types_json[entity_type]["attributeNames"] + + # add the entity_id value to list of attributes (not a default attribute of API response) + attribute_names.insert(0, entity_id) + + print(f'{entity_count} {entity_type}(s) to export.') + + with open(tsv_name, "w") as tsvout: + # add header with attribute values to tsv + tsvout.write("\t".join(attribute_names) + "\n") + # set starting row value and calculate number of pages + row_num = 0 + num_pages = int(math.ceil(float(entity_count) / page_size)) + + # get entities by page where each page has page_size # of rows using API call + print(f'Getting all {num_pages} pages of entity data.') + all_page_responses = [] + for page in tqdm(range(1, num_pages + 1)): + all_page_responses.append(get_entity_by_page(project, workspace, entity_type, page, page_size)) + + # for each response(page) in all_page_responses[] - contains parameter metadata + print(f'Writing {entity_count} attributes to tsv file.') + for page_response in tqdm(all_page_responses): + # for each set of attributes in results (no parameters) get attribute names and entity_id(name) + for entity_json in page_response["results"]: + attributes = entity_json["attributes"] + name = entity_json["name"] + # add name and value to dictionary of attributes + attributes[entity_id] = name + + values = [] + # for each attribute(column name) in list of attribute names(all columns for entity) + for attribute_name in attribute_names: + value = "" + # if entity's attribute(column) is in list of attributes from response, set response's attribute value + if attribute_name in attributes: + value = attributes[attribute_name] + + values.append(str(value)) + + tsvout.write("\t".join(values) + "\n") + row_num += 1 + + print(f'Finished exporting {entity_type}(s) to tsv with name {tsv_name}.') + + +if __name__ == "__main__": + # argument parser + parser = argparse.ArgumentParser(description="Exports/downloadload a TSV file from Terra when it is too large to download via the UI.") + # application arguments + parser.add_argument('-p', '--project', type=str, required=True, help='Terra namespace/project of workspace.') + parser.add_argument('-w', '--workspace', type=str, required=True, help='Name of Terra workspace.') + parser.add_argument('-e', '--entity_type', type=str, required=True, help='Entity type being requested for tsv export to local destination.') + parser.add_argument('-f', '--tsv_filename', type=str, required=True, help='Name of tsv file to be exported from Terra to local destination.') + parser.add_argument('-n', '--page_size', type=int, default=DEFAULT_PAGE_SIZE, help='Number of entities/rows to export per page.') + parser.add_argument('-a', '--attribute_list', nargs='+', help='column names to return - separated by spaces. ex. -a col1 col2') + + args = parser.parse_args() + download_tsv_from_workspace(args.project, args.workspace, args.entity_type, args.tsv_filename, args.page_size, args.attribute_list)