From ed853c1c5dc9c143e8f9e61def3e98a534b0d978 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 14 Sep 2021 23:35:20 +0100 Subject: [PATCH 01/14] Replace requests with urllib --- bin/sra_ids_to_runinfo.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 1480e8f2..41a5c0ee 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -5,8 +5,9 @@ import sys import csv import errno -import requests import argparse +from urllib.request import urlopen +from urllib.error import URLError, HTTPError ## Example ids supported by this script SRA_IDS = ['PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814'] @@ -61,13 +62,17 @@ def make_dir(path): def fetch_url(url, encoding='utf-8'): try: - r = requests.get(url) - except requests.exceptions.RequestException as e: - raise SystemExit(e) - if r.status_code != 200: - print("ERROR: Connection failed\nError code '{}'".format(r.status_code)) + with urlopen(url) as f: + r = f.read().decode(encoding).splitlines() + except HTTPError as e: + print('The server couldn\'t fulfill the request.') + print('Error code: {}'.format(e.code)) sys.exit(1) - return r.content.decode(encoding).splitlines() + except URLError as e: + print('We failed to reach a server.') + print('Reason: {}'.format(e.reason)) + sys.exit(1) + return r def id_to_srx(db_id): ids = [] From 79444d5605404c85ba20404ffdc86185a6c51321 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 14 Sep 2021 23:35:35 +0100 Subject: [PATCH 02/14] Bump Python version to 3.9.5 --- modules/local/get_software_versions.nf | 6 +++--- modules/local/multiqc_mappings_config.nf | 6 +++--- modules/local/sra_ids_to_runinfo.nf | 6 +++--- modules/local/sra_runinfo_to_ftp.nf | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 4d37bd6a..65b7f340 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -8,11 +8,11 @@ process GET_SOFTWARE_VERSIONS { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } cache false diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 63121b40..8360cd34 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -8,11 +8,11 @@ process MULTIQC_MAPPINGS_CONFIG { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } input: diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index b277197e..3d3fc063 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "biocontainers/biocontainers:v1.2.0_cv1" + container "quay.io/biocontainers/python:3.9--1" } input: diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index b0421aea..f426f4ab 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -8,11 +8,11 @@ process SRA_RUNINFO_TO_FTP { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } input: From 74b7f148b5462998d8b5f485ad0bad55f73e2ef3 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 14 Sep 2021 23:35:43 +0100 Subject: [PATCH 03/14] Update CHANGELOG --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index be77eb6f..eae33fae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15 + +### Enhancements & fixes + +* Replaced Python `requests` with `urllib` to fetch ENA metadata + +### Software dependencies + +| Dependency | Old version | New version | +|-------------|-------------|-------------| +| `python` | 3.8.3 | 3.9.5 | + ## [[1.2](https://github.com/nf-core/fetchngs/releases/tag/1.2)] - 2021-07-28 ### Enhancements & fixes From 48c53f268f2d7738df1f9c0c09cc81e378079cf3 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 14 Sep 2021 23:35:54 +0100 Subject: [PATCH 04/14] Bump pipeline version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 3b3ea780..52382140 100644 --- a/nextflow.config +++ b/nextflow.config @@ -147,7 +147,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.2' + version = '1.3' } // Function to ensure that resource requirements don't go beyond From 57a303ca77439a7334b0517303c8bb243c836765 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 14:21:59 +0200 Subject: [PATCH 05/14] refactor: introduce logging to replace print --- bin/sra_ids_to_runinfo.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 41a5c0ee..24c6af0d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -6,9 +6,13 @@ import csv import errno import argparse +import logging from urllib.request import urlopen from urllib.error import URLError, HTTPError + +logger = logging.getLogger() + ## Example ids supported by this script SRA_IDS = ['PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814'] ENA_IDS = ['ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481'] @@ -48,7 +52,7 @@ def validate_csv_param(param, valid_vals, param_desc): if len(intersect) == len(user_vals): valid_list = intersect else: - print("ERROR: Please provide a valid value for {}!\nProvided values = {}\nAccepted values = {}".format(param_desc,param,','.join(valid_vals))) + logger.error(f"Please provide a valid value for {param_desc}!\nProvided values = {param}\nAccepted values = {','.join(valid_vals)}") sys.exit(1) return valid_list @@ -65,12 +69,12 @@ def fetch_url(url, encoding='utf-8'): with urlopen(url) as f: r = f.read().decode(encoding).splitlines() except HTTPError as e: - print('The server couldn\'t fulfill the request.') - print('Error code: {}'.format(e.code)) + logger.error("The server couldn't fulfill the request.") + logger.error(f"Status: {e.code} {e.reason}") sys.exit(1) except URLError as e: - print('We failed to reach a server.') - print('Reason: {}'.format(e.reason)) + logger.error('We failed to reach a server.') + logger.error(f"Reason: {e.reason}") sys.exit(1) return r @@ -143,7 +147,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS fout.write('{}\n'.format('\t'.join(header))) else: if header != row.keys(): - print("ERROR: Metadata columns do not match for id {}!\nLine: '{}'".format(run_id,line.strip())) + logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'") sys.exit(1) fout.write('{}\n'.format('\t'.join([row[x] for x in header]))) total_out += 1 @@ -151,16 +155,16 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS seen_ids.append(db_id) if not ids: - print("ERROR: No matches found for database id {}!\nLine: '{}'".format(db_id,line.strip())) + logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") sys.exit(1) else: id_str = ', '.join([x + "*" for x in PREFIX_LIST]) - print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip())) + logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'") sys.exit(1) else: id_str = ', '.join([x + "*" for x in PREFIX_LIST]) - print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip())) + logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'") sys.exit(1) def main(args=None): @@ -172,4 +176,5 @@ def main(args=None): fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields) if __name__ == '__main__': + logging.basicConfig(level='INFO', format='[%(levelname)s] %(message)s') sys.exit(main()) From 2e132dcced092335dd2fcbc5460f4f53e63928d8 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 14:25:05 +0200 Subject: [PATCH 06/14] refactor: make global constants immutable tuples --- bin/sra_ids_to_runinfo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 24c6af0d..eadfd8e5 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -14,15 +14,15 @@ logger = logging.getLogger() ## Example ids supported by this script -SRA_IDS = ['PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814'] -ENA_IDS = ['ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481'] -GEO_IDS = ['GSE18729', 'GSM465244'] +SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814') +ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481') +GEO_IDS = ('GSE18729', 'GSM465244') ID_REGEX = r'^[A-Z]+' PREFIX_LIST = sorted(list(set([re.search(ID_REGEX,x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS]))) ## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields ## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run -ENA_METADATA_FIELDS = [ +ENA_METADATA_FIELDS = ( 'accession', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession', 'study_accession', 'secondary_study_accession', 'parent_study', 'submission_accession', 'run_alias', 'experiment_alias', 'sample_alias', 'study_alias', 'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name', @@ -32,7 +32,7 @@ 'sample_title', 'experiment_title', 'study_title', 'description', 'sample_description', 'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera' -] + ) def parse_args(args=None): Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.' From 99c9a620096a1d9e41a1aa664d2abc85f7d8eb1a Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 14:29:49 +0200 Subject: [PATCH 07/14] refactor: use compiled regex pattern --- bin/sra_ids_to_runinfo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index eadfd8e5..c70df48d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -17,8 +17,8 @@ SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814') ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481') GEO_IDS = ('GSE18729', 'GSM465244') -ID_REGEX = r'^[A-Z]+' -PREFIX_LIST = sorted(list(set([re.search(ID_REGEX,x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS]))) +ID_REGEX = re.compile(r'[A-Z]+') +PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS}) ## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields ## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run @@ -116,7 +116,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS with open(file_in,"r") as fin, open(file_out,"w") as fout: for line in fin: db_id = line.strip() - match = re.search(ID_REGEX, db_id) + match = ID_REGEX.match(db_id) if match: prefix = match.group() if prefix in PREFIX_LIST: From 2ea9dd7173e449f93650169936f195a4ec985ac2 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 14:32:54 +0200 Subject: [PATCH 08/14] refactor: use set for better lookup performance --- bin/sra_ids_to_runinfo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index c70df48d..f90f7d9d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -110,7 +110,8 @@ def get_ena_fields(): def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS): total_out = 0 - seen_ids = []; run_ids = [] + seen_ids = set() + run_ids = set() header = [] make_dir(os.path.dirname(file_out)) with open(file_in,"r") as fin, open(file_out,"w") as fout: @@ -120,7 +121,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS if match: prefix = match.group() if prefix in PREFIX_LIST: - if not db_id in seen_ids: + if db_id not in seen_ids: ids = [db_id] ## Resolve/expand these ids against GEO URL @@ -141,7 +142,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS csv_dict = csv.DictReader(fetch_url(url), delimiter='\t') for row in csv_dict: run_id = row['run_accession'] - if not run_id in run_ids: + if run_id not in run_ids: if total_out == 0: header = row.keys() fout.write('{}\n'.format('\t'.join(header))) @@ -151,8 +152,8 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS sys.exit(1) fout.write('{}\n'.format('\t'.join([row[x] for x in header]))) total_out += 1 - run_ids.append(run_id) - seen_ids.append(db_id) + run_ids.add(run_id) + seen_ids.add(db_id) if not ids: logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") From 473b2378f51104148b61b2737c39357e0c9a10a3 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 15:19:04 +0200 Subject: [PATCH 09/14] feat: enable decompression and decoding from HTTP headers --- bin/sra_ids_to_runinfo.py | 84 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index f90f7d9d..1aef2cfe 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -7,6 +7,9 @@ import errno import argparse import logging +import gzip +import zlib +import cgi from urllib.request import urlopen from urllib.error import URLError, HTTPError @@ -34,6 +37,79 @@ 'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera' ) + +class Response: + """ + Define an HTTP response class. + + This class should not have to be instantiated directly. + + Attributes: + status (int): The numeric HTTP status code of the response. + reason (str): The response's reason phrase. + body (bytes): The response's decompressed body content as bytes. + + Methods: + text: The response's body as a decoded string. + + """ + + def __init__(self, *, response, **kwargs) -> None: + """ + Initialize an HTTP response object. + + Args: + response (http.client.HTTPResponse): A standard library response object + that is wrapped by this class. + **kwargs: Passed to parent classes. + + """ + super().__init__(**kwargs) + self._response = response + # Immediately read the body while the response context is still available. + self._raw = self._response.read() + self._content = None + + def _decompress(self): + """Decompress the response body if necessary.""" + method = self._response.getheader("Content-Encoding", "") + if not method: + self._content = self._raw + return + if method == "gzip": + self._content = gzip.decompress(self._raw) + elif method == "deflate": + self._content = zlib.decompress(self._raw) + else: + raise ValueError(f"Unsupported compression: {method}") + + @property + def status(self): + """Get the response's HTTP status code.""" + return self._response.status + + @property + def reason(self): + """Get the response's reason phrase.""" + return self._response.reason + + @property + def body(self): + """Get the response's decompressed body content as bytes.""" + if self._content is None: + self._decompress() + return self._content + + def text(self, encoding=None): + """Return the response's body as a decoded string.""" + if encoding is not None: + return self._content.decode(encoding) + + _, params = cgi.parse_header(self._response.getheader("Content-Type", "")) + encoding = params.get("charset", "utf-8") + return self._content.decode(encoding) + + def parse_args(args=None): Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.' Epilog = 'Example usage: python fetch_sra_runinfo.py ' @@ -64,10 +140,10 @@ def make_dir(path): if exception.errno != errno.EEXIST: raise -def fetch_url(url, encoding='utf-8'): +def fetch_url(url): try: - with urlopen(url) as f: - r = f.read().decode(encoding).splitlines() + with urlopen(url) as response: + result = Response(response=response).text().splitlines() except HTTPError as e: logger.error("The server couldn't fulfill the request.") logger.error(f"Status: {e.code} {e.reason}") @@ -76,7 +152,7 @@ def fetch_url(url, encoding='utf-8'): logger.error('We failed to reach a server.') logger.error(f"Reason: {e.reason}") sys.exit(1) - return r + return result def id_to_srx(db_id): ids = [] From 7798aa1a63d68e9162c7dc4714f78a9775dc7011 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 15:25:33 +0200 Subject: [PATCH 10/14] refactor: use f-strings where possible --- bin/sra_ids_to_runinfo.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 1aef2cfe..4b4ed2d7 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -117,7 +117,7 @@ def parse_args(args=None): parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.") parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.") - parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help="Comma-separated list of ENA metadata fields to fetch. (default: {}).".format(','.join(ENA_METADATA_FIELDS))) + parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help=f"Comma-separated list of ENA metadata fields to fetch. (default: {','.join(ENA_METADATA_FIELDS)}).") return parser.parse_args(args) def validate_csv_param(param, valid_vals, param_desc): @@ -220,13 +220,15 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS run_id = row['run_accession'] if run_id not in run_ids: if total_out == 0: - header = row.keys() - fout.write('{}\n'.format('\t'.join(header))) + header = '\t'.join(row.keys()) + fout.write(f"{header}\n") else: if header != row.keys(): logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'") sys.exit(1) - fout.write('{}\n'.format('\t'.join([row[x] for x in header]))) + + ordered_row = '\t'.join([row[x] for x in header]) + fout.write(f'{ordered_row}\n') total_out += 1 run_ids.add(run_id) seen_ids.add(db_id) From eaca802ba4ee15883a049c523b5a7c104347cd7e Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 15:52:17 +0200 Subject: [PATCH 11/14] refactor: manage URLs with encoding --- bin/sra_ids_to_runinfo.py | 79 +++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 4b4ed2d7..87fe51f1 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -1,18 +1,18 @@ #!/usr/bin/env python -import os -import re -import sys +import argparse +import cgi import csv import errno -import argparse -import logging import gzip +import logging +import os +import re +import sys import zlib -import cgi -from urllib.request import urlopen from urllib.error import URLError, HTTPError - +from urllib.parse import urlencode +from urllib.request import urlopen logger = logging.getLogger() @@ -155,34 +155,53 @@ def fetch_url(url): return result def id_to_srx(db_id): - ids = [] - url = 'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={}'.format(db_id) - for row in csv.DictReader(fetch_url(url), delimiter=','): - ids.append(row['Experiment']) - return ids + params = { + "save": "efetch", + "db": "sra", + "rettype": "runinfo", + "term": db_id + } + url = f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}' + return [ + row['Experiment'] for row in csv.DictReader(fetch_url(url), delimiter=',') + ] def id_to_erx(db_id): - ids = [] fields = ['run_accession', 'experiment_accession'] - url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(db_id,','.join(fields)) - for row in csv.DictReader(fetch_url(url), delimiter='\t'): - ids.append(row['experiment_accession']) - return ids + params = { + "accession": db_id, + "result": "read_run", + "fields": ",".join(fields) + } + url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' + return [ + row['experiment_accession'] for row in csv.DictReader(fetch_url(url), delimiter='\t') + ] def gse_to_srx(db_id): ids = [] - url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&targ=gsm&view=data&form=text'.format(db_id) - gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.find('GSM') != -1] + params = { + "acc": db_id, + "targ": "gsm", + "view": "data", + "form": "text" + } + url = f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}' + gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.startswith('GSM')] for gsm_id in gsm_ids: ids += id_to_srx(gsm_id) return ids def get_ena_fields(): - fields = [] - url = 'https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run' - for row in csv.DictReader(fetch_url(url), delimiter='\t'): - fields.append(row['columnId']) - return fields + params = { + "dataPortal": "ena", + "format": "tsv", + "result": "read_run" + } + url = f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}' + return [ + row['columnId'] for row in csv.DictReader(fetch_url(url), delimiter='\t') + ] def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS): total_out = 0 @@ -190,6 +209,10 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS run_ids = set() header = [] make_dir(os.path.dirname(file_out)) + params = { + "result": "read_run", + "fields": ','.join(ena_metadata_fields) + } with open(file_in,"r") as fin, open(file_out,"w") as fout: for line in fin: db_id = line.strip() @@ -214,9 +237,9 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS ## Resolve/expand to get run identifier from ENA and write to file for id in ids: - url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(id,','.join(ena_metadata_fields)) - csv_dict = csv.DictReader(fetch_url(url), delimiter='\t') - for row in csv_dict: + params["accession"] = id + url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' + for row in csv.DictReader(fetch_url(url), delimiter='\t'): run_id = row['run_accession'] if run_id not in run_ids: if total_out == 0: From fa38ddaacd1cc48c6c1d6616a0116885fcc6f0be Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 16:07:31 +0200 Subject: [PATCH 12/14] fix: maintain header list in variable, not string --- bin/sra_ids_to_runinfo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 87fe51f1..11fe5247 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -103,11 +103,11 @@ def body(self): def text(self, encoding=None): """Return the response's body as a decoded string.""" if encoding is not None: - return self._content.decode(encoding) + return self.body.decode(encoding) _, params = cgi.parse_header(self._response.getheader("Content-Type", "")) encoding = params.get("charset", "utf-8") - return self._content.decode(encoding) + return self.body.decode(encoding) def parse_args(args=None): @@ -243,8 +243,9 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS run_id = row['run_accession'] if run_id not in run_ids: if total_out == 0: - header = '\t'.join(row.keys()) - fout.write(f"{header}\n") + header = row.keys() + header_line = '\t'.join(header) + fout.write(f"{header_line}\n") else: if header != row.keys(): logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'") From 1f7ae6a7cb17e681accf2940460483043ff3b69b Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 16:13:53 +0200 Subject: [PATCH 13/14] style: remove space --- bin/sra_ids_to_runinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 11fe5247..68b1c924 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -35,7 +35,7 @@ 'sample_title', 'experiment_title', 'study_title', 'description', 'sample_description', 'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera' - ) +) class Response: From ce7ed6016ef67377651960b8fb212a80dcdca4ea Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Wed, 15 Sep 2021 18:04:27 +0200 Subject: [PATCH 14/14] style: remove last type annotation --- bin/sra_ids_to_runinfo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 68b1c924..ae60f545 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -54,7 +54,7 @@ class Response: """ - def __init__(self, *, response, **kwargs) -> None: + def __init__(self, *, response, **kwargs): """ Initialize an HTTP response object.