diff --git a/CHANGELOG.md b/CHANGELOG.md
index be77eb6f..eae33fae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,18 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15
+
+### Enhancements & fixes
+
+* Replaced Python `requests` with `urllib` to fetch ENA metadata
+
+### Software dependencies
+
+| Dependency | Old version | New version |
+|------------|-------------|-------------|
+| `python`   | 3.8.3       | 3.9.5       |
+
 ## [[1.2](https://github.com/nf-core/fetchngs/releases/tag/1.2)] - 2021-07-28
 
 ### Enhancements & fixes
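The headline change in this release is the move from the third-party `requests` package to the standard library. A minimal sketch of the stdlib pattern the script now relies on (the accession and field list here are illustrative examples, not the pipeline's exact request):

```python
# Sketch only: fetch an ENA file report with the standard library, no `requests`.
# "SRR390278" and the fields below are example values for illustration.
from urllib.parse import urlencode
from urllib.request import urlopen

params = {"accession": "SRR390278", "result": "read_run", "fields": "run_accession"}
url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}"
with urlopen(url) as response:
    lines = response.read().decode("utf-8").splitlines()
print(lines[0])  # header row of the tab-separated report
```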
diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py
index 1480e8f2..ae60f545 100755
--- a/bin/sra_ids_to_runinfo.py
+++ b/bin/sra_ids_to_runinfo.py
@@ -1,23 +1,31 @@
 #!/usr/bin/env python
 
+import argparse
+import cgi
+import csv
+import errno
+import gzip
+import logging
 import os
 import re
 import sys
-import csv
-import errno
-import requests
-import argparse
+import zlib
+from urllib.error import URLError, HTTPError
+from urllib.parse import urlencode
+from urllib.request import urlopen
+
+logger = logging.getLogger()
 
 ## Example ids supported by this script
-SRA_IDS = ['PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814']
-ENA_IDS = ['ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481']
-GEO_IDS = ['GSE18729', 'GSM465244']
-ID_REGEX = r'^[A-Z]+'
-PREFIX_LIST = sorted(list(set([re.search(ID_REGEX,x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS])))
+SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814')
+ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481')
+GEO_IDS = ('GSE18729', 'GSM465244')
+ID_REGEX = re.compile(r'[A-Z]+')
+PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS})
 
 ## List of meta fields fetched from the ENA API - can be overridden by --ena_metadata_fields
 ## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run
-ENA_METADATA_FIELDS = [
+ENA_METADATA_FIELDS = (
     'accession', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession',
     'study_accession', 'secondary_study_accession', 'parent_study', 'submission_accession',
     'run_alias', 'experiment_alias', 'sample_alias', 'study_alias',
     'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name',
@@ -27,7 +35,80 @@
     'sample_title', 'experiment_title', 'study_title', 'description', 'sample_description',
     'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera'
-]
+)
+
+
+class Response:
+    """
+    Define an HTTP response class.
+
+    This class should not have to be instantiated directly.
+
+    Attributes:
+        status (int): The numeric HTTP status code of the response.
+        reason (str): The response's reason phrase.
+        body (bytes): The response's decompressed body content as bytes.
+
+    Methods:
+        text: The response's body as a decoded string.
+
+    """
+
+    def __init__(self, *, response, **kwargs):
+        """
+        Initialize an HTTP response object.
+
+        Args:
+            response (http.client.HTTPResponse): A standard library response object
+                that is wrapped by this class.
+            **kwargs: Passed to parent classes.
+
+        """
+        super().__init__(**kwargs)
+        self._response = response
+        # Immediately read the body while the response context is still available.
+        self._raw = self._response.read()
+        self._content = None
+
+    def _decompress(self):
+        """Decompress the response body if necessary."""
+        method = self._response.getheader("Content-Encoding", "")
+        if not method:
+            self._content = self._raw
+            return
+        if method == "gzip":
+            self._content = gzip.decompress(self._raw)
+        elif method == "deflate":
+            self._content = zlib.decompress(self._raw)
+        else:
+            raise ValueError(f"Unsupported compression: {method}")
+
+    @property
+    def status(self):
+        """Get the response's HTTP status code."""
+        return self._response.status
+
+    @property
+    def reason(self):
+        """Get the response's reason phrase."""
+        return self._response.reason
+
+    @property
+    def body(self):
+        """Get the response's decompressed body content as bytes."""
+        if self._content is None:
+            self._decompress()
+        return self._content
+
+    def text(self, encoding=None):
+        """Return the response's body as a decoded string."""
+        if encoding is not None:
+            return self.body.decode(encoding)
+
+        _, params = cgi.parse_header(self._response.getheader("Content-Type", ""))
+        encoding = params.get("charset", "utf-8")
+        return self.body.decode(encoding)
+
 
 def parse_args(args=None):
     Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.'
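A short usage sketch for the wrapper above. Note that it must be constructed inside the `urlopen()` context, because `__init__` reads the body eagerly; the URL is the ENA `returnFields` endpoint already referenced in the script:

```python
# Sketch only: wrap a urllib response in the Response class defined above.
from urllib.request import urlopen

url = "https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run"
with urlopen(url) as raw:
    wrapped = Response(response=raw)  # body is read before the context closes

print(wrapped.status)       # e.g. 200
print(wrapped.text()[:40])  # decoded via the Content-Type charset, UTF-8 fallback
```

Gzip- and deflate-encoded bodies are decompressed transparently by the `body` property; any other `Content-Encoding` raises `ValueError`.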
@@ -36,7 +117,7 @@ def parse_args(args=None):
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
     parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.")
     parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.")
-    parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help="Comma-separated list of ENA metadata fields to fetch. (default: {}).".format(','.join(ENA_METADATA_FIELDS)))
+    parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help=f"Comma-separated list of ENA metadata fields to fetch. (default: {','.join(ENA_METADATA_FIELDS)}).")
     return parser.parse_args(args)
 
 def validate_csv_param(param, valid_vals, param_desc):
@@ -47,7 +128,7 @@ def validate_csv_param(param, valid_vals, param_desc):
     if len(intersect) == len(user_vals):
         valid_list = intersect
     else:
-        print("ERROR: Please provide a valid value for {}!\nProvided values = {}\nAccepted values = {}".format(param_desc,param,','.join(valid_vals)))
+        logger.error(f"Please provide a valid value for {param_desc}!\nProvided values = {param}\nAccepted values = {','.join(valid_vals)}")
         sys.exit(1)
     return valid_list
 
@@ -59,59 +140,87 @@ def make_dir(path):
         if exception.errno != errno.EEXIST:
             raise
 
-def fetch_url(url, encoding='utf-8'):
+def fetch_url(url):
     try:
-        r = requests.get(url)
-    except requests.exceptions.RequestException as e:
-        raise SystemExit(e)
-    if r.status_code != 200:
-        print("ERROR: Connection failed\nError code '{}'".format(r.status_code))
+        with urlopen(url) as response:
+            result = Response(response=response).text().splitlines()
+    except HTTPError as e:
+        logger.error("The server couldn't fulfill the request.")
+        logger.error(f"Status: {e.code} {e.reason}")
+        sys.exit(1)
+    except URLError as e:
+        logger.error('We failed to reach a server.')
+        logger.error(f"Reason: {e.reason}")
         sys.exit(1)
-    return r.content.decode(encoding).splitlines()
+    return result
 
 def id_to_srx(db_id):
-    ids = []
-    url = 'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={}'.format(db_id)
-    for row in csv.DictReader(fetch_url(url), delimiter=','):
-        ids.append(row['Experiment'])
-    return ids
+    params = {
+        "save": "efetch",
+        "db": "sra",
+        "rettype": "runinfo",
+        "term": db_id
+    }
+    url = f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}'
+    return [
+        row['Experiment'] for row in csv.DictReader(fetch_url(url), delimiter=',')
+    ]
 
 def id_to_erx(db_id):
-    ids = []
     fields = ['run_accession', 'experiment_accession']
-    url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(db_id,','.join(fields))
-    for row in csv.DictReader(fetch_url(url), delimiter='\t'):
-        ids.append(row['experiment_accession'])
-    return ids
+    params = {
+        "accession": db_id,
+        "result": "read_run",
+        "fields": ",".join(fields)
+    }
+    url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
+    return [
+        row['experiment_accession'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
+    ]
 
 def gse_to_srx(db_id):
     ids = []
-    url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&targ=gsm&view=data&form=text'.format(db_id)
-    gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.find('GSM') != -1]
+    params = {
+        "acc": db_id,
+        "targ": "gsm",
+        "view": "data",
+        "form": "text"
+    }
+    url = f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}'
+    gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.startswith('GSM')]
     for gsm_id in gsm_ids:
         ids += id_to_srx(gsm_id)
     return ids
 
 def get_ena_fields():
-    fields = []
-    url = 'https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run'
-    for row in csv.DictReader(fetch_url(url), delimiter='\t'):
-        fields.append(row['columnId'])
-    return fields
+    params = {
+        "dataPortal": "ena",
+        "format": "tsv",
+        "result": "read_run"
+    }
+    url = f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}'
+    return [
+        row['columnId'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
+    ]
 
 def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS):
     total_out = 0
-    seen_ids = []; run_ids = []
+    seen_ids = set()
+    run_ids = set()
     header = []
     make_dir(os.path.dirname(file_out))
+    params = {
+        "result": "read_run",
+        "fields": ','.join(ena_metadata_fields)
+    }
     with open(file_in,"r") as fin, open(file_out,"w") as fout:
         for line in fin:
             db_id = line.strip()
-            match = re.search(ID_REGEX, db_id)
+            match = ID_REGEX.match(db_id)
             if match:
                 prefix = match.group()
                 if prefix in PREFIX_LIST:
-                    if not db_id in seen_ids:
+                    if db_id not in seen_ids:
                         ids = [db_id]
 
                         ## Resolve/expand these ids against GEO URL
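All of the rewritten helpers above share one lookup shape: build the query string with `urlencode()`, fetch it through `fetch_url()`, and stream the result through `csv.DictReader`. Pulled out on its own (this sketch assumes `fetch_url()` from the script above is in scope; the accession is an example value from `ENA_IDS`):

```python
# Sketch only: the shared lookup pattern, mirroring id_to_erx() above.
# Assumes fetch_url() as defined in this script.
import csv
from urllib.parse import urlencode

params = {
    "accession": "ERX629702",  # example accession
    "result": "read_run",
    "fields": "run_accession,experiment_accession",
}
url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}"
for row in csv.DictReader(fetch_url(url), delimiter="\t"):
    print(row["run_accession"], row["experiment_accession"])
```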
@@ -128,34 +237,37 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS
 
                         ## Resolve/expand to get run identifier from ENA and write to file
                         for id in ids:
-                            url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(id,','.join(ena_metadata_fields))
-                            csv_dict = csv.DictReader(fetch_url(url), delimiter='\t')
-                            for row in csv_dict:
+                            params["accession"] = id
+                            url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
+                            for row in csv.DictReader(fetch_url(url), delimiter='\t'):
                                 run_id = row['run_accession']
-                                if not run_id in run_ids:
+                                if run_id not in run_ids:
                                     if total_out == 0:
                                         header = row.keys()
-                                        fout.write('{}\n'.format('\t'.join(header)))
+                                        header_line = '\t'.join(header)
+                                        fout.write(f"{header_line}\n")
                                     else:
                                         if header != row.keys():
-                                            print("ERROR: Metadata columns do not match for id {}!\nLine: '{}'".format(run_id,line.strip()))
+                                            logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'")
                                             sys.exit(1)
-                                    fout.write('{}\n'.format('\t'.join([row[x] for x in header])))
+
+                                    ordered_row = '\t'.join([row[x] for x in header])
+                                    fout.write(f'{ordered_row}\n')
                                     total_out += 1
-                                    run_ids.append(run_id)
-                        seen_ids.append(db_id)
+                                    run_ids.add(run_id)
+                        seen_ids.add(db_id)
 
                         if not ids:
-                            print("ERROR: No matches found for database id {}!\nLine: '{}'".format(db_id,line.strip()))
+                            logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'")
                             sys.exit(1)
                 else:
                     id_str = ', '.join([x + "*" for x in PREFIX_LIST])
-                    print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
+                    logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
                     sys.exit(1)
             else:
                 id_str = ', '.join([x + "*" for x in PREFIX_LIST])
-                print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
+                logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
                 sys.exit(1)
 
 def main(args=None):
@@ -167,4 +279,5 @@
     fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields)
 
 if __name__ == '__main__':
+    logging.basicConfig(level='INFO', format='[%(levelname)s] %(message)s')
     sys.exit(main())
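The bookkeeping in `fetch_sra_runinfo()` is easy to miss inside the nesting: sets give O(1) membership tests for already-written run ids, and the header captured from the first row guards every later row against mismatched metadata columns. The same logic in isolation (a hypothetical stand-alone helper, not part of the pipeline):

```python
# Sketch only: the dedup + header-consistency pattern from fetch_sra_runinfo(),
# extracted into a hypothetical helper for illustration.
import csv

def merge_runinfo(tsv_chunks):
    """Merge iterables of TSV lines, dropping duplicate runs and checking headers."""
    run_ids, header, rows = set(), None, []
    for chunk in tsv_chunks:
        for row in csv.DictReader(chunk, delimiter="\t"):
            if header is None:
                header = list(row.keys())
            elif list(row.keys()) != header:
                raise ValueError(f"Metadata columns do not match for id {row['run_accession']}")
            if row["run_accession"] not in run_ids:  # set membership is O(1)
                rows.append([row[field] for field in header])
                run_ids.add(row["run_accession"])
    return header, rows

# e.g. merge_runinfo([["run_accession\tfastq_ftp", "SRR1\tftp://a", "SRR1\tftp://a"]])
# -> (['run_accession', 'fastq_ftp'], [['SRR1', 'ftp://a']])
```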
"conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } cache false diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 63121b40..8360cd34 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -8,11 +8,11 @@ process MULTIQC_MAPPINGS_CONFIG { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } input: diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index b277197e..3d3fc063 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "biocontainers/biocontainers:v1.2.0_cv1" + container "quay.io/biocontainers/python:3.9--1" } input: diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index b0421aea..f426f4ab 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -8,11 +8,11 @@ process SRA_RUNINFO_TO_FTP { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda (params.enable_conda ? 
"conda-forge::python=3.9.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" + container "https://depot.galaxyproject.org/singularity/python:3.9--1" } else { - container "quay.io/biocontainers/python:3.8.3" + container "quay.io/biocontainers/python:3.9--1" } input: diff --git a/nextflow.config b/nextflow.config index 3b3ea780..52382140 100644 --- a/nextflow.config +++ b/nextflow.config @@ -147,7 +147,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.2' + version = '1.3' } // Function to ensure that resource requirements don't go beyond