Skip to content

Commit

Permalink
Merge pull request #34 from nf-core/dev
Browse files Browse the repository at this point in the history
Dev -> Master for 1.3 release
  • Loading branch information
drpatelh committed Sep 15, 2021
2 parents 8c00805 + e094f64 commit 2d593fb
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 65 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15

### Enhancements & fixes

* Replaced Python `requests` with `urllib` to fetch ENA metadata

### Software dependencies

| Dependency | Old version | New version |
|-------------|-------------|-------------|
| `python` | 3.8.3 | 3.9.5 |

## [[1.2](https://github.com/nf-core/fetchngs/releases/tag/1.2)] - 2021-07-28

### Enhancements & fixes
Expand Down
217 changes: 165 additions & 52 deletions bin/sra_ids_to_runinfo.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
#!/usr/bin/env python

import argparse
import cgi
import csv
import errno
import gzip
import logging
import os
import re
import sys
import zlib
from urllib.error import URLError, HTTPError
from urllib.parse import urlencode
from urllib.request import urlopen

logger = logging.getLogger()

## Example ids supported by this script
SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814')
ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481')
GEO_IDS = ('GSE18729', 'GSM465244')
## Matches the leading alphabetic prefix of an accession (e.g. 'SRR' in 'SRR390278').
ID_REGEX = re.compile(r'[A-Z]+')
## Sorted, de-duplicated set of accepted accession prefixes derived from the examples above.
PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS})

## List of meta fields fetched from the ENA API - can be overridden by --ena_metadata_fields
## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run
ENA_METADATA_FIELDS = [
ENA_METADATA_FIELDS = (
'accession', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession', 'study_accession', 'secondary_study_accession', 'parent_study', 'submission_accession',
'run_alias', 'experiment_alias', 'sample_alias', 'study_alias',
'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name',
Expand All @@ -27,7 +35,80 @@
'sample_title', 'experiment_title', 'study_title',
'description', 'sample_description',
'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera'
]
)


class Response:
    """
    Define an HTTP response class.

    This class should not have to be instantiated directly.

    Attributes:
        status (int): The numeric HTTP status code of the response.
        reason (str): The response's reason phrase.
        body (bytes): The response's decompressed body content as bytes.

    Methods:
        text: The response's body as a decoded string.
    """

    def __init__(self, *, response, **kwargs):
        """
        Initialize an HTTP response object.

        Args:
            response (http.client.HTTPResponse): A standard library response object
                that is wrapped by this class.
            **kwargs: Passed to parent classes.
        """
        super().__init__(**kwargs)
        self._response = response
        # Immediately read the body while the response context is still available.
        self._raw = self._response.read()
        # Decompressed body cache; populated lazily by the `body` property.
        self._content = None

    def _decompress(self):
        """Decompress the response body if necessary."""
        method = self._response.getheader("Content-Encoding", "")
        if not method:
            self._content = self._raw
            return
        if method == "gzip":
            self._content = gzip.decompress(self._raw)
        elif method == "deflate":
            self._content = zlib.decompress(self._raw)
        else:
            raise ValueError(f"Unsupported compression: {method}")

    @property
    def status(self):
        """Get the response's HTTP status code."""
        return self._response.status

    @property
    def reason(self):
        """Get the response's reason phrase."""
        return self._response.reason

    @property
    def body(self):
        """Get the response's decompressed body content as bytes."""
        if self._content is None:
            self._decompress()
        return self._content

    def text(self, encoding=None):
        """
        Return the response's body as a decoded string.

        Args:
            encoding (str): Explicit character encoding; when omitted, the
                `charset` parameter of the Content-Type header is used,
                falling back to UTF-8.
        """
        if encoding is not None:
            return self.body.decode(encoding)

        # `cgi.parse_header` is deprecated (PEP 594) and removed in Python 3.13;
        # extract the charset parameter from the Content-Type header directly.
        content_type = self._response.getheader("Content-Type", "")
        encoding = "utf-8"
        for param in content_type.split(";")[1:]:
            key, _, value = param.strip().partition("=")
            if key.strip().lower() == "charset" and value:
                encoding = value.strip().strip('"\'')
                break
        return self.body.decode(encoding)


def parse_args(args=None):
Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.'
Expand All @@ -36,7 +117,7 @@ def parse_args(args=None):
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.")
parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.")
parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help="Comma-separated list of ENA metadata fields to fetch. (default: {}).".format(','.join(ENA_METADATA_FIELDS)))
parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help=f"Comma-separated list of ENA metadata fields to fetch. (default: {','.join(ENA_METADATA_FIELDS)}).")
return parser.parse_args(args)

def validate_csv_param(param, valid_vals, param_desc):
Expand All @@ -47,7 +128,7 @@ def validate_csv_param(param, valid_vals, param_desc):
if len(intersect) == len(user_vals):
valid_list = intersect
else:
print("ERROR: Please provide a valid value for {}!\nProvided values = {}\nAccepted values = {}".format(param_desc,param,','.join(valid_vals)))
logger.error(f"Please provide a valid value for {param_desc}!\nProvided values = {param}\nAccepted values = {','.join(valid_vals)}")
sys.exit(1)
return valid_list

Expand All @@ -59,59 +140,87 @@ def make_dir(path):
if exception.errno != errno.EEXIST:
raise

def fetch_url(url):
    """
    Fetch `url` and return the response body split into lines.

    Args:
        url (str): The URL to request.

    Returns:
        list[str]: The decoded response body, one element per line.

    Exits the program (status 1) with a logged error message on any
    HTTP or network failure.
    """
    try:
        with urlopen(url) as response:
            result = Response(response=response).text().splitlines()
    except HTTPError as e:
        # The server answered, but with an error status.
        logger.error("The server couldn't fulfill the request.")
        logger.error(f"Status: {e.code} {e.reason}")
        sys.exit(1)
    except URLError as e:
        # No response at all (DNS failure, refused connection, ...).
        logger.error('We failed to reach a server.')
        logger.error(f"Reason: {e.reason}")
        sys.exit(1)
    return result

def id_to_srx(db_id):
    """
    Resolve an identifier to SRA experiment (SRX) accessions via NCBI.

    Args:
        db_id (str): Any identifier accepted by the NCBI SRA `runinfo` service.

    Returns:
        list[str]: The 'Experiment' column values of the returned run info.
    """
    params = {
        "save": "efetch",
        "db": "sra",
        "rettype": "runinfo",
        "term": db_id
    }
    url = f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}'
    return [
        row['Experiment'] for row in csv.DictReader(fetch_url(url), delimiter=',')
    ]

def id_to_erx(db_id):
    """
    Resolve an identifier to ENA experiment (ERX) accessions.

    Args:
        db_id (str): Any accession accepted by the ENA `filereport` service.

    Returns:
        list[str]: The 'experiment_accession' column values of the report.
    """
    fields = ['run_accession', 'experiment_accession']
    params = {
        "accession": db_id,
        "result": "read_run",
        "fields": ",".join(fields)
    }
    url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
    return [
        row['experiment_accession'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
    ]

def gse_to_srx(db_id):
    """
    Resolve a GEO identifier to SRA experiment (SRX) accessions.

    Queries the GEO `acc.cgi` endpoint for the GSM records attached to
    `db_id`, then resolves each GSM id through `id_to_srx`.

    Args:
        db_id (str): A GEO accession (e.g. GSE series or GSM sample id).

    Returns:
        list[str]: Experiment accessions associated with `db_id`.
    """
    ids = []
    params = {
        "acc": db_id,
        "targ": "gsm",
        "view": "data",
        "form": "text"
    }
    url = f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}'
    # Keep only lines of the form 'GSM... = <value>' and take the value part.
    gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.startswith('GSM')]
    for gsm_id in gsm_ids:
        ids += id_to_srx(gsm_id)
    return ids

def get_ena_fields():
    """
    Fetch the full list of metadata field names accepted by the ENA API.

    Returns:
        list[str]: The 'columnId' values for the `read_run` result type.
    """
    params = {
        "dataPortal": "ena",
        "format": "tsv",
        "result": "read_run"
    }
    url = f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}'
    return [
        row['columnId'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
    ]

def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS):
total_out = 0
seen_ids = []; run_ids = []
seen_ids = set()
run_ids = set()
header = []
make_dir(os.path.dirname(file_out))
params = {
"result": "read_run",
"fields": ','.join(ena_metadata_fields)
}
with open(file_in,"r") as fin, open(file_out,"w") as fout:
for line in fin:
db_id = line.strip()
match = re.search(ID_REGEX, db_id)
match = ID_REGEX.match(db_id)
if match:
prefix = match.group()
if prefix in PREFIX_LIST:
if not db_id in seen_ids:
if db_id not in seen_ids:

ids = [db_id]
## Resolve/expand these ids against GEO URL
Expand All @@ -128,34 +237,37 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS

## Resolve/expand to get run identifier from ENA and write to file
for id in ids:
url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(id,','.join(ena_metadata_fields))
csv_dict = csv.DictReader(fetch_url(url), delimiter='\t')
for row in csv_dict:
params["accession"] = id
url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
for row in csv.DictReader(fetch_url(url), delimiter='\t'):
run_id = row['run_accession']
if not run_id in run_ids:
if run_id not in run_ids:
if total_out == 0:
header = row.keys()
fout.write('{}\n'.format('\t'.join(header)))
header_line = '\t'.join(header)
fout.write(f"{header_line}\n")
else:
if header != row.keys():
print("ERROR: Metadata columns do not match for id {}!\nLine: '{}'".format(run_id,line.strip()))
logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'")
sys.exit(1)
fout.write('{}\n'.format('\t'.join([row[x] for x in header])))

ordered_row = '\t'.join([row[x] for x in header])
fout.write(f'{ordered_row}\n')
total_out += 1
run_ids.append(run_id)
seen_ids.append(db_id)
run_ids.add(run_id)
seen_ids.add(db_id)

if not ids:
print("ERROR: No matches found for database id {}!\nLine: '{}'".format(db_id,line.strip()))
logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'")
sys.exit(1)

else:
id_str = ', '.join([x + "*" for x in PREFIX_LIST])
print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
sys.exit(1)
else:
id_str = ', '.join([x + "*" for x in PREFIX_LIST])
print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
sys.exit(1)

def main(args=None):
Expand All @@ -167,4 +279,5 @@ def main(args=None):
fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields)

if __name__ == '__main__':
    # Configure the root logger (used via the module-level `logger`) before running.
    logging.basicConfig(level='INFO', format='[%(levelname)s] %(message)s')
    sys.exit(main())
6 changes: 3 additions & 3 deletions modules/local/get_software_versions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process GET_SOFTWARE_VERSIONS {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

cache false
Expand Down
6 changes: 3 additions & 3 deletions modules/local/multiqc_mappings_config.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process MULTIQC_MAPPINGS_CONFIG {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
6 changes: 3 additions & 3 deletions modules/local/sra_ids_to_runinfo.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "biocontainers/biocontainers:v1.2.0_cv1"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
6 changes: 3 additions & 3 deletions modules/local/sra_runinfo_to_ftp.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process SRA_RUNINFO_TO_FTP {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
Loading

0 comments on commit 2d593fb

Please sign in to comment.