Skip to content

Commit

Permalink
Merge pull request #34 from nf-core/dev
Browse files Browse the repository at this point in the history
Dev -> Master for 1.3 release
  • Loading branch information
drpatelh committed Sep 15, 2021
2 parents 8c00805 + e094f64 commit 2d593fb
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 65 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15

### Enhancements & fixes

* Replaced Python `requests` with `urllib` to fetch ENA metadata

### Software dependencies

| Dependency | Old version | New version |
|-------------|-------------|-------------|
| `python` | 3.8.3 | 3.9.5 |

## [[1.2](https://github.com/nf-core/fetchngs/releases/tag/1.2)] - 2021-07-28

### Enhancements & fixes
Expand Down
217 changes: 165 additions & 52 deletions bin/sra_ids_to_runinfo.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
#!/usr/bin/env python

import argparse
import cgi
import csv
import errno
import gzip
import logging
import os
import re
import sys
import zlib
from urllib.error import URLError, HTTPError
from urllib.parse import urlencode
from urllib.request import urlopen

logger = logging.getLogger()

## Example ids supported by this script
SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814')
ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481')
GEO_IDS = ('GSE18729', 'GSM465244')
## Matches the leading alphabetic prefix of an accession (e.g. 'SRR' in 'SRR390278').
ID_REGEX = re.compile(r'[A-Z]+')
## Sorted, de-duplicated set of accepted accession prefixes derived from the examples above.
PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS})

## List of meta fields fetched from the ENA API - can be overridden by --ena_metadata_fields
## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run
ENA_METADATA_FIELDS = [
ENA_METADATA_FIELDS = (
'accession', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession', 'study_accession', 'secondary_study_accession', 'parent_study', 'submission_accession',
'run_alias', 'experiment_alias', 'sample_alias', 'study_alias',
'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name',
Expand All @@ -27,7 +35,80 @@
'sample_title', 'experiment_title', 'study_title',
'description', 'sample_description',
'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera'
]
)


class Response:
    """
    Define an HTTP response class.

    This class should not have to be instantiated directly.

    Attributes:
        status (int): The numeric HTTP status code of the response.
        reason (str): The response's reason phrase.
        body (bytes): The response's decompressed body content as bytes.

    Methods:
        text: The response's body as a decoded string.
    """

    def __init__(self, *, response, **kwargs):
        """
        Initialize an HTTP response object.

        Args:
            response (http.client.HTTPResponse): A standard library response object
                that is wrapped by this class.
            **kwargs: Passed to parent classes.
        """
        super().__init__(**kwargs)
        self._response = response
        # Immediately read the body while the response context is still available.
        self._raw = self._response.read()
        # Decompressed body cache; populated lazily by the `body` property.
        self._content = None

    def _decompress(self):
        """Decompress the response body if necessary."""
        method = self._response.getheader("Content-Encoding", "")
        if not method:
            self._content = self._raw
            return
        if method == "gzip":
            self._content = gzip.decompress(self._raw)
        elif method == "deflate":
            self._content = zlib.decompress(self._raw)
        else:
            raise ValueError(f"Unsupported compression: {method}")

    @property
    def status(self):
        """Get the response's HTTP status code."""
        return self._response.status

    @property
    def reason(self):
        """Get the response's reason phrase."""
        return self._response.reason

    @property
    def body(self):
        """Get the response's decompressed body content as bytes."""
        if self._content is None:
            self._decompress()
        return self._content

    def text(self, encoding=None):
        """
        Return the response's body as a decoded string.

        Args:
            encoding (str): Explicit character encoding; when omitted, the
                `charset` parameter of the Content-Type header is used,
                falling back to UTF-8.
        """
        if encoding is not None:
            return self.body.decode(encoding)

        # `cgi.parse_header` is deprecated (PEP 594) and removed in Python 3.13;
        # extract the charset parameter from the Content-Type header directly.
        content_type = self._response.getheader("Content-Type", "")
        encoding = "utf-8"
        for param in content_type.split(";")[1:]:
            key, _, value = param.strip().partition("=")
            if key.strip().lower() == "charset" and value:
                encoding = value.strip().strip('"\'')
                break
        return self.body.decode(encoding)


def parse_args(args=None):
Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.'
Expand All @@ -36,7 +117,7 @@ def parse_args(args=None):
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.")
parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.")
parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help="Comma-separated list of ENA metadata fields to fetch. (default: {}).".format(','.join(ENA_METADATA_FIELDS)))
parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help=f"Comma-separated list of ENA metadata fields to fetch. (default: {','.join(ENA_METADATA_FIELDS)}).")
return parser.parse_args(args)

def validate_csv_param(param, valid_vals, param_desc):
Expand All @@ -47,7 +128,7 @@ def validate_csv_param(param, valid_vals, param_desc):
if len(intersect) == len(user_vals):
valid_list = intersect
else:
print("ERROR: Please provide a valid value for {}!\nProvided values = {}\nAccepted values = {}".format(param_desc,param,','.join(valid_vals)))
logger.error(f"Please provide a valid value for {param_desc}!\nProvided values = {param}\nAccepted values = {','.join(valid_vals)}")
sys.exit(1)
return valid_list

Expand All @@ -59,59 +140,87 @@ def make_dir(path):
if exception.errno != errno.EEXIST:
raise

def fetch_url(url):
    """
    Fetch `url` and return the response body split into lines.

    Args:
        url (str): The URL to request.

    Returns:
        list[str]: The decoded response body, one element per line.

    Exits the program (status 1) with a logged error message on any
    HTTP or network failure.
    """
    try:
        with urlopen(url) as response:
            result = Response(response=response).text().splitlines()
    except HTTPError as e:
        # The server answered, but with an error status.
        logger.error("The server couldn't fulfill the request.")
        logger.error(f"Status: {e.code} {e.reason}")
        sys.exit(1)
    except URLError as e:
        # No response at all (DNS failure, refused connection, ...).
        logger.error('We failed to reach a server.')
        logger.error(f"Reason: {e.reason}")
        sys.exit(1)
    return result

def id_to_srx(db_id):
    """
    Resolve an identifier to SRA experiment (SRX) accessions via NCBI.

    Args:
        db_id (str): Any identifier accepted by the NCBI SRA `runinfo` service.

    Returns:
        list[str]: The 'Experiment' column values of the returned run info.
    """
    params = {
        "save": "efetch",
        "db": "sra",
        "rettype": "runinfo",
        "term": db_id
    }
    url = f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}'
    return [
        row['Experiment'] for row in csv.DictReader(fetch_url(url), delimiter=',')
    ]

def id_to_erx(db_id):
    """
    Resolve an identifier to ENA experiment (ERX) accessions.

    Args:
        db_id (str): Any accession accepted by the ENA `filereport` service.

    Returns:
        list[str]: The 'experiment_accession' column values of the report.
    """
    fields = ['run_accession', 'experiment_accession']
    params = {
        "accession": db_id,
        "result": "read_run",
        "fields": ",".join(fields)
    }
    url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
    return [
        row['experiment_accession'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
    ]

def gse_to_srx(db_id):
    """
    Resolve a GEO identifier to SRA experiment (SRX) accessions.

    Queries the GEO `acc.cgi` endpoint for the GSM records attached to
    `db_id`, then resolves each GSM id through `id_to_srx`.

    Args:
        db_id (str): A GEO accession (e.g. GSE series or GSM sample id).

    Returns:
        list[str]: Experiment accessions associated with `db_id`.
    """
    ids = []
    params = {
        "acc": db_id,
        "targ": "gsm",
        "view": "data",
        "form": "text"
    }
    url = f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}'
    # Keep only lines of the form 'GSM... = <value>' and take the value part.
    gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.startswith('GSM')]
    for gsm_id in gsm_ids:
        ids += id_to_srx(gsm_id)
    return ids

def get_ena_fields():
    """
    Fetch the full list of metadata field names accepted by the ENA API.

    Returns:
        list[str]: The 'columnId' values for the `read_run` result type.
    """
    params = {
        "dataPortal": "ena",
        "format": "tsv",
        "result": "read_run"
    }
    url = f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}'
    return [
        row['columnId'] for row in csv.DictReader(fetch_url(url), delimiter='\t')
    ]

def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS):
total_out = 0
seen_ids = []; run_ids = []
seen_ids = set()
run_ids = set()
header = []
make_dir(os.path.dirname(file_out))
params = {
"result": "read_run",
"fields": ','.join(ena_metadata_fields)
}
with open(file_in,"r") as fin, open(file_out,"w") as fout:
for line in fin:
db_id = line.strip()
match = re.search(ID_REGEX, db_id)
match = ID_REGEX.match(db_id)
if match:
prefix = match.group()
if prefix in PREFIX_LIST:
if not db_id in seen_ids:
if db_id not in seen_ids:

ids = [db_id]
## Resolve/expand these ids against GEO URL
Expand All @@ -128,34 +237,37 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS

## Resolve/expand to get run identifier from ENA and write to file
for id in ids:
url = 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession={}&result=read_run&fields={}'.format(id,','.join(ena_metadata_fields))
csv_dict = csv.DictReader(fetch_url(url), delimiter='\t')
for row in csv_dict:
params["accession"] = id
url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}'
for row in csv.DictReader(fetch_url(url), delimiter='\t'):
run_id = row['run_accession']
if not run_id in run_ids:
if run_id not in run_ids:
if total_out == 0:
header = row.keys()
fout.write('{}\n'.format('\t'.join(header)))
header_line = '\t'.join(header)
fout.write(f"{header_line}\n")
else:
if header != row.keys():
print("ERROR: Metadata columns do not match for id {}!\nLine: '{}'".format(run_id,line.strip()))
logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'")
sys.exit(1)
fout.write('{}\n'.format('\t'.join([row[x] for x in header])))

ordered_row = '\t'.join([row[x] for x in header])
fout.write(f'{ordered_row}\n')
total_out += 1
run_ids.append(run_id)
seen_ids.append(db_id)
run_ids.add(run_id)
seen_ids.add(db_id)

if not ids:
print("ERROR: No matches found for database id {}!\nLine: '{}'".format(db_id,line.strip()))
logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'")
sys.exit(1)

else:
id_str = ', '.join([x + "*" for x in PREFIX_LIST])
print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
sys.exit(1)
else:
id_str = ', '.join([x + "*" for x in PREFIX_LIST])
print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip()))
logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'")
sys.exit(1)

def main(args=None):
Expand All @@ -167,4 +279,5 @@ def main(args=None):
fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields)

if __name__ == '__main__':
    # Configure the root logger (used via the module-level `logger`) before running.
    logging.basicConfig(level='INFO', format='[%(levelname)s] %(message)s')
    sys.exit(main())
6 changes: 3 additions & 3 deletions modules/local/get_software_versions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process GET_SOFTWARE_VERSIONS {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

cache false
Expand Down
6 changes: 3 additions & 3 deletions modules/local/multiqc_mappings_config.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process MULTIQC_MAPPINGS_CONFIG {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
6 changes: 3 additions & 3 deletions modules/local/sra_ids_to_runinfo.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "biocontainers/biocontainers:v1.2.0_cv1"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
6 changes: 3 additions & 3 deletions modules/local/sra_runinfo_to_ftp.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ process SRA_RUNINFO_TO_FTP {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
conda (params.enable_conda ? "conda-forge::python=3.9.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
container "https://depot.galaxyproject.org/singularity/python:3.9--1"
} else {
container "quay.io/biocontainers/python:3.8.3"
container "quay.io/biocontainers/python:3.9--1"
}

input:
Expand Down
Loading

0 comments on commit 2d593fb

Please sign in to comment.