diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d149562c..26785ea1 100755 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,7 +51,7 @@ jobs: parameters: [ "--nf_core_pipeline rnaseq", - "--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5", + "--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5 --sample_mapping_fields run_accession,library_layout", --skip_fastq_download, ] steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 04618e30..3cc14bd0 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.1](https://github.com/nf-core/fetchngs/releases/tag/1.1)] - 2021-06-22 + +### Enhancements & fixes + +* [[#12](https://github.com/nf-core/fetchngs/issues/12)] - Error when using singularity - /etc/resolv.conf doesn't exist in container +* Added `--sample_mapping_fields` parameter to create a separate `id_mappings.csv` and `multiqc_config.yml` with selected fields that can be used to rename samples in general and in [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) + ## [[1.0](https://github.com/nf-core/fetchngs/releases/tag/1.0)] - 2021-06-08 Initial release of nf-core/fetchngs, created with the [nf-core](https://nf-co.re/) template. diff --git a/README.md b/README.md index 0eb5cbfb..e78820f8 100755 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ The columns in the auto-created samplesheet can be tailored to be accepted out-o 3. Download the pipeline and test it on a minimal dataset with a single command: - ```console + ```bash nextflow run nf-core/fetchngs -profile test, ``` @@ -51,7 +51,7 @@ The columns in the auto-created samplesheet can be tailored to be accepted out-o 4. Start running your own analysis! 
- ```console + ```bash nextflow run nf-core/fetchngs --input ids.txt -profile ``` diff --git a/bin/multiqc_mappings_config.py b/bin/multiqc_mappings_config.py new file mode 100755 index 00000000..a7fc92ba --- /dev/null +++ b/bin/multiqc_mappings_config.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python + +import sys + +with open(sys.argv[1], "r") as fin, open(sys.argv[2], "w") as fout: + header = fin.readline().split(',') + config = "sample_names_rename_buttons:\n" + config += "\n".join([' - ' + x.strip().strip('"') for x in header]) + '\n' + config += "sample_names_rename:\n" + for line in fin: + config += f" - [{', '.join(line.strip().split(','))}]\n" + fout.write(config) + diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 71b81eff..a4e45493 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -30,7 +30,7 @@ print("
{}
{}
".format(k, v)) print(" ") -# Write out regexes as csv file: -with open("software_versions.csv", "w") as f: +# Write out as tsv file: +with open("software_versions.tsv", "w") as f: for k, v in sorted(results.items()): f.write("{}\t{}\n".format(k, v)) diff --git a/conf/modules.config b/conf/modules.config index 4059a911..3ae76b68 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -42,5 +42,8 @@ params { 'sra_merge_samplesheet' { publish_dir = 'samplesheet' } + 'multiqc_mappings_config' { + publish_dir = 'samplesheet' + } } } diff --git a/docs/output.md b/docs/output.md index f2cc20a5..c0916e6d 100755 --- a/docs/output.md +++ b/docs/output.md @@ -22,9 +22,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA / SRA. * `samplesheet/` * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. + * `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. + * `multiqc_config.yml`: [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) config file that can be passed to most nf-core pipelines via the `--multiqc_config` parameter for bulk renaming of sample names from database ids; [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. * `metadata/` - * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA - * `*.runinfo.tsv`: Original metadata file downloaded from the ENA + * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA. + * `*.runinfo.tsv`: Original metadata file downloaded from the ENA. 
@@ -37,7 +39,7 @@ Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introductio * `pipeline_info/` * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. + * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`. diff --git a/docs/usage.md b/docs/usage.md index 2852035b..2e5675a9 100755 --- a/docs/usage.md +++ b/docs/usage.md @@ -46,7 +46,7 @@ This will launch the pipeline with the `docker` configuration profile. See below Note that the pipeline will create the following files in your working directory: -```bash +```console work # Directory containing the nextflow working files results # Finished results (configurable, see below) .nextflow_log # Log file from Nextflow diff --git a/main.nf b/main.nf index f28f690f..475c87a8 100755 --- a/main.nf +++ b/main.nf @@ -25,12 +25,12 @@ WorkflowMain.initialise(workflow, params, log) ======================================================================================== */ -workflow NFCORE_FETCHNGS { +include { FETCHNGS } from './workflows/fetchngs' - // - // WORKFLOW: Run main nf-core/fetchngs analysis pipeline - // - include { FETCHNGS } from './workflows/fetchngs' +// +// WORKFLOW: Run main nf-core/fetchngs analysis pipeline +// +workflow NFCORE_FETCHNGS { FETCHNGS () } diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 5b6ad7ad..4d37bd6a 100755 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -21,7 +21,7 @@ process GET_SOFTWARE_VERSIONS { path versions output: - path "software_versions.csv" , emit: csv + path "software_versions.tsv" , emit: tsv path 'software_versions_mqc.yaml', emit: yaml script: // This script is bundled with the pipeline, in 
nf-core/fetchngs/bin/ diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf new file mode 100644 index 00000000..63121b40 --- /dev/null +++ b/modules/local/multiqc_mappings_config.nf @@ -0,0 +1,33 @@ +// Import generic module functions +include { saveFiles; getSoftwareName } from './functions' + +params.options = [:] + +process MULTIQC_MAPPINGS_CONFIG { + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.8.3" + } else { + container "quay.io/biocontainers/python:3.8.3" + } + + input: + path csv + + output: + path "*yml" , emit: yml + path "*.version.txt", emit: version + + script: + """ + multiqc_mappings_config.py \\ + $csv \\ + multiqc_config.yml + + python --version | sed -e "s/Python //g" > python.version.txt + """ +} diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index d5b9705c..b277197e 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "conda-forge::requests=2.24.0" : null) + conda (params.enable_conda ? 
"conda-forge::sed=4.7" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/requests:2.24.0" + container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" } else { - container "quay.io/biocontainers/requests:2.24.0" + container "biocontainers/biocontainers:v1.2.0_cv1" } input: diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index 423be634..e3c1314a 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -17,9 +17,11 @@ process SRA_MERGE_SAMPLESHEET { input: path ('samplesheets/*') + path ('mappings/*') output: - path "*csv", emit: csv + path "samplesheet.csv", emit: samplesheet + path "id_mappings.csv" , emit: mappings script: """ @@ -27,5 +29,10 @@ process SRA_MERGE_SAMPLESHEET { for fileid in `ls ./samplesheets/*`; do awk 'NR>1' \$fileid >> samplesheet.csv done + + head -n 1 `ls ./mappings/* | head -n 1` > id_mappings.csv + for fileid in `ls ./mappings/*`; do + awk 'NR>1' \$fileid >> id_mappings.csv + done """ } diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index 1418bf34..b0421aea 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -19,12 +19,15 @@ process SRA_RUNINFO_TO_FTP { path runinfo output: - path "*.tsv", emit: tsv + path "*.tsv" , emit: tsv + path "*.version.txt", emit: version script: """ sra_runinfo_to_ftp.py \\ ${runinfo.join(',')} \\ ${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv + + python --version | sed -e "s/Python //g" > python.version.txt """ } diff --git a/modules/local/sra_to_samplesheet.nf b/modules/local/sra_to_samplesheet.nf index a30e0f5e..c4fc9af7 100644 --- a/modules/local/sra_to_samplesheet.nf +++ b/modules/local/sra_to_samplesheet.nf @@ -15,11 +15,17 @@ process SRA_TO_SAMPLESHEET { input: tuple 
val(meta), path(fastq) val pipeline + val mapping_fields output: - tuple val(meta), path("*csv"), emit: csv + tuple val(meta), path("*samplesheet.csv"), emit: samplesheet + tuple val(meta), path("*mappings.csv") , emit: mappings exec: + // + // Create samplesheet containing metadata + // + // Remove custom keys needed to download the data def meta_map = meta.clone() meta_map.remove("id") @@ -45,10 +51,27 @@ process SRA_TO_SAMPLESHEET { pipeline_map << meta_map // Create a samplesheet - csv = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' - csv += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + + // Write samplesheet to file + def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") + samplesheet_file.text = samplesheet + + // + // Create sample id mappings file + // + mappings_map = pipeline_map.clone() + def fields = mapping_fields ? 
['sample'] + mapping_fields.split(',').collect{ it.trim().toLowerCase() } : [] + if ((mappings_map.keySet() + fields).unique().size() != mappings_map.keySet().size()) { + error("Invalid option for '--sample_mapping_fields': ${mapping_fields}.\nValid options: ${mappings_map.keySet().join(', ')}") + } + + // Create mappings + mappings = fields.collect{ '"' + it + '"'}.join(",") + '\n' + mappings += mappings_map.subMap(fields).values().collect{ '"' + it + '"'}.join(",") - // Write to file - def file = task.workDir.resolve("${meta.id}.samplesheet.csv") - file.text = csv + // Write mappings to file + def mappings_file = task.workDir.resolve("${meta.id}.mappings.csv") + mappings_file.text = mappings } diff --git a/nextflow.config b/nextflow.config index 94ddb47f..5ac246e5 100755 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null nf_core_pipeline = null ena_metadata_fields = null + sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' skip_fastq_download = false // Boilerplate options @@ -146,7 +147,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.0' + version = '1.1' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index e5b24df1..8d1a1c44 100755 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -29,6 +29,12 @@ "description": "Comma-separated list of ENA metadata fields to fetch before downloading data.", "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. 
`'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run])." }, + "sample_mapping_fields": { + "type": "string", + "fa_icon": "fas fa-globe-americas", + "description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.", + "default": "run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description" + }, "nf_core_pipeline": { "type": "string", "fa_icon": "fab fa-apple", @@ -244,3 +250,4 @@ } ] } + diff --git a/workflows/fetchngs.nf b/workflows/fetchngs.nf index 4ba08fb3..1602494f 100755 --- a/workflows/fetchngs.nf +++ b/workflows/fetchngs.nf @@ -34,11 +34,13 @@ if (params.input) { // Don't overwrite global params.modules, create a copy instead and use that within the main script. 
def modules = params.modules.clone() -include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] ) -include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] ) -include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] ) -include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) -include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) +include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] ) +include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] ) +include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] ) +include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) +include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) +include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: modules['multiqc_mappings_config'] ) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) /* ======================================================================================== @@ -48,6 +50,8 @@ include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' workflow FETCHNGS { + ch_software_versions = Channel.empty() + // // MODULE: Get SRA run information for public database ids // @@ -74,6 
+78,7 @@ workflow FETCHNGS { } .unique() .set { ch_sra_reads } + ch_software_versions = ch_software_versions.mix(SRA_RUNINFO_TO_FTP.out.version.first().ifEmpty(null)) if (!params.skip_fastq_download) { // @@ -88,16 +93,27 @@ workflow FETCHNGS { // SRA_TO_SAMPLESHEET ( SRA_FASTQ_FTP.out.fastq, - params.nf_core_pipeline ?: '' + params.nf_core_pipeline ?: '', + params.sample_mapping_fields ) // // MODULE: Create a merged samplesheet across all samples for the pipeline // SRA_MERGE_SAMPLESHEET ( - SRA_TO_SAMPLESHEET.out.csv.collect{it[1]} + SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, + SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} ) + // + // MODULE: Create a MultiQC config file with sample name mappings + // + if (params.sample_mapping_fields) { + MULTIQC_MAPPINGS_CONFIG ( + SRA_MERGE_SAMPLESHEET.out.mappings + ) + } + // // If ids don't have a direct FTP download link write them to file for download outside of the pipeline // @@ -107,6 +123,21 @@ workflow FETCHNGS { .unique() .collectFile(name: no_ids_file, sort: true, newLine: true) } + + // + // MODULE: Pipeline reporting + // + ch_software_versions + .map { it -> if (it) [ it.baseName, it ] } + .groupTuple() + .map { it[1][0] } + .flatten() + .collect() + .set { ch_software_versions } + + GET_SOFTWARE_VERSIONS ( + ch_software_versions.map { it }.collect() + ) } /*