Commit

Merge pull request #15 from nf-core/dev
Dev -> Master for v1.1 release
drpatelh committed Jun 22, 2021
2 parents 4611da4 + 098b8b1 commit 3bd6ea5
Showing 18 changed files with 164 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -51,7 +51,7 @@ jobs:
parameters:
[
"--nf_core_pipeline rnaseq",
"--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5",
"--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5 --sample_mapping_fields run_accession,library_layout",
"--skip_fastq_download",
]
steps:
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,13 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.1](https://github.com/nf-core/fetchngs/releases/tag/1.1)] - 2021-06-22

### Enhancements & fixes

* [[#12](https://github.com/nf-core/fetchngs/issues/12)] - Error when using singularity - /etc/resolv.conf doesn't exist in container
* Added `--sample_mapping_fields` parameter to create a separate `id_mappings.csv` and `multiqc_config.yml` with selected fields that can be used to rename samples in general and in [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming)

## [[1.0](https://github.com/nf-core/fetchngs/releases/tag/1.0)] - 2021-06-08

Initial release of nf-core/fetchngs, created with the [nf-core](https://nf-co.re/) template.
4 changes: 2 additions & 2 deletions README.md
@@ -41,7 +41,7 @@ The columns in the auto-created samplesheet can be tailored to be accepted out-o

3. Download the pipeline and test it on a minimal dataset with a single command:

```console
```bash
nextflow run nf-core/fetchngs -profile test,<docker/singularity/podman/shifter/charliecloud/conda/institute>
```

@@ -51,7 +51,7 @@ The columns in the auto-created samplesheet can be tailored to be accepted out-o

4. Start running your own analysis!

```console
```bash
nextflow run nf-core/fetchngs --input ids.txt -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
```

13 changes: 13 additions & 0 deletions bin/multiqc_mappings_config.py
@@ -0,0 +1,13 @@
#!/usr/bin/env python

import sys

with open(sys.argv[1], "r") as fin, open(sys.argv[2], "w") as fout:
    # Header fields become MultiQC sample-rename buttons; strip quotes and whitespace
    header = [x.strip().strip('"') for x in fin.readline().split(",")]
    config = "sample_names_rename_buttons:\n"
    config += "\n".join("    - " + x for x in header) + "\n"
    config += "sample_names_rename:\n"
    # Each data row becomes one rename entry
    for line in fin:
        fields = [x.strip().strip('"') for x in line.strip().split(",")]
        config += f"    - [{', '.join(fields)}]\n"
    fout.write(config)

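The new `bin/multiqc_mappings_config.py` helper boils down to a CSV-to-YAML transformation. A standalone sketch of the intended behaviour (the sample ids below are invented for illustration):

```python
# Standalone sketch (not the bundled script): turn a tiny id_mappings.csv
# into the MultiQC bulk-sample-renaming YAML. Ids below are invented.
import io

csv_text = '"sample","run_accession"\n"SRX9626017_SRR13191702","SRR13191702"\n'

fin = io.StringIO(csv_text)
header = [x.strip().strip('"') for x in fin.readline().split(",")]
config = "sample_names_rename_buttons:\n"
config += "\n".join("    - " + x for x in header) + "\n"
config += "sample_names_rename:\n"
for line in fin:
    fields = [x.strip().strip('"') for x in line.strip().split(",")]
    config += f"    - [{', '.join(fields)}]\n"
print(config)
```

The resulting `sample_names_rename_buttons` / `sample_names_rename` keys are the shape MultiQC expects for bulk sample renaming.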
4 changes: 2 additions & 2 deletions bin/scrape_software_versions.py
@@ -30,7 +30,7 @@
print(" <dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v))
print(" </dl>")

# Write out regexes as csv file:
with open("software_versions.csv", "w") as f:
# Write out as tsv file:
with open("software_versions.tsv", "w") as f:
for k, v in sorted(results.items()):
f.write("{}\t{}\n".format(k, v))
3 changes: 3 additions & 0 deletions conf/modules.config
@@ -42,5 +42,8 @@ params {
'sra_merge_samplesheet' {
publish_dir = 'samplesheet'
}
'multiqc_mappings_config' {
publish_dir = 'samplesheet'
}
}
}
8 changes: 5 additions & 3 deletions docs/output.md
@@ -22,9 +22,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
* `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA / SRA.
* `samplesheet/`
* `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files.
* `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see the [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour.
* `multiqc_config.yml`: [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) config file that can be passed to most nf-core pipelines via the `--multiqc_config` parameter for bulk renaming of sample names from database ids; see the [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour.
* `metadata/`
* `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA
* `*.runinfo.tsv`: Original metadata file downloaded from the ENA
* `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA.
* `*.runinfo.tsv`: Original metadata file downloaded from the ENA.

</details>

@@ -37,7 +39,7 @@ Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introductio

* `pipeline_info/`
* Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
* Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`.
* Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`.

</details>

2 changes: 1 addition & 1 deletion docs/usage.md
@@ -46,7 +46,7 @@ This will launch the pipeline with the `docker` configuration profile. See below

Note that the pipeline will create the following files in your working directory:

```bash
```console
work # Directory containing the nextflow working files
results # Finished results (configurable, see below)
.nextflow_log # Log file from Nextflow
10 changes: 5 additions & 5 deletions main.nf
@@ -25,12 +25,12 @@ WorkflowMain.initialise(workflow, params, log)
========================================================================================
*/

workflow NFCORE_FETCHNGS {
include { FETCHNGS } from './workflows/fetchngs'

//
// WORKFLOW: Run main nf-core/fetchngs analysis pipeline
//
include { FETCHNGS } from './workflows/fetchngs'
//
// WORKFLOW: Run main nf-core/fetchngs analysis pipeline
//
workflow NFCORE_FETCHNGS {
FETCHNGS ()
}

2 changes: 1 addition & 1 deletion modules/local/get_software_versions.nf
@@ -21,7 +21,7 @@ process GET_SOFTWARE_VERSIONS {
path versions

output:
path "software_versions.csv" , emit: csv
path "software_versions.tsv" , emit: tsv
path 'software_versions_mqc.yaml', emit: yaml

script: // This script is bundled with the pipeline, in nf-core/fetchngs/bin/
33 changes: 33 additions & 0 deletions modules/local/multiqc_mappings_config.nf
@@ -0,0 +1,33 @@
// Import generic module functions
include { saveFiles; getSoftwareName } from './functions'

params.options = [:]

process MULTIQC_MAPPINGS_CONFIG {
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
} else {
container "quay.io/biocontainers/python:3.8.3"
}

input:
path csv

output:
path "*yml" , emit: yml
path "*.version.txt", emit: version

script:
"""
multiqc_mappings_config.py \\
$csv \\
multiqc_config.yml
python --version | sed -e "s/Python //g" > python.version.txt
"""
}
6 changes: 3 additions & 3 deletions modules/local/sra_ids_to_runinfo.nf
@@ -10,11 +10,11 @@ process SRA_IDS_TO_RUNINFO {
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "conda-forge::requests=2.24.0" : null)
conda (params.enable_conda ? "conda-forge::sed=4.7" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/requests:2.24.0"
container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img"
} else {
container "quay.io/biocontainers/requests:2.24.0"
container "biocontainers/biocontainers:v1.2.0_cv1"
}

input:
9 changes: 8 additions & 1 deletion modules/local/sra_merge_samplesheet.nf
@@ -17,15 +17,22 @@ process SRA_MERGE_SAMPLESHEET {

input:
path ('samplesheets/*')
path ('mappings/*')

output:
path "*csv", emit: csv
path "samplesheet.csv", emit: samplesheet
path "id_mappings.csv" , emit: mappings

script:
"""
head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv
for fileid in `ls ./samplesheets/*`; do
awk 'NR>1' \$fileid >> samplesheet.csv
done
head -n 1 `ls ./mappings/* | head -n 1` > id_mappings.csv
for fileid in `ls ./mappings/*`; do
awk 'NR>1' \$fileid >> id_mappings.csv
done
"""
}
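The shell block in `SRA_MERGE_SAMPLESHEET` merges per-sample CSVs while keeping a single header line. A hedged Python sketch of the same header-preserving merge (file contents here are made up):

```python
# Hedged sketch of the header-preserving merge performed by the shell
# block: header from the first file only, data rows from every file.
def merge_csvs(csv_texts):
    lines = csv_texts[0].splitlines()[:1]  # single header line
    for text in csv_texts:
        lines.extend(text.splitlines()[1:])  # skip each file's own header
    return "\n".join(lines) + "\n"

# Invented per-sample samplesheets
sheets = [
    "sample,fastq_1\nSRX001,SRX001.fastq.gz\n",
    "sample,fastq_1\nSRX002,SRX002.fastq.gz\n",
]
print(merge_csvs(sheets))  # header once, then one row per sample
```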
5 changes: 4 additions & 1 deletion modules/local/sra_runinfo_to_ftp.nf
@@ -19,12 +19,15 @@ process SRA_RUNINFO_TO_FTP {
path runinfo

output:
path "*.tsv", emit: tsv
path "*.tsv" , emit: tsv
path "*.version.txt", emit: version

script:
"""
sra_runinfo_to_ftp.py \\
${runinfo.join(',')} \\
${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv
python --version | sed -e "s/Python //g" > python.version.txt
"""
}
35 changes: 29 additions & 6 deletions modules/local/sra_to_samplesheet.nf
@@ -15,11 +15,17 @@ process SRA_TO_SAMPLESHEET {
input:
tuple val(meta), path(fastq)
val pipeline
val mapping_fields

output:
tuple val(meta), path("*csv"), emit: csv
tuple val(meta), path("*samplesheet.csv"), emit: samplesheet
tuple val(meta), path("*mappings.csv") , emit: mappings

exec:
//
// Create samplesheet containing metadata
//

// Remove custom keys needed to download the data
def meta_map = meta.clone()
meta_map.remove("id")
@@ -45,10 +51,27 @@
pipeline_map << meta_map

// Create a samplesheet
csv = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n'
csv += pipeline_map.values().collect{ '"' + it + '"'}.join(",")
samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n'
samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",")

// Write samplesheet to file
def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv")
samplesheet_file.text = samplesheet

//
// Create sample id mappings file
//
mappings_map = pipeline_map.clone()
def fields = mapping_fields ? ['sample'] + mapping_fields.split(',').collect{ it.trim().toLowerCase() } : []
if ((mappings_map.keySet() + fields).unique().size() != mappings_map.keySet().size()) {
error("Invalid option for '--sample_mapping_fields': ${mapping_fields}.\nValid options: ${mappings_map.keySet().join(', ')}")
}

// Create mappings
mappings = fields.collect{ '"' + it + '"'}.join(",") + '\n'
mappings += mappings_map.subMap(fields).values().collect{ '"' + it + '"'}.join(",")

// Write to file
def file = task.workDir.resolve("${meta.id}.samplesheet.csv")
file.text = csv
// Write mappings to file
def mappings_file = task.workDir.resolve("${meta.id}.mappings.csv")
mappings_file.text = mappings
}
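The `exec:` block above validates `--sample_mapping_fields` against the keys of the per-sample metadata map and then subsets it (Groovy's `subMap`). A rough Python analogue, with invented metadata values:

```python
# Rough Python analogue of the Groovy validation/subMap logic in the
# exec block above. Metadata keys/values are invented for illustration.
pipeline_map = {
    "sample": "SRX000001_SRR000001",
    "run_accession": "SRR000001",
    "experiment_title": "Example experiment",
}

def mapping_fields_subset(meta, mapping_fields):
    # 'sample' is always prepended, mirroring the exec block
    fields = (
        ["sample"] + [f.strip().lower() for f in mapping_fields.split(",")]
        if mapping_fields else []
    )
    # Groovy checks (keys + fields).unique().size() != keys.size(),
    # i.e. any requested field missing from the metadata map is an error
    unknown = [f for f in fields if f not in meta]
    if unknown:
        raise ValueError(
            f"Invalid option for '--sample_mapping_fields': {mapping_fields}.\n"
            f"Valid options: {', '.join(meta)}"
        )
    return {k: meta[k] for k in fields}  # subMap(fields)

print(mapping_fields_subset(pipeline_map, "run_accession,experiment_title"))
```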
3 changes: 2 additions & 1 deletion nextflow.config
@@ -13,6 +13,7 @@ params {
input = null
nf_core_pipeline = null
ena_metadata_fields = null
sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description'
skip_fastq_download = false

// Boilerplate options
@@ -146,7 +147,7 @@ manifest {
description = 'Pipeline to fetch metadata and raw FastQ files from public databases'
mainScript = 'main.nf'
nextflowVersion = '!>=21.04.0'
version = '1.0'
version = '1.1'
}

// Function to ensure that resource requirements don't go beyond
7 changes: 7 additions & 0 deletions nextflow_schema.json
@@ -29,6 +29,12 @@
"description": "Comma-separated list of ENA metadata fields to fetch before downloading data.",
"help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files, i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. The full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run)."
},
"sample_mapping_fields": {
"type": "string",
"fa_icon": "fas fa-globe-americas",
"description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.",
"default": "run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description"
},
"nf_core_pipeline": {
"type": "string",
"fa_icon": "fab fa-apple",
@@ -244,3 +250,4 @@
}
]
}

45 changes: 38 additions & 7 deletions workflows/fetchngs.nf
@@ -34,11 +34,13 @@ if (params.input) {
// Don't overwrite global params.modules, create a copy instead and use that within the main script.
def modules = params.modules.clone()

include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] )
include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] )
include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] )
include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir )
include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] )
include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] )
include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] )
include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] )
include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir )
include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] )
include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: modules['multiqc_mappings_config'] )
include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] )

/*
========================================================================================
@@ -48,6 +50,8 @@ include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet'

workflow FETCHNGS {

ch_software_versions = Channel.empty()

//
// MODULE: Get SRA run information for public database ids
//
@@ -74,6 +78,7 @@
}
.unique()
.set { ch_sra_reads }
ch_software_versions = ch_software_versions.mix(SRA_RUNINFO_TO_FTP.out.version.first().ifEmpty(null))

if (!params.skip_fastq_download) {
//
@@ -88,16 +93,27 @@
//
SRA_TO_SAMPLESHEET (
SRA_FASTQ_FTP.out.fastq,
params.nf_core_pipeline ?: ''
params.nf_core_pipeline ?: '',
params.sample_mapping_fields
)

//
// MODULE: Create a merged samplesheet across all samples for the pipeline
//
SRA_MERGE_SAMPLESHEET (
SRA_TO_SAMPLESHEET.out.csv.collect{it[1]}
SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]},
SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]}
)

//
// MODULE: Create a MultiQC config file with sample name mappings
//
if (params.sample_mapping_fields) {
MULTIQC_MAPPINGS_CONFIG (
SRA_MERGE_SAMPLESHEET.out.mappings
)
}

//
// If ids don't have a direct FTP download link write them to file for download outside of the pipeline
//
@@ -107,6 +123,21 @@
.unique()
.collectFile(name: no_ids_file, sort: true, newLine: true)
}

//
// MODULE: Pipeline reporting
//
ch_software_versions
.map { it -> if (it) [ it.baseName, it ] }
.groupTuple()
.map { it[1][0] }
.flatten()
.collect()
.set { ch_software_versions }

GET_SOFTWARE_VERSIONS (
ch_software_versions.map { it }.collect()
)
}

/*
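The `ch_software_versions` channel operations added to the workflow deduplicate the `*.version.txt` files emitted by each task so every tool is reported once. A loose Python analogue of that grouping (paths are invented):

```python
# Loose Python analogue of the version-channel deduplication: group
# version files by file name and keep one per tool. Paths are invented.
from itertools import groupby
from pathlib import PurePath

version_files = [
    PurePath("work/aa/python.version.txt"),
    PurePath("work/bb/python.version.txt"),
    PurePath("work/cc/curl.version.txt"),
]

ordered = sorted(version_files, key=lambda p: p.name)  # groupby needs adjacent keys
unique = [next(g) for _, g in groupby(ordered, key=lambda p: p.name)]
print([p.name for p in unique])
```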
