Copy over code for fetchngs.

At first, this pipeline was called nf-core/fetchfastq. After the initial release, we found that this name was already taken by another bioinformatics tool, so we decided to rename it to nf-core/fetchngs. In this commit I am taking the released code with the new pipeline name and putting it into a single commit.
nf-core · Jun 7, 2021 · c28cf8a · c28cf8a
1 parent 98213d9
commit c28cf8a
Show file tree

Hide file tree

Showing 31 changed files with 748 additions and 461 deletions.
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -75,9 +75,7 @@ If you wish to contribute a new step, please use the following coding standards:
 8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`.
 9. Do local tests that the new code works properly and as expected.
 10. Add a new test command in `.github/workflows/ci.yml`.
-11. If applicable add a [MultiQC](https://https://multiqc.info/) module.
-12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order.
-13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`.
+11. Add any descriptions of output files to `docs/output.md`.
 
 ### Default values
 

diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
@@ -8,7 +8,6 @@ on:
     types: [published]
   workflow_dispatch:
 
-
 env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -17,7 +16,6 @@ env:
   AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }}
   AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
 
-
 jobs:
   run-awstest:
     name: Run AWS full tests
@@ -32,14 +30,10 @@ jobs:
       - name: Install awscli
         run: conda install -c conda-forge awscli
       - name: Start AWS batch job
-        # TODO nf-core: You can customise AWS full pipeline tests as required
-        # Add full size test data (but still relatively small datasets for few samples)
-        # on the `test_full.config` test runs with only one set of parameters
-        # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command
         run: |
           aws batch submit-job \
             --region eu-west-1 \
             --job-name nf-core-fetchngs \
             --job-queue $AWS_JOB_QUEUE \
             --job-definition $AWS_JOB_DEFINITION \
-            --container-overrides '{"command": ["nf-core/fetchngs", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/fetchngs/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/fetchngs/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}'
+            --container-overrides '{"command": ["nf-core/fetchngs", "-r '"${GITHUB_SHA}"' -profile test_full --outdir s3://'"${AWS_S3_BUCKET}"'/fetchngs/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/fetchngs/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}'
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
@@ -6,7 +6,6 @@ name: nf-core AWS test
 on:
   workflow_dispatch:
 
-
 env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -15,7 +14,6 @@ env:
   AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }}
   AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
 
-
 jobs:
   run-awstest:
     name: Run AWS tests
@@ -30,9 +28,6 @@ jobs:
       - name: Install awscli
         run: conda install -c conda-forge awscli
       - name: Start AWS batch job
-        # TODO nf-core: You can customise CI pipeline run tests as required
-        # For example: adding multiple test runs with different parameters
-        # Remember that you can parallelise this by using strategy.matrix
         run: |
           aws batch submit-job \
           --region eu-west-1 \

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       matrix:
         # Nextflow versions: check pipeline minimum and current latest
-        nxf_ver: ['21.04.0', '']
+        nxf_ver: ["21.04.0", ""]
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
@@ -36,8 +36,34 @@ jobs:
           sudo mv nextflow /usr/local/bin/
 
       - name: Run pipeline with test data
-        # TODO nf-core: You can customise CI pipeline run tests as required
-        # For example: adding multiple test runs with different parameters
-        # Remember that you can parallelise this by using strategy.matrix
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker
+
+  parameters:
+    # Runs the test profile once per entry in `matrix.parameters`, appending
+    # that entry to the `nextflow run` command line.
+    name: Test workflow parameters
+    # Run on every non-push event; on push, only inside nf-core/fetchngs itself.
+    if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }}
+    runs-on: ubuntu-latest
+    env:
+      # NXF_VER is not set here: this job's matrix defines no `nxf_ver` key,
+      # so the job uses whatever the install step below fetches.
+      NXF_ANSI_LOG: false
+    strategy:
+      matrix:
+        # Each entry is passed verbatim as extra command-line arguments.
+        parameters:
+          - "--nf_core_pipeline rnaseq"
+          - "--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5"
+          - "--skip_fastq_download"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v2
+
+      - name: Install Nextflow
+        env:
+          CAPSULE_LOG: none
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+
+      - name: Run pipeline with various parameters
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.parameters }}
diff --git a/.nf-core-lint.yaml b/.nf-core-lint.yaml
@@ -0,0 +1,13 @@
+files_unchanged:
+  - .github/CONTRIBUTING.md
+  - .github/ISSUE_TEMPLATE/bug_report.md
+  - .github/PULL_REQUEST_TEMPLATE.md
+  - .github/workflows/linting.yml
+  - assets/sendmail_template.txt
+  - docs/README.md
+  - lib/NfcoreSchema.groovy
+  - .gitignore
+files_exist:
+  - .github/markdownlint.yml
+  - bin/markdown_to_html.py
+actions_awsfulltest: False
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,14 +3,29 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v1.0dev - [date]
+## [[1.0](https://github.com/nf-core/fetchngs/releases/tag/1.0)] - 2021-06-08
 
 Initial release of nf-core/fetchngs, created with the [nf-core](https://nf-co.re/) template.
 
-### `Added`
+## Pipeline summary
 
-### `Fixed`
+Via a single file of ids, provided one-per-line, the pipeline performs the following steps:
 
-### `Dependencies`
+1. Resolve database ids back to the appropriate experiment-level ids, in a form compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html)
+2. Fetch extensive id metadata including direct download links to FastQ files via ENA API
+3. Download FastQ files in parallel via `curl` and perform `md5sum` check
+4. Collate id metadata and paths to FastQ files in a single samplesheet
 
-### `Deprecated`
+## Supported database ids
+
+Currently, the following types of identifiers are supported (one example of each type is shown):
+
+| `SRA`        | `ENA`        | `GEO`      |
+|--------------|--------------|------------|
+| SRR11605097  | ERR4007730   | GSM4432381 |
+| SRX8171613   | ERX4009132   | GSE147507  |
+| SRS6531847   | ERS4399630   |            |
+| SAMN14689442 | SAMEA6638373 |            |
+| SRP256957    | ERP120836    |            |
+| SRA1068758   | ERA2420837   |            |
+| PRJNA625551  | PRJEB37513   |            |
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,10 +10,20 @@
 
 ## Pipeline tools
 
-* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+* [Python](http://www.python.org)
 
-* [MultiQC](https://www.ncbi.nlm.nih.gov/pubmed/27312411/)
-    > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
+* [Requests](https://docs.python-requests.org/)
+
+## Pipeline resources
+
+* [ENA](https://pubmed.ncbi.nlm.nih.gov/33175160/)
+    > Harrison PW, Ahamed A, Aslam R, Alako BTF, Burgin J, Buso N, Courtot M, Fan J, Gupta D, Haseeb M, Holt S, Ibrahim T, Ivanov E, Jayathilaka S, Kadhirvelu VB, Kumar M, Lopez R, Kay S, Leinonen R, Liu X, O'Cathail C, Pakseresht A, Park Y, Pesant S, Rahman N, Rajan J, Sokolov A, Vijayaraja S, Waheed Z, Zyoud A, Burdett T, Cochrane G. The European Nucleotide Archive in 2020. Nucleic Acids Res. 2021 Jan 8;49(D1):D82-D85. doi: 10.1093/nar/gkaa1028. PubMed PMID: 33175160; PubMed Central PMCID: PMC7778925.
+
+* [SRA](https://pubmed.ncbi.nlm.nih.gov/21062823/)
+    > Leinonen R, Sugawara H, Shumway M, International Nucleotide Sequence Database Collaboration. The sequence read archive. Nucleic Acids Res. 2011 Jan;39 (Database issue):D19-21. doi: 10.1093/nar/gkq1019. Epub 2010 Nov 9. PubMed PMID: 21062823; PubMed Central PMCID: PMC3013647.
+
+* [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/)
+    > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084.
 
 ## Software packaging/containerisation tools
 

diff --git a/README.md b/README.md
@@ -16,20 +16,22 @@
 
 ## Introduction
 
-<!-- TODO nf-core: Write a 1-2 sentence summary of what data the pipeline is for and what it does -->
-**nf-core/fetchngs** is a bioinformatics best-practice analysis pipeline for Pipeline to fetch metadata and raw FastQ files from public databases.
+**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports SRA / ENA / GEO ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)).
 
-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
+The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies.
 
-<!-- TODO nf-core: Add full-sized test dataset and amend the paragraph below if applicable -->
 On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/fetchngs/results).
 
 ## Pipeline summary
 
-<!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
+Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)), the pipeline performs the following steps:
 
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+1. Resolve database ids back to the appropriate experiment-level ids, in a form compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html)
+2. Fetch extensive id metadata including direct download links to FastQ files via ENA API
+3. Download FastQ files in parallel via `curl` and perform `md5sum` check
+4. Collate id metadata and paths to FastQ files in a single samplesheet
+
+The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines; these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour, e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core.
 
 ## Quick Start
 
@@ -49,10 +51,8 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 4. Start running your own analysis!
 
-    <!-- TODO nf-core: Update the example "typical command" below used to run the pipeline -->
-
     ```console
-    nextflow run nf-core/fetchngs -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input samplesheet.csv --genome GRCh37
+    nextflow run nf-core/fetchngs -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> --input ids.txt
     ```
 
 ## Documentation
@@ -61,11 +61,7 @@ The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage
 
 ## Credits
 
-nf-core/fetchngs was originally written by Harshil Patel.
-
-We thank the following people for their extensive assistance in the development of this pipeline:
-
-<!-- TODO nf-core: If applicable, make list of people who have also contributed -->
+nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute, London](https://www.crick.ac.uk/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/).
 
 ## Contributions and Support
 
@@ -75,10 +71,8 @@ For further information or help, don't hesitate to get in touch on the [Slack `#
 
 ## Citations
 
-<!-- TODO nf-core: Add citation for pipeline after first release. Uncomment lines below and update Zenodo doi and badge at the top of this file. -->
-<!-- If you use  nf-core/fetchngs for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) -->
+If you use nf-core/fetchngs for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX)
 
-<!-- TODO nf-core: Add bibliography of tools and data used in your pipeline -->
 An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
 
 You can cite the `nf-core` publication as follows:

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -5,35 +5,11 @@
     "description": "Schema for the file provided with params.input",
     "type": "array",
     "items": {
-        "type": "object",
-        "properties": {
-            "sample": {
-                "type": "string",
-                "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
-            },
-            "fastq_1": {
-                "type": "string",
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
-            },
-            "fastq_2": {
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
-                "anyOf": [
-                    {
-                        "type": "string",
-                        "pattern": "^\\S+\\.f(ast)?q\\.gz$"
-                    },
-                    {
-                        "type": "string",
-                        "maxLength": 0
-                    }
-                ]
-            }
-        },
-        "required": [
-            "sample",
-            "fastq_1"
-        ]
+        "type": "array",
+        "items": {
+            "type": "string",
+            "pattern": "^[SEPG][RAS][RXSMPAJXE][EN]?[AB]?\\d{4,9}$",
+            "errorMessage": "Please provide a valid SRA, GEO or ENA identifier"
+        }
     }
 }
diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt
@@ -25,29 +25,4 @@ Content-Disposition: inline; filename="nf-core-fetchngs_logo.png"
     flatten().
     join( '\n' ) %>
 
-<%
-if (mqcFile){
-def mqcFileObj = new File("$mqcFile")
-if (mqcFileObj.length() < mqcMaxSize){
-out << """
---nfcoremimeboundary
-Content-Type: text/html; name=\"multiqc_report\"
-Content-Transfer-Encoding: base64
-Content-ID: <mqcreport>
-Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\"
-
-${mqcFileObj.
-    bytes.
-    encodeBase64().
-    toString().
-    tokenize( '\n' )*.
-    toList()*.
-    collate( 76 )*.
-    collect { it.join() }.
-    flatten().
-    join( '\n' )}
-"""
-}}
-%>
-
 --nfcoremimeboundary--