diff --git a/.gitignore b/.gitignore index d425acf..a42ce01 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ results/ testing/ testing* *.pyc -*.idea/ +null/ diff --git a/.nf-core.yml b/.nf-core.yml index 667b2a6..2a18a04 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,6 +1,19 @@ bump_version: null -lint: null -nf_core_version: 3.0.1 +lint: + files_exist: + - conf/igenomes.config + - conf/igenomes_ignored.config + - assets/multiqc_config.yml + - conf/igenomes.config + - conf/igenomes_ignored.config + - assets/multiqc_config.yml + files_unchanged: + - .github/CONTRIBUTING.md + - assets/sendmail_template.txt + - .github/CONTRIBUTING.md + - assets/sendmail_template.txt + multiqc_config: false +nf_core_version: 3.0.2 org_path: null repository_type: pipeline template: @@ -15,6 +28,9 @@ template: name: drugresponseeval org: nf-core outdir: . - skip_features: null + skip_features: + - igenomes + - multiqc + - fastqc version: 1.0dev update: null diff --git a/README.md b/README.md index 39e2b75..f03b199 100644 --- a/README.md +++ b/README.md @@ -36,16 +36,6 @@ DrEval catalog, you can increase your work's exposure, reusability, and transfer # ![DrEval_pipeline](assets/DrEval_pipeline_simplified.png) - - - - - 1. The response data is loaded 2. All models are trained and evaluated in a cross-validation setting 3. For each CV split, the best hyperparameters are determined using a grid search per model @@ -66,8 +56,6 @@ For baseline models, no randomization or robustness tests are performed. Now, you can run the pipeline using: - - ```bash nextflow run nf-core/drugresponseeval \ -profile \ @@ -95,10 +83,13 @@ Berlin). We thank the following people for their extensive assistance in the development of this pipeline: - - ## Contributions and Support +Contributors to nf-core/drugresponseeval and the drevalpy PyPI package: +- [Judith Bernett](https://github.com/JudithBernett) (TUM) +- [Pascal Iversen](https://github.com/PascalIversen) (FU Berlin) +- [Mario Picciani](https://github.com/picciama) (TUM) + If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on the [Slack `#drugresponseeval` channel](https://nfcore.slack.com/channels/drugresponseeval) (you can join with [this invite](https://nf-co.re/join/slack)). diff --git a/assets/nf-core-drugresponseeval_logo_light.png b/assets/nf-core-drugresponseeval_logo_light.png index bc11b3f..d0922b1 100644 Binary files a/assets/nf-core-drugresponseeval_logo_light.png and b/assets/nf-core-drugresponseeval_logo_light.png differ diff --git a/conf/base.config b/conf/base.config index 4bc3f91..933ff63 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } diff --git a/conf/test.config b/conf/test.config index 1599590..2198b34 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,7 +22,7 @@ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - //TODO do this for the proper pipeline + // TODO nf-core: do this for the proper pipeline // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed diff --git a/docs/images/nf-core-drugresponseeval_logo_dark.png b/docs/images/nf-core-drugresponseeval_logo_dark.png index 7f9be5f..7bb7436 100644 Binary files a/docs/images/nf-core-drugresponseeval_logo_dark.png and b/docs/images/nf-core-drugresponseeval_logo_dark.png differ diff --git a/docs/output.md b/docs/output.md index 42d511c..a961714 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,38 +12,25 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +1. [Load response](#load-response) The response data is loaded +2. All models are trained and evaluated in a cross-validation setting +3. For each CV split, the best hyperparameters are determined using a grid search per model +4. The model is trained on the full training set (train & validation) with the best + hyperparameters to predict the test set +5. If randomization tests are enabled, the model is trained on the full training set with the best + hyperparameters to predict the randomized test set +6. If robustness tests are enabled, the model is trained N times on the full training set with the + best hyperparameters +7. Plots are created summarizing the results +8. [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### Load response +The response data is loaded into the pipeline. This step is necessary to provide the pipeline with the response data that will be used to train and evaluate the models. -
-Output files - -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. - -
- -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - -### MultiQC - -
-Output files - -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. - -
+### Train and evaluate models -[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +All models are trained and evaluated in a cross-validation setting. The models are trained on the training set and evaluated on the validation set. The performance of the models is evaluated using various metrics such as accuracy, precision, recall, F1-score, and ROC-AUC. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index 7d69c7b..88cf27d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,65 +8,26 @@ -## Samplesheet input - -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. - -```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. - ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/drugresponseeval --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/drugresponseeval \ + -profile \ + --models \ + --baselines \ + --dataset_name \ + --path_data \ ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker/singularity/.../institute` configuration profile. See below for more information about profiles. Note that the pipeline will create the following files in your working directory: ```bash work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) + # Finished results in specified location (defined with --outdir), defaults to 'results' .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` @@ -88,9 +49,11 @@ nextflow run nf-core/drugresponseeval -profile docker -params-file params.yaml with: ```yaml title="params.yaml" -input: './samplesheet.csv' +models: 'ElasticNet' +baselines: 'NaivePredictor,NaiveCellLineMeanPredictor,NaiveDrugMeanPredictor' outdir: './results/' -genome: 'GRCh37' +dataset_name: 'GDSC2' +path_data: '/path/to/data' <...> ``` diff --git a/modules.json b/modules.json index 62cedcb..4566ec2 100644 --- a/modules.json +++ b/modules.json @@ -3,21 +3,24 @@ "homePage": "https://github.com/nf-core/drugresponseeval", "repos": { "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": {} + }, "subworkflows": { "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "9d05360da397692321d377b6102d2fb22507c6ef", + "git_sha": "56372688d8979092cafbe0c5c3895b491166ca1c", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "772684d9d66f37b650c8ba5146ac1ee3ecba2acb", + "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c", + "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", "installed_by": ["subworkflows"] } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 7d57ead..7804e2b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,6 +53,18 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Unnecessary parameter for the pipeline, added to satisfy linting.", + "help_text": "Unnecessary parameter for the pipeline, added to satisfy linting.", + "fa_icon": "fas fa-file-csv", + "hidden": true } } }, diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf index 2b0dc67..0fcbf7b 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nextflow_pipeline/main.nf @@ -3,9 +3,9 @@ // /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NEXTFLOW_PIPELINE { @@ -44,9 +44,9 @@ workflow UTILS_NEXTFLOW_PIPELINE { } /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -106,17 +106,19 @@ def checkCondaChannels() { def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - def channel_priority_violation = false - - required_channels_in_order.eachWithIndex { channel, index -> - if (index < required_channels_in_order.size() - 1) { - channel_priority_violation |= !(channels.indexOf(channel) < channels.indexOf(required_channels_in_order[index + 1])) - } - } + def channel_priority_violation = required_channels_in_order != channels.findAll { ch -> ch in required_channels_in_order } if (channels_missing | channel_priority_violation) { - log.warn( - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + " Please refer to https://bioconda.github.io/\n" + " The observed channel order is \n" + " ${channels}\n" + " but the following channel order is required:\n" + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - ) + log.warn """\ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + There is a problem with your Conda configuration! + You will need to set-up the conda-forge and bioconda channels correctly. + Please refer to https://bioconda.github.io/ + The observed channel order is + ${channels} + but the following channel order is required: + ${required_channels_in_order} + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + """.stripIndent(true) } } diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test index ca964ce..02dbf09 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test @@ -52,10 +52,12 @@ nextflow_workflow { } then { - assertAll( - { assert workflow.success }, - { assert workflow.stdout.contains("nextflow_workflow v9.9.9") } - ) + expect { + with(workflow) { + assert success + assert "nextflow_workflow v9.9.9" in stdout + } + } } } diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index b78273c..5cb7baf 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -3,9 +3,9 @@ // /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW DEFINITION -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow UTILS_NFCORE_PIPELINE { @@ -21,9 +21,9 @@ workflow UTILS_NFCORE_PIPELINE { } /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -62,7 +62,7 @@ def checkProfileProvided(nextflow_cli_args) { def workflowCitation() { def temp_doi_ref = "" def manifest_doi = workflow.manifest.doi.tokenize(",") - // Using a loop to handle multiple DOIs + // Handling multiple DOIs // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers // Removing ` ` since the manifest.doi is a string and not a proper list manifest_doi.each { doi_ref -> diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index 842dc43..8fb3016 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -42,7 +42,7 @@ nextflow_workflow { params { test_data = '' - outdir = 1 + outdir = null } workflow { @@ -94,7 +94,7 @@ nextflow_workflow { params { test_data = '' - outdir = 1 + outdir = null } workflow {