From 6353679ff728f5019a54e5d0d683edbed8cfc713 Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 30 Jun 2025 19:22:21 -0500 Subject: [PATCH 01/55] Add run_dbcan screening --- README.md | 5 +- conf/modules.config | 33 ++ docs/output.md | 52 ++- modules.json | 20 ++ .../rundbcan/cazymeannotation/environment.yml | 7 + .../nf-core/rundbcan/cazymeannotation/main.nf | 60 ++++ .../rundbcan/cazymeannotation/meta.yml | 88 +++++ .../cazymeannotation/tests/main.nf.test | 72 ++++ .../cazymeannotation/tests/main.nf.test.snap | 174 +++++++++ .../nf-core/rundbcan/database/environment.yml | 7 + modules/nf-core/rundbcan/database/main.nf | 38 ++ modules/nf-core/rundbcan/database/meta.yml | 36 ++ .../rundbcan/database/tests/main.nf.test | 56 +++ .../rundbcan/database/tests/main.nf.test.snap | 37 ++ .../nf-core/rundbcan/easycgc/environment.yml | 7 + modules/nf-core/rundbcan/easycgc/main.nf | 80 +++++ modules/nf-core/rundbcan/easycgc/meta.yml | 151 ++++++++ .../rundbcan/easycgc/tests/main.nf.test | 80 +++++ .../rundbcan/easycgc/tests/main.nf.test.snap | 334 ++++++++++++++++++ .../rundbcan/easysubstrate/environment.yml | 7 + .../nf-core/rundbcan/easysubstrate/main.nf | 92 +++++ .../nf-core/rundbcan/easysubstrate/meta.yml | 181 ++++++++++ .../rundbcan/easysubstrate/tests/main.nf.test | 90 +++++ .../easysubstrate/tests/main.nf.test.snap | 317 +++++++++++++++++ nextflow.config | 6 + nextflow_schema.json | 23 ++ subworkflows/local/annotation.nf | 41 ++- subworkflows/local/dbcan.nf | 68 ++++ workflows/funcscan.nf | 39 +- 29 files changed, 2177 insertions(+), 24 deletions(-) create mode 100644 modules/nf-core/rundbcan/cazymeannotation/environment.yml create mode 100644 modules/nf-core/rundbcan/cazymeannotation/main.nf create mode 100644 modules/nf-core/rundbcan/cazymeannotation/meta.yml create mode 100644 modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test create mode 100644 modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap create mode 100644 
modules/nf-core/rundbcan/database/environment.yml create mode 100644 modules/nf-core/rundbcan/database/main.nf create mode 100644 modules/nf-core/rundbcan/database/meta.yml create mode 100644 modules/nf-core/rundbcan/database/tests/main.nf.test create mode 100644 modules/nf-core/rundbcan/database/tests/main.nf.test.snap create mode 100644 modules/nf-core/rundbcan/easycgc/environment.yml create mode 100644 modules/nf-core/rundbcan/easycgc/main.nf create mode 100644 modules/nf-core/rundbcan/easycgc/meta.yml create mode 100644 modules/nf-core/rundbcan/easycgc/tests/main.nf.test create mode 100644 modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap create mode 100644 modules/nf-core/rundbcan/easysubstrate/environment.yml create mode 100644 modules/nf-core/rundbcan/easysubstrate/main.nf create mode 100644 modules/nf-core/rundbcan/easysubstrate/meta.yml create mode 100644 modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test create mode 100644 modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap create mode 100644 subworkflows/local/dbcan.nf diff --git a/README.md b/README.md index 6d12d5a9..536c55c8 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,9 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s 5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) 6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). 
[`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms. 7. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/) -8. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs -9. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) +8. Screening contigs for carbohydrate-active enzymes (CAZymes), CAZyme gene clusters, and their substrates with [`run_dbcan`](https://github.com/bcb-unl/run_dbcan) +9. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs +10. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) ![funcscan metro workflow](docs/images/funcscan_metro_workflow.png) diff --git a/conf/modules.config b/conf/modules.config index 34528cd9..6c3aebeb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -732,4 +732,37 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } + + withName: RUNDBCAN_DATABASE { + publishDir = [ + path: { "${params.outdir}/databases/dbcan/" }, + mode: params.publish_dir_mode, + enabled: params.save_db, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: RUNDBCAN_CAZYMEANNOTATION { + publishDir = [ + path: { "${params.outdir}/dbcan/cazyme_annotation/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: RUNDBCAN_EASYCGC { + publishDir = [ + path: { "${params.outdir}/dbcan/cgc/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: RUNDBCAN_EASYSUBSTRATE { + publishDir = [ + path: { "${params.outdir}/dbcan/substrate/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } } diff --git a/docs/output.md b/docs/output.md index 289d9086..20a145be 100644 --- a/docs/output.md +++ b/docs/output.md @@ -7,10 +7,11 @@ The output of nf-core/funcscan provides reports for each of the functional group - **antibiotic resistance genes** (tools: [ABRicate](https://github.com/tseemann/abricate), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [fARGene](https://github.com/fannyhb/fargene), [RGI](https://card.mcmaster.ca/analyze/rgi) – summarised by [hAMRonization](https://github.com/pha4ge/hAMRonization). Results from ABRicate, AMRFinderPlus, and DeepARG are normalised to [ARO](https://obofoundry.org/ontology/aro.html) by [argNorm](https://github.com/BigDataBiology/argNorm).) 
- **antimicrobial peptides** (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - **biosynthetic gene clusters** (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) +- **carbohydrate-active enzymes (CAZymes)**, CAZyme gene clusters, and their substrates (tools: [run_dbcan](https://github.com/bcb-unl/run_dbcan)) As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `--taxa_classification_mmseqs_db_savetmp` and `--taxa_classification_mmseqs_taxonomy_savetmp` flags are set. -Similarly, all downloaded databases are saved (i.e. 
from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_db` flag was set. +Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), [AMPcombi](https://github.com/Darcy220606/AMPcombi), and/or [run_dbcan](https://github.com/bcb-unl/run_dbcan)) into the output directory `/databases/` if the `--save_db` flag was set. Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. @@ -41,6 +42,10 @@ results/ | ├── deepbgc/ | ├── gecco/ | └── hmmsearch/ +├── dbcan/ +| ├── cazyme_annotation/ +| ├── cgc/ +| └── substrate/ ├── databases/ ├── multiqc/ ├── pipeline_info/ @@ -102,6 +107,10 @@ Biosynthetic Gene Clusters (BGCs): - [GECCO](#gecco) – biosynthetic gene cluster detection, using Conditional Random Fields (CRFs). - [hmmsearch](#hmmsearch) – biosynthetic gene cluster detection, based on hidden Markov models. +Carbohydrate-active enzymes (CAZymes): + +- [run_dbcan](https://github.com/bcb-unl/run_dbcan) – annotation of carbohydrate-active enzymes (CAZymes), CAZyme gene clusters, and their substrates. + Output Summaries: - [AMPcombi](#ampcombi) – summary report of antimicrobial peptide gene output from various detection tools.
@@ -466,6 +475,47 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation [GECCO](https://gecco.embl.de) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs). +### CAZyme annotation tools + +#### run_dbcan + +
+Output files + +- `dbcan/` + - `cazyme_annotation/` + - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation. + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - `cgc/` + - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation. + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation. + - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor (TF) annotation. + - `*_STP_hmm_results.tsv`: TSV file containing the results of signal transduction protein (STP) annotation. + - `substrate/` + - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation. + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation.
+ - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation. + - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor (TF) annotation. + - `*_STP_hmm_results.tsv`: TSV file containing the results of signal transduction protein (STP) annotation. + - `*_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome. + - `*_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN. + - `*_synteny_pdf/`: Directory containing the synteny plots in PDF format for the CAZyme gene clusters (CGC) identified by dbCAN. This directory will contain one or more PDF files showing the syntenic regions of the CGC in the genome. + +
+ +[run_dbcan](https://github.com/bcb-unl/run_dbcan) is an automated tool for carbohydrate-active enzyme (CAZyme), CAZyme gene cluster and substrate annotation. + ### Summary tools [AMPcombi](#ampcombi), [hAMRonization](#hamronization), [comBGC](#combgc), [MultiQC](#multiqc), [pipeline information](#pipeline-information), [argNorm](#argnorm). diff --git a/modules.json b/modules.json index b4ef3688..0c559852 100644 --- a/modules.json +++ b/modules.json @@ -200,6 +200,26 @@ "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", "installed_by": ["modules"] }, + "rundbcan/cazymeannotation": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/database": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/easycgc": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, + "rundbcan/easysubstrate": { + "branch": "master", + "git_sha": "d06da24d16537815f9699c4a4edd4a6ec5bc517f", + "installed_by": ["modules"] + }, "seqkit/seq": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", diff --git a/modules/nf-core/rundbcan/cazymeannotation/environment.yml b/modules/nf-core/rundbcan/cazymeannotation/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/cazymeannotation/main.nf b/modules/nf-core/rundbcan/cazymeannotation/main.nf new file mode 100644 index 00000000..208bd4bd --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/main.nf @@ -0,0 +1,60 @@ +process RUNDBCAN_CAZYMEANNOTATION { + tag 
"$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input_raw_data) + path dbcan_db + + output: + tuple val(meta), path("${prefix}_overview.tsv") , emit: cazyme_annotation + tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv") , emit: dbcanhmm_results + tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results + tuple val(meta), path("${prefix}_diamond.out") , emit: dbcandiamond_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + run_dbcan CAZyme_annotation \\ + --mode protein \\ + --db_dir ${dbcan_db} \\ + --input_raw_data ${input_raw_data} \\ + --output_dir . 
\\ + ${args} + + mv overview.tsv ${prefix}_overview.tsv + mv dbCAN_hmm_results.tsv ${prefix}_dbCAN_hmm_results.tsv + mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv + mv diamond.out ${prefix}_diamond.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_overview.tsv + touch ${prefix}_dbCAN_hmm_results.tsv + touch ${prefix}_dbCANsub_hmm_results.tsv + touch ${prefix}_diamond.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/cazymeannotation/meta.yml b/modules/nf-core/rundbcan/cazymeannotation/meta.yml new file mode 100644 index 00000000..a40c515c --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/meta.yml @@ -0,0 +1,88 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_cazymeannotation" +description: CAZyme annotation module for the dbcan pipeline. This module is used + to annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. 
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test new file mode 100644 index 00000000..d3ff7578 --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test Process RUNDBCAN_CAZYMEANNOTATION" + script "../main.nf" + process "RUNDBCAN_CAZYMEANNOTATION" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + tag "rundbcan/cazymeannotation" + + test("dbcancazyme - simplified") { + + setup { + run("RUNDBCAN_DATABASE"){ + script "../../database/main.nf" + process { + """ + """ + } + } + } + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) + ] + input[1] = RUNDBCAN_DATABASE.out.dbcan_db + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } + + test("dbcancazyme - cazyme annotation - stub") { + options "-stub" + + when { + process { + """ + input[0] = [[id: 'stub'],file('stub') ] + input[1] = file('stub_db') + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap new file mode 100644 index 00000000..cf210a60 --- /dev/null +++ b/modules/nf-core/rundbcan/cazymeannotation/tests/main.nf.test.snap @@ -0,0 +1,174 @@ +{ + "dbcancazyme - cazyme annotation - stub": { + "content": [ + { + "0": [ + [ + { 
+ "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ] + }, + { + "RUNDBCAN_CAZYMEANNOTATION": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:00:00.485809769" + }, + "dbcancazyme - simplified": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "4": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ], + "cazyme_annotation": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "test" + }, + 
"test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "dbcansub_results": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "versions": [ + "versions.yml:md5,40f0cf24dce2629d444781eaee026c7f" + ] + }, + { + "RUNDBCAN_CAZYMEANNOTATION": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T16:59:56.40131421" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/database/environment.yml b/modules/nf-core/rundbcan/database/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/database/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/database/main.nf b/modules/nf-core/rundbcan/database/main.nf new file mode 100644 index 00000000..617ab882 --- /dev/null +++ b/modules/nf-core/rundbcan/database/main.nf @@ -0,0 +1,38 @@ +process RUNDBCAN_DATABASE { + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + output: + path "dbcan_db" , emit: dbcan_db + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + run_dbcan database \\ + --db_dir dbcan_db + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + """ + mkdir -p dbcan_db + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/database/meta.yml b/modules/nf-core/rundbcan/database/meta.yml new file mode 100644 index 00000000..0ae300b0 --- /dev/null +++ b/modules/nf-core/rundbcan/database/meta.yml @@ -0,0 +1,36 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_database" +description: command from run_dbcan to prepare the database for dbCAN annotation. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "run_dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme annotation." 
+ homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +output: + - dbcan_db: + - dbcan_db: + type: directory + description: Download directory for dbCAN databases + pattern: "dbcan_db" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/database/tests/main.nf.test b/modules/nf-core/rundbcan/database/tests/main.nf.test new file mode 100644 index 00000000..d81a3319 --- /dev/null +++ b/modules/nf-core/rundbcan/database/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process RUNDBCAN_DATABASE" + script "../main.nf" + process "RUNDBCAN_DATABASE" + + tag "modules" + tag "modules_nfcore" + tag "rundbcan" + tag "rundbcan/database" + + test("rundbcan - database - basic") { + + when { + process { + """ + """ + } + } + + then { + assert process.success + assert path(process.out.dbcan_db.get(0)).exists() + assert path(process.out.versions[0]).exists() + assertAll( + { assert snapshot( + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("rundbcan - database - stub") { + + options "-stub" + + when { + process { + """ + """ + } + } + + then { + assert process.success + assertAll( + { assert snapshot( + file(process.out.dbcan_db.get(0)).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } +} diff --git a/modules/nf-core/rundbcan/database/tests/main.nf.test.snap b/modules/nf-core/rundbcan/database/tests/main.nf.test.snap new file mode 100644 index 00000000..e54b59a8 --- /dev/null +++ b/modules/nf-core/rundbcan/database/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "rundbcan - database - basic": { + "content": [ 
+ [ + "versions.yml:md5,b064fe90159963e182ec980f0f4677c5" + ], + { + "RUNDBCAN_DATABASE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:01:26.958626278" + }, + "rundbcan - database - stub": { + "content": [ + "dbcan_db", + [ + "versions.yml:md5,b064fe90159963e182ec980f0f4677c5" + ], + { + "RUNDBCAN_DATABASE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:01:31.197377024" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/easycgc/environment.yml b/modules/nf-core/rundbcan/easycgc/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/easycgc/main.nf b/modules/nf-core/rundbcan/easycgc/main.nf new file mode 100644 index 00000000..0095e81d --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/main.nf @@ -0,0 +1,80 @@
+process RUNDBCAN_EASYCGC { // Runs `run_dbcan easy_CGC` in protein mode for one sample and renames each result file to "${prefix}_<name>"
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' :
+        'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(input_raw_data)             // handed to --input_raw_data; meta.id is the default output prefix
+    tuple val(meta2), path(input_gff), val(gff_type)  // handed to --input_gff / --gff_type; meta2 itself is not referenced in the script
+    path dbcan_db                                     // handed to --db_dir
+
+    output:
+    tuple val(meta), path("${prefix}_overview.tsv")            , emit: cazyme_annotation // every file below is a run_dbcan result renamed by the `mv` calls in the script
+    tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv")   , emit: dbcanhmm_results
+    tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results
+    tuple val(meta), path("${prefix}_diamond.out")             , emit: dbcandiamond_results
+    tuple val(meta), path("${prefix}_cgc.gff")                 , emit: cgc_gff
+    tuple val(meta), path("${prefix}_cgc_standard_out.tsv")    , emit: cgc_standard_out
+    tuple val(meta), path("${prefix}_diamond.out.tc")          , emit: diamond_out_tc
+    tuple val(meta), path("${prefix}_TF_hmm_results.tsv")      , emit: tf_hmm_results
+    tuple val(meta), path("${prefix}_STP_hmm_results.tsv")     , emit: stp_hmm_results
+    path "versions.yml"                                        , emit: versions // written by the heredoc at the end of script/stub
+
+    when:
+    task.ext.when == null || task.ext.when // run unless task.ext.when is set to a falsy value
+
+    script: // NOTE(review): total_cgc_info.tsv is renamed below but no output declares ${prefix}_total_cgc_info.tsv — confirm whether it should be emitted
+    def args = task.ext.args ?: '' // extra command-line options, appended after the fixed ones
+    prefix = task.ext.prefix ?: "${meta.id}" // assigned without `def`; the output: paths above reference it too
+    """
+    run_dbcan easy_CGC \\
+        --mode protein \\
+        --db_dir ${dbcan_db} \\
+        --input_raw_data ${input_raw_data} \\
+        --output_dir . \\
+        --input_gff ${input_gff} \\
+        --gff_type ${gff_type} \\
+        ${args}
+
+    mv overview.tsv             ${prefix}_overview.tsv
+    mv dbCAN_hmm_results.tsv    ${prefix}_dbCAN_hmm_results.tsv
+    mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv
+    mv diamond.out              ${prefix}_diamond.out
+    mv cgc.gff                  ${prefix}_cgc.gff
+    mv cgc_standard_out.tsv     ${prefix}_cgc_standard_out.tsv
+    mv diamond.out.tc           ${prefix}_diamond.out.tc
+    mv TF_hmm_results.tsv       ${prefix}_TF_hmm_results.tsv
+    mv STP_hmm_results.tsv      ${prefix}_STP_hmm_results.tsv
+    mv total_cgc_info.tsv       ${prefix}_total_cgc_info.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ')
+    END_VERSIONS
+    """
+
+    stub: // creates empty placeholders for the renamed files; the version string still comes from a real `run_dbcan version` call
+    def args = task.ext.args ?: '' // NOTE(review): args is not referenced anywhere in the stub script
+    prefix = task.ext.prefix ?: "${meta.id}" // same prefix rule as in script:
+    """
+    touch ${prefix}_overview.tsv
+    touch ${prefix}_dbCAN_hmm_results.tsv
+    touch ${prefix}_dbCANsub_hmm_results.tsv
+    touch ${prefix}_diamond.out
+    touch ${prefix}_cgc.gff
+    touch ${prefix}_cgc_standard_out.tsv
+    touch ${prefix}_diamond.out.tc
+    touch ${prefix}_TF_hmm_results.tsv
+    touch ${prefix}_STP_hmm_results.tsv
+    touch ${prefix}_total_cgc_info.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/rundbcan/easycgc/meta.yml b/modules/nf-core/rundbcan/easycgc/meta.yml new file mode 100644 index 00000000..8e6e1f99 --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/meta.yml @@ -0,0 +1,151 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "rundbcan_easycgc" +description: CGC annotation module for the dbcan pipeline. This module is used to + annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool.
+keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - input_gff: + type: file + description: GFF file for protein sequences. + - gff_type: + type: string + description: | + Type of GFF file. Options are `NCBI_prok`, `JGI`, `NCBI_euk`, and `prodigal`. This is used to parse the GFF file correctly. + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. + +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1']` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - cgc_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc.gff: + type: file + description: | + GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - cgc_standard_out: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc_standard_out.tsv: + type: file + description: | + Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - diamond_out_tc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out.tc: + type: file + description: | + TSV file containing the diamond output for transporter annotation. + - tf_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_TF_hmm_results.tsv: + type: file + description: | + TSV file containing the transcription factor (TF) HMM annotation results. + - stp_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_STP_hmm_results.tsv: + type: file + description: | + TSV file containing the results of signal transduction proteins (STP) annotation.
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/easycgc/tests/main.nf.test b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test new file mode 100644 index 00000000..fe316389 --- /dev/null +++ b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test @@ -0,0 +1,80 @@
+nextflow_process { // nf-test suite for RUNDBCAN_EASYCGC: one real run against a freshly built database and one stub run
+
+    name "Test Process RUNDBCAN_EASYCGC"
+    script "../main.nf"
+    process "RUNDBCAN_EASYCGC"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "rundbcan"
+    tag "rundbcan/database" // the real-run test executes RUNDBCAN_DATABASE in its setup block
+    tag "rundbcan/easycgc"
+
+
+    test("easycgc - simplified") {
+
+        setup {
+            run("RUNDBCAN_DATABASE"){ // its dbcan_db output is wired into input[2] below
+                script "../../database/main.nf"
+                process { // RUNDBCAN_DATABASE is run without assigning any input
+                    """
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [id:'test'],
+                    file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true)
+                ]
+                input[1] = [
+                    [id:'test'],
+                    file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true)
+                    ,"prodigal"
+                ]
+                input[2] = RUNDBCAN_DATABASE.out.dbcan_db
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assertAll(
+                { assert snapshot(
+                    process.out, // every output channel of the process
+                    path(process.out.versions[0]).yaml // versions.yml read through nf-test's .yaml accessor
+                    ).match()
+                }
+            )
+        }
+    }
+
+    test("easycgc - stub") {
+        options "-stub" // run the process with Nextflow's -stub option
+
+        when {
+            process {
+                """
+                input[0] = [[id: 'stub'], file('stub')]
+                input[1] = [[id: 'stub'], file('stub.gff'), "prodigal"]
+                input[2] = file('stub_db')
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assertAll(
+                { assert snapshot(
+                    process.out, // every output channel of the process
+                    path(process.out.versions[0]).yaml // versions.yml read through nf-test's .yaml accessor
+                    ).match()
+                }
+            )
+        }
+    }
+}
+
diff --git a/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap new file mode 100644 index 00000000..664f7670 --- /dev/null +++
b/modules/nf-core/rundbcan/easycgc/tests/main.nf.test.snap @@ -0,0 +1,334 @@ +{ + "easycgc - simplified": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + "7": [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + "8": [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + "9": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ], + "cazyme_annotation": [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + "cgc_gff": [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "test" + }, + "test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "dbcansub_results": [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "test" + }, + 
"test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + "tf_hmm_results": [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + "versions": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ] + }, + { + "RUNDBCAN_EASYCGC": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:03:26.553650524" + }, + "easycgc - stub": { + "content": [ + { + "0": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_gff": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "stub" + }, + 
"stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tf_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,98440d2e11ce6a66cf63395467603bb9" + ] + }, + { + "RUNDBCAN_EASYCGC": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:03:30.257931353" + } +} \ No newline at end of file diff --git a/modules/nf-core/rundbcan/easysubstrate/environment.yml b/modules/nf-core/rundbcan/easysubstrate/environment.yml new file mode 100644 index 00000000..6d9a56ae --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - bioconda + - conda-forge +dependencies: + - bioconda::dbcan=5.1.2 diff --git a/modules/nf-core/rundbcan/easysubstrate/main.nf b/modules/nf-core/rundbcan/easysubstrate/main.nf new file mode 100644 index 00000000..442debfc --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/main.nf @@ -0,0 +1,92 @@ +process RUNDBCAN_EASYSUBSTRATE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/dbcan:5.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:5.1.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input_raw_data) + tuple val(meta2), path(input_gff), val(gff_type) + path dbcan_db + + output: + tuple val(meta), path("${prefix}_overview.tsv") , emit: cazyme_annotation + tuple val(meta), path("${prefix}_dbCAN_hmm_results.tsv") , emit: dbcanhmm_results + tuple val(meta), path("${prefix}_dbCANsub_hmm_results.tsv"), emit: dbcansub_results + tuple val(meta), path("${prefix}_diamond.out") , emit: dbcandiamond_results + tuple val(meta), path("${prefix}_cgc.gff") , emit: cgc_gff + tuple val(meta), path("${prefix}_cgc_standard_out.tsv") , emit: cgc_standard_out + tuple val(meta), path("${prefix}_diamond.out.tc") , emit: diamond_out_tc + tuple val(meta), path("${prefix}_TF_hmm_results.tsv") , emit: tf_hmm_results + tuple val(meta), path("${prefix}_STP_hmm_results.tsv") , emit: stp_hmm_results + tuple val(meta), path("${prefix}_total_cgc_info.tsv") , emit: total_cgc_info + tuple val(meta), path("${prefix}_substrate_prediction.tsv"), emit: substrate_prediction + tuple val(meta), path("${prefix}_synteny_pdf/") , emit: synteny_pdf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + + run_dbcan easy_substrate \\ + --mode protein \\ + --db_dir ${dbcan_db} \\ + --input_raw_data ${input_raw_data} \\ + --output_dir . 
\\ + --input_gff ${input_gff} \\ + --gff_type ${gff_type} \\ + ${args} + + mv overview.tsv ${prefix}_overview.tsv + mv dbCAN_hmm_results.tsv ${prefix}_dbCAN_hmm_results.tsv + mv dbCANsub_hmm_results.tsv ${prefix}_dbCANsub_hmm_results.tsv + mv diamond.out ${prefix}_diamond.out + mv cgc.gff ${prefix}_cgc.gff + mv cgc_standard_out.tsv ${prefix}_cgc_standard_out.tsv + mv diamond.out.tc ${prefix}_diamond.out.tc + mv TF_hmm_results.tsv ${prefix}_TF_hmm_results.tsv + mv STP_hmm_results.tsv ${prefix}_STP_hmm_results.tsv + mv total_cgc_info.tsv ${prefix}_total_cgc_info.tsv + mv CGC.faa ${prefix}_CGC.faa + mv PUL_blast.out ${prefix}_PUL_blast.out + mv substrate_prediction.tsv ${prefix}_substrate_prediction.tsv + mv synteny_pdf/ ${prefix}_synteny_pdf/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_overview.tsv + touch ${prefix}_dbCAN_hmm_results.tsv + touch ${prefix}_dbCANsub_hmm_results.tsv + touch ${prefix}_diamond.out + touch ${prefix}_cgc.gff + touch ${prefix}_cgc_standard_out.tsv + touch ${prefix}_diamond.out.tc + touch ${prefix}_TF_hmm_results.tsv + touch ${prefix}_STP_hmm_results.tsv + touch ${prefix}_total_cgc_info.tsv + touch ${prefix}_CGC.faa + touch ${prefix}_PUL_blast.out + touch ${prefix}_substrate_prediction.tsv + mkdir -p ${prefix}_synteny_pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dbcan: \$(echo \$(run_dbcan version) | cut -f2 -d':' | cut -f2 -d' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/rundbcan/easysubstrate/meta.yml b/modules/nf-core/rundbcan/easysubstrate/meta.yml new file mode 100644 index 00000000..32672c50 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/meta.yml @@ -0,0 +1,181 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json 
+name: "rundbcan_easysubstrate" +description: Substrate annotation module for the dbcan pipeline. This module is used + to annotate carbohydrate-active enzymes (CAZymes) from genomic data using the dbCAN + annotation tool. +keywords: + - dbCAN + - download + - CAZyme + - CAZyme gene Cluster + - genomes +tools: + - "dbcan": + description: "Standalone version of dbCAN annotation tool for automated CAZyme + annotation." + homepage: "https://bcb.unl.edu/dbCAN2/" + documentation: "https://run-dbcan.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/bcb-unl/run_dbcan" + doi: "10.1093/nar/gkad328" + licence: ["GPL v3-or-later"] + identifier: biotools:dbcan + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - input_raw_data: + type: file + description: FASTA file for protein sequences. + pattern: "*.{fasta,fa,faa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - input_gff: + type: file + description: GFF file for protein sequences. + - gff_type: + type: string + description: | + Type of GFF file. Options are `NCBI_prok`, `JGI`, `NCBI_euk`, and `prodigal`. This is used to parse the GFF file correctly. + - - dbcan_db: + type: directory + description: Path to the dbCAN database directory. + +output: + - cazyme_annotation: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_overview.tsv: + type: file + description: | + TSV file containing the results of dbCAN CAZyme annotation. + - dbcanhmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1']` + - ${prefix}_dbCAN_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN HMM results for CAZyme annotation. + - dbcansub_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_dbCANsub_hmm_results.tsv: + type: file + description: | + TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. + - dbcandiamond_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out: + type: file + description: | + TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - cgc_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc.gff: + type: file + description: | + GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - cgc_standard_out: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_cgc_standard_out.tsv: + type: file + description: | + Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - diamond_out_tc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_diamond.out.tc: + type: file + description: | + TSV file containing the diamond output for transporter annotation. + - tf_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_TF_hmm_results.tsv: + type: file + description: | + TSV file containing the transcription factor (TF) HMM annotation results.
+ - stp_hmm_results: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_STP_hmm_results.tsv: + type: file + description: | + TSV file containing the results of signal transduction proteins (STP) annotation. + - total_cgc_info: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_total_cgc_info.tsv: + type: file + description: | + TSV file summarizing the total additional genes in the genome. + - substrate_prediction: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_substrate_prediction.tsv: + type: file + description: | + TSV file containing the substrate predictions based on the CGC annotations from dbCAN. + - synteny_pdf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1']` + - ${prefix}_synteny_pdf/: + type: directory + description: | + Directory containing the synteny plots in PDF format for the CAZyme gene clusters (CGC) identified by dbCAN. This directory will contain one or more PDF files showing the syntenic regions of the CGC in the genome.
+ - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Xinpeng021001" +maintainers: + - "@Xinpeng021001" diff --git a/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test new file mode 100644 index 00000000..b6c016a1 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test @@ -0,0 +1,90 @@
+// nf-core modules test rundbcan/easysubstrate
+nextflow_process { // nf-test suite for RUNDBCAN_EASYSUBSTRATE: one real run against a freshly built database and one stub run
+
+    name "Test Process RUNDBCAN_EASYSUBSTRATE"
+    script "../main.nf"
+    process "RUNDBCAN_EASYSUBSTRATE"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "rundbcan"
+    tag "rundbcan/database" // the real-run test executes RUNDBCAN_DATABASE in its setup block
+    tag "rundbcan/easysubstrate"
+
+    test("easysubstrate - simplified") {
+
+        setup {
+            run("RUNDBCAN_DATABASE"){ // its dbcan_db output is wired into input[2] below
+                script "../../database/main.nf"
+                process { // RUNDBCAN_DATABASE is run without assigning any input
+                    """
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [id:'test'],
+                    file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true)
+                ]
+                input[1] = [
+                    [id:'test'],
+                    file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/gff/test1.gff', checkIfExists: true),
+                    "prodigal"
+                ]
+                input[2] = RUNDBCAN_DATABASE.out.dbcan_db
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assertAll(
+                { assert snapshot( // channels are listed one by one here instead of passing process.out
+                    process.out.cazyme_annotation,
+                    process.out.dbcanhmm_results,
+                    process.out.dbcansub_results,
+                    process.out.dbcandiamond_results,
+                    process.out.cgc_gff,
+                    process.out.cgc_standard_out,
+                    process.out.diamond_out_tc,
+                    process.out.tf_hmm_results,
+                    process.out.stp_hmm_results,
+                    process.out.total_cgc_info, // NOTE(review): substrate_prediction and synteny_pdf are not part of this snapshot — confirm this is intentional
+                    process.out.versions,
+                    path(process.out.versions[0]).yaml // versions.yml read through nf-test's .yaml accessor
+                    ).match()
+                }
+            )
+        }
+    }
+
+    test("easysubstrate - stub") {
+
+        options "-stub" // run the process with Nextflow's -stub option
+
+        when {
+            process {
+                """
+                input[0] = [[id: 'stub'], file('stub')]
+                input[1] = [[id: 'stub'], file('stub.gff'), "prodigal"]
+                input[2] = file('stub_db')
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assertAll(
+                { assert snapshot(
+                    process.out, // every output channel of the process
+                    path(process.out.versions[0]).yaml // versions.yml read through nf-test's .yaml accessor
+                    ).match()
+                }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap new file mode 100644 index 00000000..936e9472 --- /dev/null +++ b/modules/nf-core/rundbcan/easysubstrate/tests/main.nf.test.snap @@ -0,0 +1,317 @@ +{ + "easysubstrate - stub": { + "content": [ + { + "0": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "stub" + }, + "stub_substrate_prediction.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "stub" + }, + [ + + ] + ] + ], + "12": [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ], + "2": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + [ + { + "id": "stub" + }, + "stub_total_cgc_info.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cazyme_annotation": [ + [ + { + "id": "stub" + }, + "stub_overview.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_gff": [ + [
+ { + "id": "stub" + }, + "stub_cgc.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cgc_standard_out": [ + [ + { + "id": "stub" + }, + "stub_cgc_standard_out.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcandiamond_results": [ + [ + { + "id": "stub" + }, + "stub_diamond.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcanhmm_results": [ + [ + { + "id": "stub" + }, + "stub_dbCAN_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "dbcansub_results": [ + [ + { + "id": "stub" + }, + "stub_dbCANsub_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "diamond_out_tc": [ + [ + { + "id": "stub" + }, + "stub_diamond.out.tc:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stp_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_STP_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "substrate_prediction": [ + [ + { + "id": "stub" + }, + "stub_substrate_prediction.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "synteny_pdf": [ + [ + { + "id": "stub" + }, + [ + + ] + ] + ], + "tf_hmm_results": [ + [ + { + "id": "stub" + }, + "stub_TF_hmm_results.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "total_cgc_info": [ + [ + { + "id": "stub" + }, + "stub_total_cgc_info.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ] + }, + { + "RUNDBCAN_EASYSUBSTRATE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:05:25.261676793" + }, + "easysubstrate - simplified": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_overview.tsv:md5,73bd9acee752d61e096370d4fedfee54" + ] + ], + [ + [ + { + "id": "test" + }, + "test_dbCAN_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + [ + [ + { + "id": "test" + }, + "test_dbCANsub_hmm_results.tsv:md5,7ce7b6536845ffa8907b3f3fb2b77a1b" + ] + ], + [ + [ + { + "id": "test" + }, + 
"test_diamond.out:md5,c935cda6778ca2b6aaaa4362b6f24d84" + ] + ], + [ + [ + { + "id": "test" + }, + "test_cgc.gff:md5,cb1bd08c0276b4a0a37540032863a0ac" + ] + ], + [ + [ + { + "id": "test" + }, + "test_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7" + ] + ], + [ + [ + { + "id": "test" + }, + "test_diamond.out.tc:md5,4b9747475aaf438eede8556832060f83" + ] + ], + [ + [ + { + "id": "test" + }, + "test_TF_hmm_results.tsv:md5,f63f2b2b3c4439304fa1c20c19cf1e99" + ] + ], + [ + [ + { + "id": "test" + }, + "test_STP_hmm_results.tsv:md5,e03c881b99bb7c7637ed600324816b5c" + ] + ], + [ + [ + { + "id": "test" + }, + "test_total_cgc_info.tsv:md5,0b1411698abea697723acd7be2ff03a7" + ] + ], + [ + "versions.yml:md5,d5dd0946a8485d35c9593ca672a9387c" + ], + { + "RUNDBCAN_EASYSUBSTRATE": { + "dbcan": "5.1.2" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-12T17:05:21.376354173" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 2ea2baf6..ae7c3040 100644 --- a/nextflow.config +++ b/nextflow.config @@ -249,6 +249,12 @@ params { bgc_hmmsearch_savetargets = false bgc_hmmsearch_savedomains = false + // RUNDBCAN options + run_dbcan_screening = false + + dbcan_skip_cgc = false + dbcan_skip_substrate = false + // Config options config_profile_name = null config_profile_description = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 8558748d..a5ee1fb4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -64,6 +64,11 @@ "type": "boolean", "description": "Activate biosynthetic gene cluster screening tools.", "fa_icon": "fas fa-check-circle" + }, + "run_dbcan_screening": { + "type": "boolean", + "description": "Activate CAZyme and CAZyme gene cluster screening tools.", + "fa_icon": "fas fa-check-circle" } }, "fa_icon": "fa fa-list" @@ -1466,6 +1471,24 @@ }, "fa_icon": "fas fa-angle-double-right" }, + "dbcan": { + "title": "DBCAN", + "type": "object", + "description": 
"Carbohydrate-active Enzyme Annotation based on pre-defined HMM models. More info: https://run-dbcan.readthedocs.io/en/latest", + "default": "", + "properties": { + "dbcan_skip_cgc": { + "type": "boolean", + "description": "Skip CGC during the DBCAN screening.", + "fa_icon": "fas fa-ban" + }, + "dbcan_skip_substrate": { + "type": "boolean", + "description": "Skip Substrate during the DBCAN screening.", + "fa_icon": "fas fa-ban" + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index a59fe561..2d990958 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -3,16 +3,20 @@ */ include { PROKKA } from '../../modules/nf-core/prokka/main' -include { PRODIGAL } from '../../modules/nf-core/prodigal/main' -include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main' +include { PRODIGAL as PRODIGAL_GBK } from '../../modules/nf-core/prodigal/main' +include { PRODIGAL as PRODIGAL_GFF } from '../../modules/nf-core/prodigal/main' +include { PYRODIGAL as PYRODIGAL_GBK } from '../../modules/nf-core/pyrodigal/main' +include { PYRODIGAL as PYRODIGAL_GFF } from '../../modules/nf-core/pyrodigal/main' include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main' include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main' include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_GBK } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_GBK } from 
'../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' workflow ANNOTATION { take: @@ -32,31 +36,41 @@ workflow ANNOTATION { log.warn("[nf-core/funcscan] Switching annotation tool to: Pyrodigal. This is because Prodigal annotations (in GBK format) are incompatible with AMPcombi. If you specifically wish to run Prodigal instead, please skip AMP workflow or provide a pre-annotated GBK file in the samplesheet.") } - PYRODIGAL(fasta, "gbk") - GUNZIP_PYRODIGAL_FAA(PYRODIGAL.out.faa) - GUNZIP_PYRODIGAL_FNA(PYRODIGAL.out.fna) - GUNZIP_PYRODIGAL_GBK(PYRODIGAL.out.annotations) - ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + PYRODIGAL_GBK(fasta, "gbk") + PYRODIGAL_GFF(fasta, "gff") + GUNZIP_PYRODIGAL_FAA(PYRODIGAL_GBK.out.faa) + GUNZIP_PYRODIGAL_FNA(PYRODIGAL_GBK.out.fna) + GUNZIP_PYRODIGAL_GBK(PYRODIGAL_GBK.out.annotations) + GUNZIP_PYRODIGAL_GFF(PYRODIGAL_GFF.out.annotations) + ch_versions = ch_versions.mix(PYRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(PYRODIGAL_GFF.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GFF.out.versions) ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip + ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip } else if (params.annotation_tool == "prodigal") { - PRODIGAL(fasta, "gbk") - GUNZIP_PRODIGAL_FAA(PRODIGAL.out.amino_acid_fasta) - GUNZIP_PRODIGAL_FNA(PRODIGAL.out.nucleotide_fasta) - GUNZIP_PRODIGAL_GBK(PRODIGAL.out.gene_annotations) - ch_versions = ch_versions.mix(PRODIGAL.out.versions) + PRODIGAL_GBK(fasta, "gbk") + PRODIGAL_GFF(fasta, "gff") + GUNZIP_PRODIGAL_FAA(PRODIGAL_GBK.out.amino_acid_fasta) + 
GUNZIP_PRODIGAL_FNA(PRODIGAL_GBK.out.nucleotide_fasta) + GUNZIP_PRODIGAL_GBK(PRODIGAL_GBK.out.gene_annotations) + GUNZIP_PRODIGAL_GFF(PRODIGAL_GFF.out.gene_annotations) + ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GFF.out.versions) ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip + ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip } else if (params.annotation_tool == "prokka") { @@ -66,6 +80,7 @@ workflow ANNOTATION { ch_annotation_faa = PROKKA.out.faa ch_annotation_fna = PROKKA.out.fna ch_annotation_gbk = PROKKA.out.gbk + ch_annotation_gff = PROKKA.out.gff } else if (params.annotation_tool == "bakta") { @@ -87,6 +102,7 @@ workflow ANNOTATION { ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna ch_annotation_gbk = BAKTA_BAKTA.out.gbff + ch_annotation_gff = BAKTA_BAKTA.out.gff } emit: @@ -95,4 +111,5 @@ workflow ANNOTATION { faa = ch_annotation_faa // [ [meta], path(faa) ] fna = ch_annotation_fna // [ [meta], path(fna) ] gbk = ch_annotation_gbk // [ [meta], path(gbk) ] + gff = ch_annotation_gff // [ [meta], path(gff) ] } diff --git a/subworkflows/local/dbcan.nf b/subworkflows/local/dbcan.nf new file mode 100644 index 00000000..e2332dfd --- /dev/null +++ b/subworkflows/local/dbcan.nf @@ -0,0 +1,68 @@ +/* + Run rundbcan screening tools +*/ + +include { RUNDBCAN_DATABASE } from '../../modules/nf-core/rundbcan/database/main' +include { RUNDBCAN_CAZYMEANNOTATION } from '../../modules/nf-core/rundbcan/cazymeannotation/main' +include { RUNDBCAN_EASYCGC } from '../../modules/nf-core/rundbcan/easycgc/main' +include { 
RUNDBCAN_EASYSUBSTRATE } from '../../modules/nf-core/rundbcan/easysubstrate/main' + + +workflow DBCAN { + + take: + faas // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + gffs // tuple val(meta), path(ANNOTATION_ANNOTATION_TOOL.out.gff) + + main: + + ch_versions = Channel.empty() + + // When adding new tool that requires FAA, make sure to update conditions + // in funcscan.nf around annotation and AMP subworkflow execution + // to ensure annotation is executed! + ch_faas_for_rundbcan = faas + ch_gffs_for_rundbcan = gffs + + // RUN DBCAN + RUNDBCAN_DATABASE () + ch_versions = ch_versions.mix(RUNDBCAN_DATABASE.out.versions) + + // RUN CAZyme Annotation + RUNDBCAN_CAZYMEANNOTATION ( + ch_faas_for_rundbcan, + RUNDBCAN_DATABASE.out.dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) + + // Prepare input for DBCAN CGC and SUBSTRATE + ch_input_for_dbcan = ch_faas_for_rundbcan + .join(ch_gffs_for_rundbcan) + .multiMap { meta, faa, gff -> + faa: [meta, faa] + gff: [meta, gff, 'prodigal'] + } + + // Run DBCAN CGC Annotation when annotation_tool is `prodigal` or `pyrodigal` + if ( !params.dbcan_skip_cgc && (params.annotation_tool == "prodigal" || params.annotation_tool == "pyrodigal") ) { + RUNDBCAN_EASYCGC ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + RUNDBCAN_DATABASE.out.dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) + } + + // Run DBCAN Substrate Annotation when annotation tool is `prodigal` or `pyrodigal` + if ( !params.dbcan_skip_substrate && (params.annotation_tool == "prodigal" || params.annotation_tool == "pyrodigal") ) { + RUNDBCAN_EASYSUBSTRATE ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + RUNDBCAN_DATABASE.out.dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYSUBSTRATE.out.versions) + } + + emit: + versions = ch_versions +} diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index ba8f997a..56264d5b 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ 
-24,6 +24,7 @@ include { PROTEIN_ANNOTATION } from '../subworkflows/local/protein_annota include { AMP } from '../subworkflows/local/amp' include { ARG } from '../subworkflows/local/arg' include { BGC } from '../subworkflows/local/bgc' +include { DBCAN } from '../subworkflows/local/dbcan' include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* @@ -86,24 +87,26 @@ workflow FUNCSCAN { .map { meta, files -> def fasta_found = files.find { it.toString().tokenize('.').last().matches('fasta|fas|fna|fa') } def faa_found = files.find { it.toString().endsWith('.faa') } + def gff_found = files.find { it.toString().endsWith('.gff') } def gbk_found = files.find { it.toString().tokenize('.').last().matches('gbk|gbff') } def fasta = fasta_found != null ? fasta_found : [] def faa = faa_found != null ? faa_found : [] + def gff = gff_found != null ? gff_found : [] def gbk = gbk_found != null ? gbk_found : [] - [meta, fasta, faa, gbk] + [meta, fasta, faa, gff, gbk] } - .branch { meta, fasta, faa, gbk -> - preannotated: gbk != [] + .branch { meta, fasta, faa, gff, gbk -> + preannotated: gff != [] || gbk != [] fastas: true } // Duplicate and filter the duplicated file for long contigs only for BGC // This is to speed up BGC run and prevent 'no hits found' fails if (params.run_bgc_screening) { - SEQKIT_SEQ_LENGTH(ch_intermediate_input.fastas.map { meta, fasta, faa, gbk -> [meta, fasta] }) + SEQKIT_SEQ_LENGTH(ch_intermediate_input.fastas.map { meta, fasta, faa, gff, gbk -> [meta, fasta] }) ch_input_for_annotation = ch_intermediate_input.fastas - .map { meta, fasta, protein, gbk -> [meta, fasta] } + .map { meta, fasta, protein, gff, gbk -> [meta, fasta] } .mix(SEQKIT_SEQ_LENGTH.out.fastx.map { meta, fasta -> [meta + [category: 'long'], fasta] }) .filter { meta, fasta -> if (fasta != [] && fasta.isEmpty()) { @@ -122,12 +125,13 @@ workflow FUNCSCAN { */ // Some tools require annotated FASTAs - if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening 
|| params.run_bgc_screening) { + if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening || params.run_bgc_screening || params.run_dbcan_screening) { ANNOTATION(ch_input_for_annotation) ch_versions = ch_versions.mix(ANNOTATION.out.versions) ch_new_annotation = ch_input_for_annotation .join(ANNOTATION.out.faa) + .join(ANNOTATION.out.gff) .join(ANNOTATION.out.gbk) } else { @@ -138,20 +142,22 @@ workflow FUNCSCAN { ch_prepped_input = ch_new_annotation .filter { meta, fasta, faa, gbk -> meta.category != 'long' } .mix(ch_intermediate_input.preannotated) - .multiMap { meta, fasta, faa, gbk -> + .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] faas: [meta, faa] + gffs: [meta, gff] gbks: [meta, gbk] } if (params.run_bgc_screening) { ch_prepped_input_long = ch_new_annotation - .filter { meta, fasta, faa, gbk -> meta.category == 'long' } + .filter { meta, fasta, faa, gff, gbk -> meta.category == 'long' } .mix(ch_intermediate_input.preannotated) - .multiMap { meta, fasta, faa, gbk -> + .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] faas: [meta, faa] + gffs: [meta, gff] gbks: [meta, gbk] } } @@ -357,6 +363,21 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(BGC.out.versions) } + /* + DBCANs + */ + if ( params.run_dbcan_screening ) { + DBCAN ( + ch_prepped_input.faas.filter { meta, file -> + if (file != [] && file.isEmpty()) { + log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. 
DBCAN screening tools requiring this file will not be executed: ${meta.id}") + } + !file.isEmpty() + }, + ch_prepped_input.gffs + ) + } + // // Collate and save software versions // From 15f2ef50c6528d3117514cd152606ecc9bbe9c54 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 30 Jun 2025 22:51:42 -0500 Subject: [PATCH 02/55] fix missing gffs --- workflows/funcscan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 56264d5b..d8293315 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -117,7 +117,7 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(SEQKIT_SEQ_LENGTH.out.versions) } else { - ch_input_for_annotation = ch_intermediate_input.fastas.map { meta, fasta, protein, gbk -> [meta, fasta] } + ch_input_for_annotation = ch_intermediate_input.fastas.map { meta, fasta, protein, gff, gbk -> [meta, fasta] } } /* @@ -140,7 +140,7 @@ workflow FUNCSCAN { // Mix back the preannotated samples with the newly annotated ones ch_prepped_input = ch_new_annotation - .filter { meta, fasta, faa, gbk -> meta.category != 'long' } + .filter { meta, fasta, faa, gff, gbk -> meta.category != 'long' } .mix(ch_intermediate_input.preannotated) .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] From d5df4a1eaf95254195c422f955211e6d648d4c27 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 30 Jun 2025 22:52:05 -0500 Subject: [PATCH 03/55] split dbcan results by meta.id --- conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6c3aebeb..6a0d7e34 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -744,7 +744,7 @@ process { withName: RUNDBCAN_CAZYMEANNOTATION { publishDir = [ - path: { "${params.outdir}/dbcan/cazyme_annotation/" }, + path: { "${params.outdir}/dbcan/cazyme_annotation/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, ] @@ -752,7 +752,7 @@ process { withName: RUNDBCAN_EASYCGC { publishDir = [ - path: { "${params.outdir}/dbcan/cgc/" }, + path: { "${params.outdir}/dbcan/cgc/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] @@ -760,7 +760,7 @@ process { withName: RUNDBCAN_EASYSUBSTRATE { publishDir = [ - path: { "${params.outdir}/dbcan/substrate/" }, + path: { "${params.outdir}/dbcan/substrate/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] From f049e2f46c935851155fe9fcb2eafe05ecc59d4f Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 30 Jun 2025 23:28:42 -0500 Subject: [PATCH 04/55] rm constraints of annotation tool --- subworkflows/local/dbcan.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/dbcan.nf b/subworkflows/local/dbcan.nf index e2332dfd..373207ec 100644 --- a/subworkflows/local/dbcan.nf +++ b/subworkflows/local/dbcan.nf @@ -43,8 +43,8 @@ workflow DBCAN { gff: [meta, gff, 'prodigal'] } - // Run DBCAN CGC Annotation when annotation_tool is `prodigal` or `pyrodigal` - if ( !params.dbcan_skip_cgc && (params.annotation_tool == "prodigal" || params.annotation_tool == "pyrodigal") ) { + // Run DBCAN CGC Annotation + if ( !params.dbcan_skip_cgc ) { RUNDBCAN_EASYCGC ( ch_input_for_dbcan.faa, ch_input_for_dbcan.gff, @@ -53,8 +53,8 @@ workflow DBCAN { ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) } - // Run DBCAN Substrate Annotation when annotation tool is `prodigal` or `pyrodigal` - if ( !params.dbcan_skip_substrate && (params.annotation_tool == "prodigal" || params.annotation_tool == "pyrodigal") ) { + // Run DBCAN Substrate Annotation + if ( !params.dbcan_skip_substrate ) { RUNDBCAN_EASYSUBSTRATE ( ch_input_for_dbcan.faa, ch_input_for_dbcan.gff, From 8289bdb88c20a7bb77798e8a5094b12b3e29593c Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 30 Jun 2025 
23:29:21 -0500 Subject: [PATCH 05/55] add test config for rundbcan --- conf/test_dbcan_pyrodigal.config | 34 ++++++++++++++++++++++++++ conf/test_preannotated_dbcan.config | 37 +++++++++++++++++++++++++++++ nextflow.config | 6 +++++ 3 files changed, 77 insertions(+) create mode 100644 conf/test_dbcan_pyrodigal.config create mode 100644 conf/test_preannotated_dbcan.config diff --git a/conf/test_dbcan_pyrodigal.config b/conf/test_dbcan_pyrodigal.config new file mode 100644 index 00000000..746cd6aa --- /dev/null +++ b/conf/test_dbcan_pyrodigal.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_dbcan_pyrodigal, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'DBCAN Pyrodigal test profile' + config_profile_description = 'Minimal test dataset to check DBCAN workflow function' + + // Input data + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' + + annotation_tool = 'pyrodigal' + + run_arg_screening = false + run_amp_screening = false + run_bgc_screening = false + run_dbcan_screening = true +} diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_dbcan.config new file mode 100644 index 00000000..2a86f547 --- /dev/null +++ b/conf/test_preannotated_dbcan.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_preannotated_dbcan, --outdir + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'DBCAN test profile - preannotated input' + config_profile_description = 'Minimal test dataset to check DBCAN workflow function' + + // Input data + input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_preannotated.csv' + + annotation_tool = 'pyrodigal' + + run_arg_screening = false + run_amp_screening = false + run_bgc_screening = false + run_dbcan_screening = true + + dbcan_skip_cgc = true // skip cgc as .gbk is used + dbcan_skip_substrate = true // skip substrate as .gbk is used +} diff --git a/nextflow.config b/nextflow.config index ae7c3040..211e919c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -416,6 +416,12 @@ profiles { test_preannotated_bgc { includeConfig 'conf/test_preannotated_bgc.config' } + test_dbcan_pyrodigal { + includeConfig 'conf/test_dbcan_pyrodigal.config' + } + test_preannotated_dbcan { + includeConfig 'conf/test_preannotated_dbcan.config' + } } // Load nf-core custom profiles from different Institutions From d8af5e9b4738ea3937cceb179ff5f6a179e171cc Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 30 Jun 2025 23:29:33 -0500 Subject: [PATCH 06/55] add test profile for rundbcan in ci --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be05a2ea..9e36b299 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,6 +65,8 @@ jobs: - "test_taxonomy_prokka" - "test_preannotated" - "test_preannotated_bgc" + - "test_dbcan_pyrodigal" + - 
"test_preannotated_dbcan" isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev From 0a5e50598e38e457a94b926259849e26be50515a Mon Sep 17 00:00:00 2001 From: HaidYi Date: Tue, 1 Jul 2025 19:14:37 -0500 Subject: [PATCH 07/55] add dbcan in the refs --- nextflow_schema.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index a5ee1fb4..adb1e5bb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1726,6 +1726,9 @@ { "$ref": "#/$defs/bgc_hmmsearch" }, + { + "$ref": "#/$defs/dbcan" + }, { "$ref": "#/$defs/institutional_config_options" }, From 01a573a61300c0a27b6fa94b948d99faf73a057c Mon Sep 17 00:00:00 2001 From: jasmezz Date: Thu, 10 Jul 2025 16:01:57 +0200 Subject: [PATCH 08/55] Suggestions from code review --- README.md | 2 +- conf/modules.config | 2 +- conf/test_preannotated_dbcan.config | 4 +- docs/output.md | 70 ++++++++++++++--------------- nextflow_schema.json | 8 ++-- subworkflows/local/bgc.nf | 2 +- subworkflows/local/dbcan.nf | 12 ++--- 7 files changed, 50 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 536c55c8..92dd7b1a 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s 5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) 6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). 
[`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms. 7. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/) -8. Screening contigs for carbohydrate-active enzyme (CAZyme), CAZyme gene clusters and substrate with [run_dbcan](https://github.com/bcb-unl/run_dbcan). +8. Screening contigs for carbohydrate-active enzymes (CAZymes), CAZyme gene clusters and substrates with [run_dbcan](https://github.com/bcb-unl/run_dbcan). 9. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs 10. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) diff --git a/conf/modules.config b/conf/modules.config index 6a0d7e34..3f12d19a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -744,7 +744,7 @@ process { withName: RUNDBCAN_CAZYMEANNOTATION { publishDir = [ - path: { "${params.outdir}/dbcan/cazyme_annotation/${meta.id}" }, + path: { "${params.outdir}/dbcan/cazyme/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, ] diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_dbcan.config index 2a86f547..ce947816 100644 --- a/conf/test_preannotated_dbcan.config +++ b/conf/test_preannotated_dbcan.config @@ -32,6 +32,6 @@ params { run_bgc_screening = false run_dbcan_screening = true - dbcan_skip_cgc = true // skip cgc as .gbk is used - dbcan_skip_substrate = true // skip substrate as .gbk is used + dbcan_skip_cgc = true // Skip cgc annotation as .gbk (not .gff) is provided in samplesheet + dbcan_skip_substrate = true // Skip substrate annotation as .gbk (not .gff) is provided in samplesheet } diff --git a/docs/output.md b/docs/output.md index 20a145be..f56e8da2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -7,11 +7,11 @@ The output of nf-core/funcscan provides reports for each of the functional group - **antibiotic resistance genes** (tools: [ABRicate](https://github.com/tseemann/abricate), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [fARGene](https://github.com/fannyhb/fargene), [RGI](https://card.mcmaster.ca/analyze/rgi) – summarised by [hAMRonization](https://github.com/pha4ge/hAMRonization). Results from ABRicate, AMRFinderPlus, and DeepARG are normalised to [ARO](https://obofoundry.org/ontology/aro.html) by [argNorm](https://github.com/BigDataBiology/argNorm).) 
- **antimicrobial peptides** (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - **biosynthetic gene clusters** (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) -- carbohydrate-active enzyme (CAZyme) annotation, CAZyme gene clusters and substrate (tools: [run_dbcan](https://github.com/bcb-unl/run_dbcan)) +- **carbohydrate-active enzymes (CAZymes)**, CAZyme gene clusters and substrates (tools: [run_dbcan](https://github.com/bcb-unl/run_dbcan)) As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `--taxa_classification_mmseqs_db_savetmp` and `--taxa_classification_mmseqs_taxonomy_savetmp` flags are set. -Similarly, all downloaded databases are saved (i.e. 
from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi), [run_dbcan](https://github.com/bcb-unl/run_dbcan)) into the output directory `/databases/` if the `--save_db` flag was set. +Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), [AMPcombi](https://github.com/Darcy220606/AMPcombi), and/or [run_dbcan](https://github.com/bcb-unl/run_dbcan)) into the output directory `/databases/` if the `--save_db` flag was set. Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. @@ -43,7 +43,7 @@ results/ | ├── gecco/ | └── hmmsearch/ ├── dbcan/ -| ├── cazyme_annotation/ +| ├── cazyme/ | ├── cgc/ | └── substrate/ ├── databases/ @@ -68,11 +68,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes p Input contig QC with: -- [SeqKit](https://bioinf.shenwei.me/seqkit/) (default) - for separating into long- and short- categories +- [SeqKit](https://bioinf.shenwei.me/seqkit/) (default) – for separating into long- and short- categories Taxonomy classification of nucleotide sequences with: -- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. 
+- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) – for contig taxonomic classification using 2bLCA. ORF prediction and annotation with any of: @@ -103,22 +103,22 @@ Antimicrobial Peptides (AMPs): Biosynthetic Gene Clusters (BGCs): - [antiSMASH](#antismash) – biosynthetic gene cluster detection. -- [deepBGC](#deepbgc) - biosynthetic gene cluster detection, using a deep learning model. +- [deepBGC](#deepbgc) – biosynthetic gene cluster detection, using a deep learning model. - [GECCO](#gecco) – biosynthetic gene cluster detection, using Conditional Random Fields (CRFs). - [hmmsearch](#hmmsearch) – biosynthetic gene cluster detection, based on hidden Markov models. -Carbohydrate-active enzyme (CAZYMEs) +Carbohydrate-active enzymes (CAZymes): -- [run_dbcan](https://github.com/bcb-unl/run_dbcan) - carbohydrate-active enzyme (CAZyme), CAZyme gene clusters and substrate. +- [run_dbcan](https://github.com/bcb-unl/run_dbcan) – carbohydrate-active enzyme (CAZyme), CAZyme gene clusters and substrate detection. Output Summaries: -- [AMPcombi](#ampcombi) – summary report of antimicrobial peptide gene output from various detection tools. -- [hAMRonization](#hamronization) – summary of antimicrobial resistance gene output from various detection tools. -- [argNorm](#argNorm) - Normalize ARG annotations from [ABRicate](#abricate), [AMRFinderPlus](#amrfinderplus), and [DeepARG](#deeparg) to the ARO -- [comBGC](#combgc) – summary of biosynthetic gene cluster output from various detection tools. -- [MultiQC](#multiqc) – report of all software and versions used in the pipeline. -- [Pipeline information](#pipeline-information) – report metrics generated during the workflow execution.
+- [AMPcombi](#ampcombi) – summary report of antimicrobial peptide gene output from various detection tools +- [hAMRonization](#hamronization) – summary of antimicrobial resistance gene output from various detection tools +- [argNorm](#argNorm) – Normalize ARG annotations from [ABRicate](#abricate), [AMRFinderPlus](#amrfinderplus), and [DeepARG](#deeparg) to the ARO +- [comBGC](#combgc) – summary of biosynthetic gene cluster output from various detection tools +- [MultiQC](#multiqc) – report of all software and versions used in the pipeline +- [Pipeline information](#pipeline-information) – report metrics generated during the workflow execution ## Tool details @@ -483,34 +483,34 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation Output files - `dbcan/` - - `cazyme_annotation` - - `*_overview.tsv/`: TSV file containing the results of dbCAN CAZyme annotation. - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. - - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - `cazyme` + - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `cgc` - - `*_overview.tsv/`: TSV file containing the results of dbCAN CAZyme annotation. - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. 
- - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation. + - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. - - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation. - - `*_TF_hmm_results.tsv`: TSV file containing the results of Transcription factor. + - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation + - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. - `substrate` - - `*_overview.tsv/`: TSV file containing the results of dbCAN CAZyme annotation. - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation. - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation. - - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation.
+ - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation + - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation + - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation + - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. - - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation. - - `*_TF_hmm_results.tsv`: TSV file containing the results of Transcription factor. - - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. - - `*_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome. - - `*_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN. - - `*_synteny_pdf/`: Directory containing the synteny plots in PDF format for the CAZyme gene clusters (CGC) identified by dbCAN. This directory will contain one or more PDF files showing the syntenic regions of the CGC in the genome. 
+ - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation + - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening + - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation + - `*_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome + - `*_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN + - `*_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN diff --git a/nextflow_schema.json b/nextflow_schema.json index adb1e5bb..7606ba56 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1472,19 +1472,19 @@ "fa_icon": "fas fa-angle-double-right" }, "dbcan": { - "title": "DBCAN", + "title": "dbCAN", "type": "object", - "description": "Carbohydrate-active Enzyme Annotation based on pre-defined HMM models. 
More info: https://run-dbcan.readthedocs.io/en/latest", + "description": "Carbohydrate-active enzyme annotation based on pre-defined HMM models.\n\nFor more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest)", "default": "", "properties": { "dbcan_skip_cgc": { "type": "boolean", - "description": "Skip CGC during the DBCAN screening.", + "description": "Skip CGC during the dbCAN screening.", "fa_icon": "fas fa-ban" }, "dbcan_skip_substrate": { "type": "boolean", - "description": "Skip Substrate during the DBCAN screening.", + "description": "Skip substrate during the dbCAN screening.", "fa_icon": "fas fa-ban" } } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 25b21daa..e2ef0f8b 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -27,7 +27,7 @@ workflow BGC { ch_bgcresults_for_combgc = Channel.empty() // When adding new tool that requires FAA, make sure to update conditions - // in funcscan.nf around annotation and AMP subworkflow execution + // in funcscan.nf around annotation and BGC subworkflow execution // to ensure annotation is executed! ch_faa_for_bgc_hmmsearch = faas diff --git a/subworkflows/local/dbcan.nf b/subworkflows/local/dbcan.nf index 373207ec..6ff8b334 100644 --- a/subworkflows/local/dbcan.nf +++ b/subworkflows/local/dbcan.nf @@ -19,23 +19,23 @@ workflow DBCAN { ch_versions = Channel.empty() // When adding new tool that requires FAA, make sure to update conditions - // in funcscan.nf around annotation and AMP subworkflow execution + // in funcscan.nf around annotation and dbCAN subworkflow execution // to ensure annotation is executed! 
ch_faas_for_rundbcan = faas ch_gffs_for_rundbcan = gffs - // RUN DBCAN + // Download dbCAN database RUNDBCAN_DATABASE () ch_versions = ch_versions.mix(RUNDBCAN_DATABASE.out.versions) - // RUN CAZyme Annotation + // CAZyme annotation RUNDBCAN_CAZYMEANNOTATION ( ch_faas_for_rundbcan, RUNDBCAN_DATABASE.out.dbcan_db ) ch_versions = ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) - // Prepare input for DBCAN CGC and SUBSTRATE + // Prepare input for dbCAN CGC and substrate annotation ch_input_for_dbcan = ch_faas_for_rundbcan .join(ch_gffs_for_rundbcan) .multiMap { meta, faa, gff -> @@ -43,7 +43,7 @@ workflow DBCAN { gff: [meta, gff, 'prodigal'] } - // Run DBCAN CGC Annotation + // CGC annotation if ( !params.dbcan_skip_cgc ) { RUNDBCAN_EASYCGC ( ch_input_for_dbcan.faa, @@ -53,7 +53,7 @@ workflow DBCAN { ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) } - // Run DBCAN Substrate Annotation + // substrate annotation if ( !params.dbcan_skip_substrate ) { RUNDBCAN_EASYSUBSTRATE ( ch_input_for_dbcan.faa, From 5c5ec6646e74fdf6994e8edf865d08beb6e2fb1f Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 14 Jul 2025 23:14:02 -0500 Subject: [PATCH 09/55] rm duplicate outputs --- conf/modules.config | 8 +++++--- docs/output.md | 13 ------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3f12d19a..d3b7e3c1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -744,7 +744,7 @@ process { withName: RUNDBCAN_CAZYMEANNOTATION { publishDir = [ - path: { "${params.outdir}/dbcan/cazyme/${meta.id}" }, + path: { "${params.outdir}/cazyme/cazyme_annotation/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, ] @@ -752,16 +752,18 @@ process { withName: RUNDBCAN_EASYCGC { publishDir = [ - path: { "${params.outdir}/dbcan/cgc/${meta.id}" }, + path: { "${params.outdir}/cazyme/cgc/${meta.id}" }, mode: params.publish_dir_mode, + pattern: "*_{cgc.gff,cgc_standard_out.tsv,diamond.out.tc,TF_hmm_results.tsv,STP_hmm_results.tsv}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } withName: RUNDBCAN_EASYSUBSTRATE { publishDir = [ - path: { "${params.outdir}/dbcan/substrate/${meta.id}" }, + path: { "${params.outdir}/cazyme/substrate/${meta.id}" }, mode: params.publish_dir_mode, + pattern: "*_{total_cgc_info.tsv,substrate_prediction.tsv,synteny_pdf}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } diff --git a/docs/output.md b/docs/output.md index f56e8da2..9e6ceaaa 100644 --- a/docs/output.md +++ b/docs/output.md @@ -489,25 +489,12 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `cgc` - - `*_overview.tsv/`: TSV file containing the results of dbCAN CAZyme annotation - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. 
- `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. - `substrate` - - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. 
- - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation - `*_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome - `*_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN - `*_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN From 9fd005cb042563cb5edbadbb3c0ac9a29ad19a96 Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 14 Jul 2025 23:14:33 -0500 Subject: [PATCH 10/55] add manual dbCAN database download --- docs/usage.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 74da7840..b8075b3f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -25,6 +25,7 @@ To run any of the three screening workflows (AMP, ARG, and/or BGC), taxonomic cl - `--run_bgc_screening` - `--run_taxa_classification` (for optional additional taxonomic annotations) - `--run_protein_annotation` (for optional additional protein family and domain annotation) +- `--run_cazyme_annotation` (for optional additional carbohydrate-active enzyme annotation) When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. The exception is HMMsearch, which needs to be explicitly switched on and provided with HMM screening files (AMP and BGC workflows, see [parameter documentation](/funcscan/parameters)). For the taxonomic classification, MMseqs2 is currently the only tool implemented in the pipeline. Likewise, InterProScan is the only tool for protein sequence annotation. 
@@ -565,6 +566,25 @@ interproscan_db/ └── tmhmm ``` +### Run_dbCAN + +The [run_dbcan](https://github.com/bcb-unl/run_dbcan) tool requires a pre-built database to perform carbohydrate-active enzyme (CAZyme) annotation. +To download the database automatically, install the [`dbcan`](https://bioconda.github.io/recipes/dbcan/README.html) package: + +``` +conda create -n dbcan -c bioconda dbcan +conda activate dbcan +``` + +Then, download the database: + +``` +run_dbcan database --db_dir +``` + +Replace `` with your preferred directory path for storing the database files. Once the database download is complete (), the file are ready for use with the `run_dbcan` tool without additional configurations or modifications. + + ## Updating the pipeline When you run the below command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. 
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: From ea4b85261ff2aeb9ab5afbdc0a69ff37be597bfb Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 14 Jul 2025 23:15:49 -0500 Subject: [PATCH 11/55] rename DBCAN to CAZYME --- conf/test_dbcan_pyrodigal.config | 4 ++-- conf/test_preannotated_dbcan.config | 4 ++-- subworkflows/local/{dbcan.nf => cazyme.nf} | 2 +- workflows/funcscan.nf | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) rename subworkflows/local/{dbcan.nf => cazyme.nf} (99%) diff --git a/conf/test_dbcan_pyrodigal.config b/conf/test_dbcan_pyrodigal.config index 746cd6aa..19b77702 100644 --- a/conf/test_dbcan_pyrodigal.config +++ b/conf/test_dbcan_pyrodigal.config @@ -19,8 +19,8 @@ process { } params { - config_profile_name = 'DBCAN Pyrodigal test profile' - config_profile_description = 'Minimal test dataset to check DBCAN workflow function' + config_profile_name = 'CAZyme Pyrodigal test profile' + config_profile_description = 'Minimal test dataset to check CAZyme workflow function' // Input data input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_dbcan.config index ce947816..02d42c5f 100644 --- a/conf/test_preannotated_dbcan.config +++ b/conf/test_preannotated_dbcan.config @@ -19,8 +19,8 @@ process { } params { - config_profile_name = 'DBCAN test profile - preannotated input' - config_profile_description = 'Minimal test dataset to check DBCAN workflow function' + config_profile_name = 'CAZyme test profile - preannotated input' + config_profile_description = 'Minimal test dataset to check CAZyme workflow function' // Input data input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_preannotated.csv' diff --git a/subworkflows/local/dbcan.nf b/subworkflows/local/cazyme.nf similarity index 99% rename from subworkflows/local/dbcan.nf rename to 
subworkflows/local/cazyme.nf index 6ff8b334..7aea51db 100644 --- a/subworkflows/local/dbcan.nf +++ b/subworkflows/local/cazyme.nf @@ -8,7 +8,7 @@ include { RUNDBCAN_EASYCGC } from '../../modules/nf-core/rundbcan/easyc include { RUNDBCAN_EASYSUBSTRATE } from '../../modules/nf-core/rundbcan/easysubstrate/main' -workflow DBCAN { +workflow CAZYME { take: faas // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index d8293315..6f5e8379 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -24,7 +24,7 @@ include { PROTEIN_ANNOTATION } from '../subworkflows/local/protein_annota include { AMP } from '../subworkflows/local/amp' include { ARG } from '../subworkflows/local/arg' include { BGC } from '../subworkflows/local/bgc' -include { DBCAN } from '../subworkflows/local/dbcan' +include { CAZYME } from '../subworkflows/local/cazyme' include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* @@ -364,13 +364,13 @@ workflow FUNCSCAN { } /* - DBCANs + CAZYMEs */ if ( params.run_dbcan_screening ) { - DBCAN ( + CAZYME ( ch_prepped_input.faas.filter { meta, file -> if (file != [] && file.isEmpty()) { - log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. DBCAN screening tools requiring this file will not be executed: ${meta.id}") + log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. 
CAZyme screening tools requiring this file will not be executed: ${meta.id}") } !file.isEmpty() }, From 62623a571ce39c36f7acc0c40d51fb257c38ffbc Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 14 Jul 2025 23:17:35 -0500 Subject: [PATCH 12/55] add gff column in samplesheet --- assets/samplesheet.csv | 2 +- assets/schema_input.json | 8 ++++++++ workflows/funcscan.nf | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 791912cd..c568ad65 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fasta,protein,gbk +sample,fasta,protein,gbk,gff sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index be402b5f..bb323a5d 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -33,12 +33,20 @@ "exists": true, "pattern": "^\\S+\\.(gbk|gbff)(\\.gz)?$", "errorMessage": "Input file for feature annotations has incorrect file format. 
File must end in `.gbk`, `.gbk.gz` or `.gbff`, or `.gbff.gz`" + }, + "gff": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(gff|gff3)(\\.gz)?$", + "errorMessage": "Input file for feature annotations has incorrect file format. File must end in `.gff`, `.gff.gz` or `.gff3`, or `.gff3.gz`" } }, "required": ["sample", "fasta"], "dependentRequired": { "protein": ["gbk"], "gbk": ["protein"] + "gff": ["protein"] } }, "uniqueItems": true diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 6f5e8379..5adf658a 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -69,7 +69,7 @@ workflow FUNCSCAN { // Some tools require uncompressed input ch_input_prep = ch_samplesheet - .map { meta, fasta, faa, gbk -> [meta + [category: 'all'], [fasta, faa, gbk]] } + .map { meta, fasta, faa, gbk, gff -> [meta + [category: 'all'], [fasta, faa, gbk, gff]] } .transpose() .branch { compressed: it[1].toString().endsWith('.gz') From 0cad8f95c553b3cdd3a59c34a0db107bd6df14f4 Mon Sep 17 00:00:00 2001 From: haidyi Date: Mon, 14 Jul 2025 23:18:04 -0500 Subject: [PATCH 13/55] change run_dbcan_screening to run_cazyme_screening --- conf/test_dbcan_pyrodigal.config | 2 +- conf/test_preannotated_dbcan.config | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- workflows/funcscan.nf | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/test_dbcan_pyrodigal.config b/conf/test_dbcan_pyrodigal.config index 19b77702..9d431f78 100644 --- a/conf/test_dbcan_pyrodigal.config +++ b/conf/test_dbcan_pyrodigal.config @@ -30,5 +30,5 @@ params { run_arg_screening = false run_amp_screening = false run_bgc_screening = false - run_dbcan_screening = true + run_cazyme_screening = true } diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_dbcan.config index 02d42c5f..fde7e1fd 100644 --- a/conf/test_preannotated_dbcan.config +++ b/conf/test_preannotated_dbcan.config @@ -30,7 +30,7 @@ params { run_arg_screening = false 
run_amp_screening = false run_bgc_screening = false - run_dbcan_screening = true + run_cazyme_screening = true dbcan_skip_cgc = true // Skip cgc annotation as .gbk (not .gff) is provided in samplesheet dbcan_skip_substrate = true // Skip substrate annotation as .gbk (not .gff) is provided in samplesheet diff --git a/nextflow.config b/nextflow.config index 211e919c..5c829fab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -250,7 +250,7 @@ params { bgc_hmmsearch_savedomains = false // RUNDBCAN options - run_dbcan_screening = false + run_cazyme_screening = false dbcan_skip_cgc = false dbcan_skip_substrate = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 7606ba56..7b284d8c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -65,7 +65,7 @@ "description": "Activate biosynthetic gene cluster screening tools.", "fa_icon": "fas fa-check-circle" }, - "run_dbcan_screening": { + "run_cazyme_screening": { "type": "boolean", "description": "Activate CAZyme and CAZyme gene cluster screening tools.", "fa_icon": "fas fa-check-circle" diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 5adf658a..90ff8139 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -125,7 +125,7 @@ workflow FUNCSCAN { */ // Some tools require annotated FASTAs - if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening || params.run_bgc_screening || params.run_dbcan_screening) { + if ((params.run_arg_screening && !params.arg_skip_deeparg) || params.run_amp_screening || params.run_bgc_screening || params.run_cazyme_screening) { ANNOTATION(ch_input_for_annotation) ch_versions = ch_versions.mix(ANNOTATION.out.versions) @@ -366,7 +366,7 @@ workflow FUNCSCAN { /* CAZYMEs */ - if ( params.run_dbcan_screening ) { + if ( params.run_cazyme_screening ) { CAZYME ( ch_prepped_input.faas.filter { meta, file -> if (file != [] && file.isEmpty()) { From b76e3a205e1c3a6631b52401e3f4eea2bd529d56 Mon Sep 17 00:00:00 2001 From: HD Yi 
Date: Wed, 16 Jul 2025 19:24:41 -0500 Subject: [PATCH 14/55] add missing identifier Co-authored-by: James A. Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index b8075b3f..9947a35b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -571,7 +571,7 @@ interproscan_db/ The [run_dbcan](https://github.com/bcb-unl/run_dbcan) tool requires a pre-built database to perform carbohydrate-active enzyme (CAZyme) annotation. To download the database automatically, install the [`dbcan`](https://bioconda.github.io/recipes/dbcan/README.html) package: -``` +```bash conda create -n dbcan -c bioconda dbcan conda activate dbcan ``` From 0f5863a694736c925d04ac363f470e34cfc75f07 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Wed, 16 Jul 2025 19:25:02 -0500 Subject: [PATCH 15/55] add missing identifier Co-authored-by: James A. Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 9947a35b..8f0cc11a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -578,7 +578,7 @@ conda activate dbcan Then, download the database: -``` +```bash run_dbcan database --db_dir ``` From f2d79d5a2b0dcaf90f0048cea3bf46f623496089 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Wed, 16 Jul 2025 22:55:19 -0500 Subject: [PATCH 16/55] add missing conda Co-authored-by: James A. Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 8f0cc11a..e19ca298 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -569,7 +569,7 @@ interproscan_db/ ### Run_dbCAN The [run_dbcan](https://github.com/bcb-unl/run_dbcan) tool requires a pre-built database to perform carbohydrate-active enzyme (CAZyme) annotation. 
-To download the database automatically, install the [`dbcan`](https://bioconda.github.io/recipes/dbcan/README.html) package: +To download the database automatically, install the [`dbcan`](https://bioconda.github.io/recipes/dbcan/README.html) package, e.g. with conda: ```bash conda create -n dbcan -c bioconda dbcan conda activate dbcan ``` From 625ced41663f5b6fda1dc73b94af67358061faab Mon Sep 17 00:00:00 2001 From: HD Yi Date: Wed, 16 Jul 2025 22:55:32 -0500 Subject: [PATCH 17/55] fix typo Co-authored-by: James A. Fellows Yates --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index e19ca298..8efda09f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -582,7 +582,7 @@ Then, download the database: run_dbcan database --db_dir ``` -Replace `` with your preferred directory path for storing the database files. Once the database download is complete (), the file are ready for use with the `run_dbcan` tool without additional configurations or modifications. +Replace `` with your preferred directory path for storing the database files. Once the database download is complete, the files are ready for use with the `run_dbcan` tool without additional configurations or modifications.
## Updating the pipeline From 58273f1284aac8b0d344acc7c6eef9cec9c23e7d Mon Sep 17 00:00:00 2001 From: haidyi Date: Wed, 16 Jul 2025 23:01:44 -0500 Subject: [PATCH 18/55] re-organize the outdir structure of cazyme screening --- conf/modules.config | 6 +++--- docs/output.md | 33 +++++++++++++++++---------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index d3b7e3c1..006cd95f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -744,7 +744,7 @@ process { withName: RUNDBCAN_CAZYMEANNOTATION { publishDir = [ - path: { "${params.outdir}/cazyme/cazyme_annotation/${meta.id}" }, + path: { "${params.outdir}/cazyme/dbcan/cazyme_annotation/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] @@ -752,7 +752,7 @@ process { withName: RUNDBCAN_EASYCGC { publishDir = [ - path: { "${params.outdir}/cazyme/cgc/${meta.id}" }, + path: { "${params.outdir}/cazyme/dbcan/cgc/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*_{cgc.gff,cgc_standard_out.tsv,diamond.out.tc,TF_hmm_results.tsv,STP_hmm_results.tsv}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -761,7 +761,7 @@ process { withName: RUNDBCAN_EASYSUBSTRATE { publishDir = [ - path: { "${params.outdir}/cazyme/substrate/${meta.id}" }, + path: { "${params.outdir}/cazyme/dbcan/substrate/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*_{total_cgc_info.tsv,substrate_prediction.tsv,synteny_pdf}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, diff --git a/docs/output.md b/docs/output.md index 9e6ceaaa..5b99d4e5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -482,22 +482,23 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation
Output files -- `dbcan/` - - `cazyme` - - `*_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation - - `*_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation - - `*_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - - `*_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - - `cgc` - - `*_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - - `*_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. - - `*_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - - `*_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - - `*_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. 
- - `substrate` - - `*_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome - - `*_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN - - `*_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN +- `cazyme/` + - `dbcan` + - `cazyme_annotation` + - `_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation + - `_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation + - `_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation + - `_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation + - `cgc` + - `_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation + - `_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening + - `_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. + - `substrate` + - `_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome + - `_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN + - `_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN
From a638f3251a0ed965fe78fcb4ec52662470d308fe Mon Sep 17 00:00:00 2001 From: haidyi Date: Sat, 26 Jul 2025 11:37:32 -0500 Subject: [PATCH 19/55] add citation --- CITATIONS.md | 4 ++++ subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 37e595d2..8f1a4a07 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -102,6 +102,10 @@ > Alcock, B. P., Huynh, W., Chalil, R., Smith, K. W., Raphenya, A. R., Wlodarski, M. A., Edalatmand, A., Petkau, A., Syed, S. A., Tsang, K. K., Baker, S. J. C., Dave, M., McCarthy, M. C., Mukiri, K. M., Nasir, J. A., Golbon, B., Imtiaz, H., Jiang, X., Kaur, K., Kwong, M., Liang, Z. C., Niu, K. C., Shan, P., Yang, J. Y. J., Gray, K. L., Hoad, G. R., Jia, B., Bhando, T., Carfrae, L. A., Farha, M. A., French, S., Gordzevich, R., Rachwalski, K., Tu, M. M., Bordeleau, E., Dooley, D., Griffiths, E., Zubyk, H. L., Brown, E. D., Maguire, F., Beiko, R. G., Hsiao, W. W. L., Brinkman F. S. L., Van Domselaar, G., McArthur, A. G. (2023). CARD 2023: expanded curation, support for machine learning, and resistome prediction at the Comprehensive Antibiotic Resistance Database. Nucleic acids research, 51(D1):D690-D699. [DOI: 10.1093/nar/gkac920](https://doi.org/10.1093/nar/gkac920) +- [dbCAN](https://doi.org/10.1093/nar/gkad328) + + > Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin, dbCAN3: automated carbohydrate-active enzyme and substrate annotation, Nucleic Acids Research, Volume 51, Issue W1, 5 July 2023, Pages W115–W121. [DOI:10.1093/nar/gkad328](https://doi.org/10.1093/nar/gkad328) + - [SeqKit](https://bioinf.shenwei.me/seqkit/) > Shen, W., Sipos, B., & Zhao, L. (2024). SeqKit2: A Swiss army knife for sequence and alignment processing. iMeta, e191. 
[https://doi.org/10.1002/imt2.191](https://doi.org/10.1002/imt2.191) diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 27dce6cb..a095a8b1 100644 --- a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -266,6 +266,10 @@ def toolBibliographyText() { '
  • Frangenberg, J. Fellows Yates, J. A., Ibrahim, A., Perelo, L., & Beber, M. E. (2023). nf-core/funcscan: 1.0.0 - German Rollmops - 2023-02-15. https://doi.org/10.5281/zenodo.7643100
  • ', ].join(' ').replaceAll(', +.', ".").trim() + def cazyme_text = [ + !params.cazyme_skip_dbcan ? '
  • Jinfang Zheng, Qiwei Ge, Yuchen Yan, Xinpeng Zhang, Le Huang, Yanbin Yin, dbCAN3: automated carbohydrate-active enzyme and substrate annotation, Nucleic Acids Research, Volume 51, Issue W1, 5 July 2023, Pages W115–W121. DOI: 10.1093/nar/gkad328
  • ' : "" + ].join(' ').replaceAll(', +.', ".").trim() + def postprocessing_text = '
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. https://doi.org/10.1093/bioinformatics/btw354
  • ' // Special as reused in multiple subworkflows, and we don't want to cause duplicates @@ -277,6 +281,7 @@ def toolBibliographyText() { params.run_amp_screening ? amp_text : "", params.run_arg_screening ? arg_text : "", params.run_bgc_screening ? bgc_text : "", + params.run_cazyme_screening ? cazyme_text : "", hmmsearch_text, postprocessing_text, ].join(' ').trim() From a5d692b79f3308c40ca8b5718a80131ebca0b7b6 Mon Sep 17 00:00:00 2001 From: haidyi Date: Sat, 26 Jul 2025 11:38:14 -0500 Subject: [PATCH 20/55] add cazyme_skip_dbcan param --- nextflow.config | 1 + nextflow_schema.json | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 5c829fab..e60339cc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -252,6 +252,7 @@ params { // RUNDBCAN options run_cazyme_screening = false + cazyme_skip_dbcan = false dbcan_skip_cgc = false dbcan_skip_substrate = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 7b284d8c..45551b10 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1471,12 +1471,17 @@ }, "fa_icon": "fas fa-angle-double-right" }, - "dbcan": { + "cayme_dbcan": { "title": "dbCAN", "type": "object", "description": "Carbohydrate-active enzyme annotation based on pre-defined HMM models.\n\nFor more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest)", "default": "", "properties": { + "cazyme_skip_dbcan": { + "type": "boolean", + "description": "Skip dbCAN during the CAZyme screening.", + "fa_icon": "fas fa-ban" + }, "dbcan_skip_cgc": { "type": "boolean", "description": "Skip CGC during the dbCAN screening.", @@ -1727,7 +1732,7 @@ "$ref": "#/$defs/bgc_hmmsearch" }, { - "$ref": "#/$defs/dbcan" + "$ref": "#/$defs/cazyme_dbcan" }, { "$ref": "#/$defs/institutional_config_options" From da9d4a41cc10914beb5019a8bcfc4107484a6a34 Mon Sep 17 00:00:00 2001 From: haidyi Date: Sat, 26 Jul 2025 12:41:04 -0500 Subject: [PATCH 21/55] fix missing ',' 
--- assets/schema_input.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index bb323a5d..7e459fd1 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -45,7 +45,7 @@ "required": ["sample", "fasta"], "dependentRequired": { "protein": ["gbk"], - "gbk": ["protein"] + "gbk": ["protein"], "gff": ["protein"] } }, From 9f3af6c5ba60396e538c755c115543fd471dc58f Mon Sep 17 00:00:00 2001 From: haidyi Date: Sun, 24 Aug 2025 23:30:33 -0500 Subject: [PATCH 22/55] add gff type parameter for dbcan --- nextflow.config | 2 ++ nextflow_schema.json | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/nextflow.config b/nextflow.config index e60339cc..2c5dec4c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -253,8 +253,10 @@ params { run_cazyme_screening = false cazyme_skip_dbcan = false + dbcan_skip_cgc = false dbcan_skip_substrate = false + dbcan_gff_type = 'prodigal' // Config options config_profile_name = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 45551b10..e736f3dc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1491,6 +1491,14 @@ "type": "boolean", "description": "Skip substrate during the dbCAN screening.", "fa_icon": "fas fa-ban" + }, + "dbcan_gff_type": { + "type": "string", + "default": "prodigal", + "description": "GFF type used to annotate CAZymes in the input GFF file.", + "help_text": "This flag specifies the type of GFF file that is used to annotate CAZymes in the input GFF file. It is used to ensure that the correct gene annotations are used for the CAZyme screening. 
For more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest/user_guide/CGC_information_generation.html#input-gff-file-types).", + "fa_icon": "fas fa-database", + "enum": ["NCBI_prok", "prodigal", "NCBI_euk", "JGI"] } } }, From 15645fb10ad3e09db6b2f251fbd86eacf936300d Mon Sep 17 00:00:00 2001 From: haidyi Date: Sun, 24 Aug 2025 23:31:00 -0500 Subject: [PATCH 23/55] mv hard-coded gff type to params --- subworkflows/local/cazyme.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf index 7aea51db..b7a6b252 100644 --- a/subworkflows/local/cazyme.nf +++ b/subworkflows/local/cazyme.nf @@ -40,7 +40,7 @@ workflow CAZYME { .join(ch_gffs_for_rundbcan) .multiMap { meta, faa, gff -> faa: [meta, faa] - gff: [meta, gff, 'prodigal'] + gff: [meta, gff, params.dbcan_gff_type] // The same GFF type is applied to every sample in the samplesheet; mixing GFF types is not supported yet. } // CGC annotation From 8e4893684fd711a4612cd9af82c1c42819ade3d0 Mon Sep 17 00:00:00 2001 From: haidyi Date: Wed, 27 Aug 2025 16:04:58 -0500 Subject: [PATCH 24/55] fix typo --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index c5b6c704..6186b1fe 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1468,7 +1468,7 @@ }, "fa_icon": "fas fa-angle-double-right" }, - "cayme_dbcan": { + "cazyme_dbcan": { "title": "dbCAN", "type": "object", "description": "Carbohydrate-active enzyme annotation based on pre-defined HMM models.\n\nFor more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest)", From 101a1593811b597a2cd7e2b26a6597a73258ff3f Mon Sep 17 00:00:00 2001 From: HaidYi Date: Wed, 27 Aug 2025 14:30:46 -0700 Subject: [PATCH 25/55] fix format --- docs/usage.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 8efda09f..a3c2cb1d 100644 --- a/docs/usage.md +++
b/docs/usage.md @@ -584,7 +584,6 @@ run_dbcan database --db_dir Replace `` with your preferred directory path for storing the database files. Once the database download is complete, the files are ready for use with the `run_dbcan` tool without additional configurations or modifications. - ## Updating the pipeline When you run the below command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: From 9c14e24ab922feb60d4b50db7b4019cfd908f8b9 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Wed, 27 Aug 2025 14:32:36 -0700 Subject: [PATCH 26/55] fix lint issue --- ro-crate-metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 6c9ca831..46ea2b17 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-03-05T13:07:35+00:00", - "description": "

    \n \n \n \"nf-core/funcscan\"\n \n

    \n\n[![GitHub Actions CI Status](https://github.com/nf-core/funcscan/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/funcscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/funcscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7643099-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7643099)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/funcscan)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23funcscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/funcscan)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/funcscan** is a bioinformatics best-practice analysis pipeline for the screening of nucleotide sequences such as assembled contigs for functional genes. It currently features mining for antimicrobial peptides, antibiotic resistance genes and biosynthetic gene clusters.\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. 
The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/funcscan/results).\n\nThe nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631).\n\n## Pipeline summary\n\n1. Quality control of input sequences with [`SeqKit`](https://bioinf.shenwei.me/seqkit/)\n2. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)\n3. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)\n4. Annotation of coding sequences from 3. to obtain general protein families and domains with [`InterProScan`](https://github.com/ebi-pf-team/interproscan)\n5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)\n6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). [`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms.\n7. 
Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)\n8. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs\n9. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)\n\n![funcscan metro workflow](docs/images/funcscan_metro_workflow.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,fasta\nCONTROL_REP1,AEG588A1_001.fasta\nCONTROL_REP2,AEG588A1_002.fasta\nCONTROL_REP3,AEG588A1_003.fasta\n```\n\nEach row represents a (multi-)fasta file of assembled contig sequences.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/funcscan \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --run_amp_screening \\\n --run_arg_screening \\\n --run_bgc_screening\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/funcscan/usage) and the [parameter documentation](https://nf-co.re/funcscan/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/funcscan/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/funcscan/output).\n\n## Credits\n\nnf-core/funcscan was originally written by Jasmin Frangenberg, Anan Ibrahim, Louisa Perelo, Moritz E. Beber, James A. Fellows Yates.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\nAdam Talbot, Alexandru Mizeranschi, Hugo Tavares, J\u00falia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#funcscan` channel](https://nfcore.slack.com/channels/funcscan) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/funcscan for your analysis, please cite it using the following doi: [10.5281/zenodo.7643099](https://doi.org/10.5281/zenodo.7643099)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander 
Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

    \n \n \n \"nf-core/funcscan\"\n \n

    \n\n[![GitHub Actions CI Status](https://github.com/nf-core/funcscan/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/funcscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/funcscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7643099-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7643099)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.04.2-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/funcscan)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23funcscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/funcscan)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/funcscan** is a bioinformatics best-practice analysis pipeline for the screening of nucleotide sequences such as assembled contigs for functional genes. It currently features mining for antimicrobial peptides, antibiotic resistance genes and biosynthetic gene clusters.\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. 
The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/funcscan/results).\n\nThe nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631).\n\n## Pipeline summary\n\n1. Quality control of input sequences with [`SeqKit`](https://bioinf.shenwei.me/seqkit/)\n2. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)\n3. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)\n4. Annotation of coding sequences from 3. to obtain general protein families and domains with [`InterProScan`](https://github.com/ebi-pf-team/interproscan)\n5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)\n6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). [`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms.\n7. 
Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)\n8. Screening contigs for carbohydrate-active enzymes (CAZymes), CAZyme gene clusters and substrates with [run_dbcan](https://github.com/bcb-unl/run_dbcan).\n9. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs\n10. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)\n\n![funcscan metro workflow](docs/images/funcscan_metro_workflow.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,fasta\nCONTROL_REP1,AEG588A1_001.fasta\nCONTROL_REP2,AEG588A1_002.fasta\nCONTROL_REP3,AEG588A1_003.fasta\n```\n\nEach row represents a (multi-)fasta file of assembled contig sequences.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/funcscan \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --run_amp_screening \\\n --run_arg_screening \\\n --run_bgc_screening\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/funcscan/usage) and the [parameter documentation](https://nf-co.re/funcscan/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/funcscan/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/funcscan/output).\n\n## Credits\n\nnf-core/funcscan was originally written by Jasmin Frangenberg, Anan Ibrahim, Louisa Perelo, Moritz E. Beber, James A. Fellows Yates.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\nAdam Talbot, Alexandru Mizeranschi, Hugo Tavares, J\u00falia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#funcscan` channel](https://nfcore.slack.com/channels/funcscan) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/funcscan for your analysis, please cite it using the following doi: [10.5281/zenodo.7643099](https://doi.org/10.5281/zenodo.7643099)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander 
Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From 63c8b04d4ddac6c26acd9f1ca5efe2b08b7dd91d Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 28 Aug 2025 12:55:33 +0000 Subject: [PATCH 27/55] Fix snapshot --- tests/default.nf.test.snap | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index a2f71420..d20ac50a 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -35,7 +35,7 @@ }, "-profile test": { "content": [ - 57, + 61, { "ABRICATE_RUN": { "abricate": "1.0.1" @@ -98,6 +98,9 @@ "GUNZIP_PYRODIGAL_GBK": { "gunzip": 1.13 }, + "GUNZIP_PYRODIGAL_GFF": { + "gunzip": 1.13 + }, "HAMRONIZATION_ABRICATE": { "hamronization": "1.1.4" }, @@ -116,7 +119,10 @@ "MACREL_CONTIGS": { "macrel": "1.4.0" }, - "PYRODIGAL": { + "PYRODIGAL_GBK": { + "pyrodigal": "3.6.3" + }, + "PYRODIGAL_GFF": { "pyrodigal": "3.6.3" }, "RGI_CARDANNOTATION": { @@ -137,9 +143,9 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-06-18T09:03:18.913706717" + "timestamp": "2025-08-28T12:48:37.438422987" }, "rgi": { "content": [ From 03b1030fb24eaf710b6323d942f0fee78ed7c867 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 28 Aug 2025 12:59:46 +0000 Subject: [PATCH 28/55] Fix RO crate --- ro-crate-metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 461f61a2..8dae7155 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-07-08T10:27:35+00:00", - "description": "

    \n \n \n \"nf-core/funcscan\"\n \n

    \n\n[![GitHub Actions CI Status](https://github.com/nf-core/funcscan/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/funcscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/funcscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7643099-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7643099)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/funcscan)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23funcscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/funcscan)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/funcscan** is a bioinformatics best-practice analysis pipeline for the screening of nucleotide sequences such as assembled contigs for functional genes. It currently features mining for antimicrobial peptides, antibiotic resistance genes and biosynthetic gene clusters.\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. 
The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/funcscan/results).\n\nThe nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631).\n\n## Pipeline summary\n\n1. Quality control of input sequences with [`SeqKit`](https://bioinf.shenwei.me/seqkit/)\n2. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)\n3. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)\n4. Annotation of coding sequences from 3. to obtain general protein families and domains with [`InterProScan`](https://github.com/ebi-pf-team/interproscan)\n5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)\n6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). [`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms.\n7. 
Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)\n8. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs\n9. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)\n\n![funcscan metro workflow](docs/images/funcscan_metro_workflow.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,fasta\nCONTROL_REP1,AEG588A1_001.fasta\nCONTROL_REP2,AEG588A1_002.fasta\nCONTROL_REP3,AEG588A1_003.fasta\n```\n\nEach row represents a (multi-)fasta file of assembled contig sequences.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/funcscan \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --run_amp_screening \\\n --run_arg_screening \\\n --run_bgc_screening\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/funcscan/usage) and the [parameter documentation](https://nf-co.re/funcscan/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/funcscan/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/funcscan/output).\n\n## Credits\n\nnf-core/funcscan was originally written by Jasmin Frangenberg, Anan Ibrahim, Louisa Perelo, Moritz E. Beber, James A. Fellows Yates.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\nAdam Talbot, Alexandru Mizeranschi, Hugo Tavares, J\u00falia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#funcscan` channel](https://nfcore.slack.com/channels/funcscan) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/funcscan for your analysis, please cite it using the following doi: [10.5281/zenodo.7643099](https://doi.org/10.5281/zenodo.7643099)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander 
Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

    \n \n \n \"nf-core/funcscan\"\n \n

    \n\n[![GitHub Actions CI Status](https://github.com/nf-core/funcscan/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/funcscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/funcscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/funcscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7643099-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7643099)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/funcscan)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23funcscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/funcscan)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/funcscan** is a bioinformatics best-practice analysis pipeline for the screening of nucleotide sequences such as assembled contigs for functional genes. It currently features mining for antimicrobial peptides, antibiotic resistance genes and biosynthetic gene clusters.\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. 
The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/funcscan/results).\n\nThe nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631).\n\n## Pipeline summary\n\n1. Quality control of input sequences with [`SeqKit`](https://bioinf.shenwei.me/seqkit/)\n2. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)\n3. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)\n4. Annotation of coding sequences from 3. to obtain general protein families and domains with [`InterProScan`](https://github.com/ebi-pf-team/interproscan)\n5. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)\n6. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg). [`argNorm`](https://github.com/BigDataBiology/argNorm) is used to map the outputs of `DeepARG`, `AMRFinderPlus`, and `ABRicate` to the [`Antibiotic Resistance Ontology`](https://www.ebi.ac.uk/ols4/ontologies/aro) for consistent ARG classification terms.\n7. 
Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/)\n8. Screening contigs for carbohydrate-active enzymes (CAZymes), CAZyme gene clusters and substrates with [run_dbcan](https://github.com/bcb-unl/run_dbcan).\n9. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs\n10. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/)\n\n![funcscan metro workflow](docs/images/funcscan_metro_workflow.png)\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,fasta\nCONTROL_REP1,AEG588A1_001.fasta\nCONTROL_REP2,AEG588A1_002.fasta\nCONTROL_REP3,AEG588A1_003.fasta\n```\n\nEach row represents a (multi-)fasta file of assembled contig sequences.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/funcscan \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --run_amp_screening \\\n --run_arg_screening \\\n --run_bgc_screening\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/funcscan/usage) and the [parameter documentation](https://nf-co.re/funcscan/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/funcscan/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/funcscan/output).\n\n## Credits\n\nnf-core/funcscan was originally written by Jasmin Frangenberg, Anan Ibrahim, Louisa Perelo, Moritz E. Beber, James A. Fellows Yates.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\nAdam Talbot, Alexandru Mizeranschi, Hugo Tavares, J\u00falia Mir Pedrol, Martin Klapper, Mehrdad Jaberi, Robert Syme, Rosa Herbst, Vedanth Ramji, @Microbion.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#funcscan` channel](https://nfcore.slack.com/channels/funcscan) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/funcscan for your analysis, please cite it using the following doi: [10.5281/zenodo.7643099](https://doi.org/10.5281/zenodo.7643099)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander 
Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From cce04b2ccf41d72a5b6d19c4118a94576cadb7e4 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 09:57:55 -0700 Subject: [PATCH 29/55] only list top view Co-authored-by: James A. Fellows Yates --- docs/output.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/output.md b/docs/output.md index 5b99d4e5..1e83f9ad 100644 --- a/docs/output.md +++ b/docs/output.md @@ -42,10 +42,8 @@ results/ | ├── deepbgc/ | ├── gecco/ | └── hmmsearch/ -├── dbcan/ -| ├── cazyme/ -| ├── cgc/ -| └── substrate/ +├── cazyme/ +| └── dbcan/ ├── databases/ ├── multiqc/ ├── pipeline_info/ From ddd51c1204f233d3bb65559f33b8c4ce862fdfc5 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 09:58:43 -0700 Subject: [PATCH 30/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 1e83f9ad..f97aafa7 100644 --- a/docs/output.md +++ b/docs/output.md @@ -481,7 +481,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation Output files - `cazyme/` - - `dbcan` + - `dbcan/` - `cazyme_annotation` - `_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation - `_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation From b31feb6ac5316e4241802731bdea223e2d1ca704 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 09:58:58 -0700 Subject: [PATCH 31/55] Update docs/output.md Co-authored-by: James A. 
Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index f97aafa7..d3c8c60f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -482,7 +482,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `cazyme/` - `dbcan/` - - `cazyme_annotation` + - `cazyme_annotation/` - `_overview.tsv`: TSV file containing the results of dbCAN CAZyme annotation - `_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation - `_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation From 36c22d34db6e45ccee81673cb1428d0bd39d117a Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 09:59:12 -0700 Subject: [PATCH 32/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index d3c8c60f..78f45094 100644 --- a/docs/output.md +++ b/docs/output.md @@ -487,7 +487,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `_dbCAN_hmm_results.tsv`: TSV file containing the detailed dbCAN HMM results for CAZyme annotation - `_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - `_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - - `cgc` + - `cgc/` - `_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. 
- `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation From 59385f93aec1514e8d5083a50138b1d5a55162b0 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 09:59:23 -0700 Subject: [PATCH 33/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 78f45094..8c2f4641 100644 --- a/docs/output.md +++ b/docs/output.md @@ -493,7 +493,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - `_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - `_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. - - `substrate` + - `substrate/` - `_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome - `_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN - `_synteny_pdf/`: Directory containing one or more PDF files showing the syntenic regions of the CGCs in DNA sequence as identified by dbCAN From 2a6544ea9a8959a00665695664ece21514965234 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 10:06:55 -0700 Subject: [PATCH 34/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 8c2f4641..da6a0beb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -492,7 +492,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. 
- `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - `_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - - `_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation. + - `_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation - `substrate/` - `_total_cgc_info.tsv`: TSV file summarizing the total additional genes in the genome - `_substrate_prediction.tsv`: TSV file containing the substrate predictions based on the CGC annotations from dbCAN From 2dbe952f3cc84b311aff649d91a08294eba70d84 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Thu, 18 Sep 2025 10:07:13 -0700 Subject: [PATCH 35/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index da6a0beb..c1c54a07 100644 --- a/docs/output.md +++ b/docs/output.md @@ -488,7 +488,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `_dbCANsub_hmm_results.tsv`: TSV file containing the detailed dbCAN subfamily results for CAZyme annotation - `_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `cgc/` - - `_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome. + - `_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. 
- `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - `_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening From 01fb374e94b115e961763ede8d108de8c060bffb Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 22 Sep 2025 23:36:09 -0500 Subject: [PATCH 36/55] add a column: gff_type in samplesheet --- assets/schema_input.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/assets/schema_input.json b/assets/schema_input.json index 7b8a0303..733e8f37 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -40,6 +40,12 @@ "exists": true, "pattern": "^\\S+\\.(gff|gff3)(\\.gz)?$", "errorMessage": "Input file for feature annotations has incorrect file format. File must end in `.gff`, `.gff.gz` or `.gff3`, or `.gff3.gz`" + }, + "gff_type": { + "type": "string", + "enum": ["NCBI_prok", "prodigal", "NCBI_euk", "JGI"], + "errorMessage": "GFF type must be one of: NCBI_prok, prodigal, NCBI_euk, or JGI", + "meta": ["gff_type"] } }, "required": ["sample", "fasta"], From 13b82ab0eab2b28ac9122d5e1fc632bf80ccd961 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 22 Sep 2025 23:36:26 -0500 Subject: [PATCH 37/55] rm dbcan_gff_type parameter --- nextflow.config | 9 ++++----- nextflow_schema.json | 8 -------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/nextflow.config b/nextflow.config index 72f2e173..df80b9ba 100644 --- a/nextflow.config +++ b/nextflow.config @@ -250,13 +250,12 @@ params { bgc_hmmsearch_savedomains = false // RUNDBCAN options - run_cazyme_screening = false + run_cazyme_screening = false - cazyme_skip_dbcan = false + cazyme_skip_dbcan = false - dbcan_skip_cgc = false - dbcan_skip_substrate = false - dbcan_gff_type = 'prodigal' + dbcan_skip_cgc = false + dbcan_skip_substrate = false // Config options config_profile_name = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 0293d461..f25b9756 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json 
@@ -1482,14 +1482,6 @@ "type": "boolean", "description": "Skip substrate during the dbCAN screening.", "fa_icon": "fas fa-ban" - }, - "dbcan_gff_type": { - "type": "string", - "default": "prodigal", - "description": "GFF type used to annotate CAZymes in the input GFF file.", - "help_text": "This flag specifies the type of GFF file that is used to annotate CAZymes in the input GFF file. It is used to ensure that the correct gene annotations are used for the CAZyme screening. For more information check the dbCAN [documentation](https://run-dbcan.readthedocs.io/en/latest/user_guide/CGC_information_generation.html#input-gff-file-types).", - "fa_icon": "fas fa-database", - "enum": ["NCBI_prok", "prodigal", "NCBI_euk", "JGI"] } } }, From 3af937f0241a634abf6a5ea57e9a4d280188b641 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 22 Sep 2025 23:37:43 -0500 Subject: [PATCH 38/55] add option for using local dbcan db --- subworkflows/local/cazyme.nf | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf index b7a6b252..25e81123 100644 --- a/subworkflows/local/cazyme.nf +++ b/subworkflows/local/cazyme.nf @@ -24,16 +24,23 @@ workflow CAZYME { ch_faas_for_rundbcan = faas ch_gffs_for_rundbcan = gffs - // Download dbCAN database - RUNDBCAN_DATABASE () - ch_versions = ch_versions.mix(RUNDBCAN_DATABASE.out.versions) + // Prepare channel for database: user-supplied path if set, otherwise download + if (!params.cazyme_skip_dbcan && params.cazyme_dbcan_db) { + ch_dbcan_db = Channel + .fromPath(params.cazyme_dbcan_db, checkIfExists: true) + .first() + } + else if (!params.cazyme_skip_dbcan && !params.cazyme_dbcan_db) { + // Download dbCAN database + RUNDBCAN_DATABASE () + ch_versions = ch_versions.mix(RUNDBCAN_DATABASE.out.versions) + ch_dbcan_db = RUNDBCAN_DATABASE.out.dbcan_db + } - // CAZyme annotation - RUNDBCAN_CAZYMEANNOTATION ( - ch_faas_for_rundbcan, - RUNDBCAN_DATABASE.out.dbcan_db - ) - ch_versions = 
ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) + if (!params.cazyme_skip_dbcan) { + // CAZyme annotation + RUNDBCAN_CAZYMEANNOTATION (ch_faas_for_rundbcan, ch_dbcan_db) + ch_versions = ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) // Prepare input for dbCAN CGC and substrate annotation ch_input_for_dbcan = ch_faas_for_rundbcan From c28f049d68d2c0f00097343e1208f41186d45679 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Mon, 22 Sep 2025 23:38:13 -0500 Subject: [PATCH 39/55] filter samples for dbcan cgc/substrate if no gff_type provided in samplesheet --- subworkflows/local/cazyme.nf | 59 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf index 25e81123..1f194d92 100644 --- a/subworkflows/local/cazyme.nf +++ b/subworkflows/local/cazyme.nf @@ -42,32 +42,43 @@ workflow CAZYME { RUNDBCAN_CAZYMEANNOTATION (ch_faas_for_rundbcan, ch_dbcan_db) ch_versions = ch_versions.mix(RUNDBCAN_CAZYMEANNOTATION.out.versions) - // Prepare input for dbCAN CGC and substrate annotation - ch_input_for_dbcan = ch_faas_for_rundbcan - .join(ch_gffs_for_rundbcan) - .multiMap { meta, faa, gff -> - faa: [meta, faa] - gff: [meta, gff, params.dbcan_gff_type] // One samplesheet can only have one gff type, mixed mode is not supported now. 
- } + // Prepare input for dbCAN CGC and substrate annotation (samples without a gff_type are skipped) + if ( !params.dbcan_skip_cgc || !params.dbcan_skip_substrate ) { + ch_input_for_dbcan = ch_faas_for_rundbcan + .join(ch_gffs_for_rundbcan) + .filter { meta, faa, gff -> + if (!meta.gff_type) { + log.warn "Skipping sample ${meta.id ?: 'unknown'} for dbcan cgc/substrate annotation due to null gff_type" + return false + } + return true + } + .multiMap { meta, faa, gff -> + faa: [meta, faa] + gff: [meta, gff, meta.gff_type] + } - // CGC annotation - if ( !params.dbcan_skip_cgc ) { - RUNDBCAN_EASYCGC ( - ch_input_for_dbcan.faa, - ch_input_for_dbcan.gff, - RUNDBCAN_DATABASE.out.dbcan_db - ) - ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) - } + // CGC annotation + if ( !params.dbcan_skip_cgc ) { + RUNDBCAN_EASYCGC ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + ch_dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYCGC.out.versions) + } - // substrate annotation - if ( !params.dbcan_skip_substrate ) { - RUNDBCAN_EASYSUBSTRATE ( - ch_input_for_dbcan.faa, - ch_input_for_dbcan.gff, - RUNDBCAN_DATABASE.out.dbcan_db - ) - ch_versions = ch_versions.mix(RUNDBCAN_EASYSUBSTRATE.out.versions) + + // substrate annotation + if ( !params.dbcan_skip_substrate ) { + RUNDBCAN_EASYSUBSTRATE ( + ch_input_for_dbcan.faa, + ch_input_for_dbcan.gff, + ch_dbcan_db + ) + ch_versions = ch_versions.mix(RUNDBCAN_EASYSUBSTRATE.out.versions) + } + } } emit: From 796b96d42ef920860a2d551c37b6a32e5b3fe0ba Mon Sep 17 00:00:00 2001 From: HaidYi Date: Wed, 24 Sep 2025 22:21:22 -0500 Subject: [PATCH 40/55] add cazyme to toolCitationText --- subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 6faf6be7..514692b0 100644 --- a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ 
b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -197,6 +197,11 @@ def toolCitationText() { ". The output from the biosynthetic gene cluster screening tools were standardised and summarised with comBGC (Frangenberg et al. 2023).", ].join(' ').replaceAll(', +.', ".").trim() + def cazyme_text = [ + "The following carbohydrate-active enzymes (CAZymes) screening tools were used:", + !params.cazyme_skip_dbcan ? "dbCAN3 (Zheng et al. 2023)." : "", + ].join(' ').replaceAll(', +.', ".").trim() + def postprocessing_text = "Run statistics were reported using MultiQC (Ewels et al. 2016)." def citation_text = [ @@ -205,6 +210,7 @@ def toolCitationText() { params.run_amp_screening ? amp_text : "", params.run_arg_screening ? arg_text : "", params.run_bgc_screening ? bgc_text : "", + params.run_cazyme_screening ? cazyme_text : "", postprocessing_text, ].join(' ').trim() From 6de200597a41d064a43c7f10aa8fedf34b7c4db7 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Wed, 24 Sep 2025 20:24:18 -0700 Subject: [PATCH 41/55] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index c1c54a07..505a60cb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -489,7 +489,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `_diamond.out`: TSV file containing the detailed dbCAN diamond results for CAZyme annotation - `cgc/` - `_cgc.gff`: GFF file containing the CAZyme gene clusters (CGC) identified by dbCAN. This file is generated from the dbCAN annotation and contains the locations of CAZyme gene clusters in the genome - - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. This file summarizes the CAZyme gene clusters identified in the genome. + - `_cgc_standard_out.tsv`: Standard output file from dbCAN for CAZyme gene clusters (CGC) in a tabular format. 
This file summarizes the CAZyme gene clusters identified in the genome - `_diamond.out.tc`: TSV file containing the diamond output for transporter annotation - `_TF_hmm_results.tsv`: TSV file containing the results of transcription factor screening - `_STP_hmm_results.tsv`: TSV file containing the results of signaling transduction proteins (STP) annotation From 5f9b432c6fa8d96eafec15bd7bc6d2544f916fca Mon Sep 17 00:00:00 2001 From: HaidYi Date: Fri, 26 Sep 2025 23:45:54 -0500 Subject: [PATCH 42/55] update the profile name --- ...t_dbcan_pyrodigal.config => test_cazyme_pyrodigal.config} | 0 nextflow.config | 5 +++-- 2 files changed, 3 insertions(+), 2 deletions(-) rename conf/{test_dbcan_pyrodigal.config => test_cazyme_pyrodigal.config} (100%) diff --git a/conf/test_dbcan_pyrodigal.config b/conf/test_cazyme_pyrodigal.config similarity index 100% rename from conf/test_dbcan_pyrodigal.config rename to conf/test_cazyme_pyrodigal.config diff --git a/nextflow.config b/nextflow.config index df80b9ba..f468e510 100644 --- a/nextflow.config +++ b/nextflow.config @@ -253,6 +253,7 @@ params { run_cazyme_screening = false cazyme_skip_dbcan = false + cazyme_dbcan_db = null dbcan_skip_cgc = false dbcan_skip_substrate = false @@ -423,8 +424,8 @@ profiles { test_preannotated_bgc { includeConfig 'conf/test_preannotated_bgc.config' } - test_dbcan_pyrodigal { - includeConfig 'conf/test_dbcan_pyrodigal.config' + test_cazyme_pyrodigal { + includeConfig 'conf/test_cazyme_pyrodigal.config' } test_preannotated_dbcan { includeConfig 'conf/test_preannotated_dbcan.config' From e114734d4fa0c828a297166ca61fdb5c81feeb5f Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 28 Sep 2025 13:01:08 -0500 Subject: [PATCH 43/55] add cazyme_screening to default test --- conf/test.config | 2 ++ tests/default.nf.test | 12 +++++++++++- tests/default.nf.test.snap | 16 ++++++++++++++-- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index 61ad1c4d..21b32cb0 
100644 --- a/conf/test.config +++ b/conf/test.config @@ -33,4 +33,6 @@ params { run_amp_screening = true amp_run_hmmsearch = true amp_hmmsearch_models = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm' + + run_cazyme_screening = true } diff --git a/tests/default.nf.test b/tests/default.nf.test index aad94a4d..c460c913 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -116,7 +116,17 @@ nextflow_pipeline { { assert path("$outputDir/arg/fargene/sample_2/sample_2-class_b_1_2.log").text.contains("fARGene is done.") }, // hAMRonization - { assert snapshot(path("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv")).match("hamronization_summarize") } + { assert snapshot(path("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv")).match("hamronization_summarize") }, + + // dbCAN + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv").text.contains("dbCAN_hmm") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv").text.contains("CGC#") }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv").text.contains("#cgcid") }, + { assert snapshot( + path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv"), + path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv"), + path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv") + ).match('dbcan') } ) } } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap index d20ac50a..7d5be6ac 100644 --- a/tests/default.nf.test.snap +++ b/tests/default.nf.test.snap @@ -35,7 +35,7 @@ }, "-profile test": { "content": [ - 61, + 68, { "ABRICATE_RUN": { "abricate": "1.0.1" @@ -145,7 +145,7 @@ "nf-test": "0.9.2", "nextflow": "25.04.6" }, - "timestamp": "2025-08-28T12:48:37.438422987" + "timestamp": "2025-09-28T00:25:27.627242047" }, "rgi": { "content": [ @@ -206,6 +206,18 
@@ }, "timestamp": "2025-06-12T13:50:58.955107983" }, + "dbcan": { + "content": [ + "sample_2_overview.tsv:md5,f1f42b20b6438a6d9cde75415276ded6", + "sample_2_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7", + "sample_2_substrate_prediction.tsv:md5,fe2a5ea9e19c4f1108798103547ff98d" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-28T00:25:27.655500872" + }, "macrel": { "content": [ "sample_1.macrel.smorfs.faa.gz:md5,1b5e2434860e635e95324d1804a3be7b", From 6ee4dd78e37e5297594fa849e0d82dd7969a7966 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 28 Sep 2025 13:01:51 -0500 Subject: [PATCH 44/55] add test_cazyme_pyrodigal test --- tests/test_cazyme_pyrodigal.nf.test | 51 ++++++++++++++++++++++++ tests/test_cazyme_pyrodigal.nf.test.snap | 14 +++++++ 2 files changed, 65 insertions(+) create mode 100644 tests/test_cazyme_pyrodigal.nf.test create mode 100644 tests/test_cazyme_pyrodigal.nf.test.snap diff --git a/tests/test_cazyme_pyrodigal.nf.test b/tests/test_cazyme_pyrodigal.nf.test new file mode 100644 index 00000000..3b5f74f7 --- /dev/null +++ b/tests/test_cazyme_pyrodigal.nf.test @@ -0,0 +1,51 @@ +nextflow_pipeline { + + name "Test pipeline: NFCORE_FUNCSCAN" + script "main.nf" + tag "pipeline" + tag "nfcore_funcscan" + tag "test_cazyme_pyrodigal" + profile "test_cazyme_pyrodigal" + + test("-profile test_cazyme_pyrodigal") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert new File("$outputDir/pipeline_info/nf_core_funcscan_software_mqc_versions.yml").exists() }, + { assert new File("$outputDir/multiqc/multiqc_report.html").exists() }, + + // dbCAN annotation + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_dbCAN_hmm_results.tsv").text.contains("dbCAN") }, + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_dbCANsub_hmm_results.tsv").text.contains("dbCAN-sub") }, + { assert 
path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_diamond.out").exists() }, + { assert path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv").text.contains("dbCAN_hmm") }, + + // dbCAN cgc + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc.gff").exists() }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv").text.contains("CGC#") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_diamond.out.tc").exists() }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_STP_hmm_results.tsv").text.contains("HMM") }, + { assert path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_TF_hmm_results.tsv").text.contains("HMM") }, + + // dbCAN substrate + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv").text.contains("#cgcid") }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_synteny_pdf").exists() }, + { assert path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_total_cgc_info.tsv").text.contains("Annotate") }, + + // snap shot + { assert snapshot( + path("$outputDir/cazyme/dbcan/cazyme_annotation/sample_2/sample_2_overview.tsv"), + path("$outputDir/cazyme/dbcan/cgc/sample_2/sample_2_cgc_standard_out.tsv"), + path("$outputDir/cazyme/dbcan/substrate/sample_2/sample_2_substrate_prediction.tsv") + ).match('dbcan') } + ) + } + } +} diff --git a/tests/test_cazyme_pyrodigal.nf.test.snap b/tests/test_cazyme_pyrodigal.nf.test.snap new file mode 100644 index 00000000..8a1d5e2c --- /dev/null +++ b/tests/test_cazyme_pyrodigal.nf.test.snap @@ -0,0 +1,14 @@ +{ + "dbcan": { + "content": [ + "sample_2_overview.tsv:md5,f1f42b20b6438a6d9cde75415276ded6", + "sample_2_cgc_standard_out.tsv:md5,6be9ab29b289ff46cc6e4b6fe48dc3d7", + "sample_2_substrate_prediction.tsv:md5,fe2a5ea9e19c4f1108798103547ff98d" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-28T00:25:17.456937683" + 
} +} \ No newline at end of file From 18ba885f738528f8ed0ab1ee6fc9d93f038d597b Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 28 Sep 2025 13:02:42 -0500 Subject: [PATCH 45/55] add cazyme_dbcan_db to params --- nextflow_schema.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index f25b9756..9100d23c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1473,6 +1473,12 @@ "description": "Skip dbCAN during the CAZyme screening.", "fa_icon": "fas fa-ban" }, + "cazyme_dbcan_db": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "Path to local dbCAN database folder.", + "help_text": "For more information on preparing the dbCAN database, refer to the [documentation](https://run-dbcan.readthedocs.io/en/latest/user_guide/prepare_the_database.html)." + }, "dbcan_skip_cgc": { "type": "boolean", "description": "Skip CGC during the dbCAN screening.", From 161d37d3620527a0cdbc8a1c9a6e54b55b739400 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 28 Sep 2025 13:02:48 -0500 Subject: [PATCH 46/55] fix bug --- subworkflows/local/cazyme.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf index 1f194d92..b957ea29 100644 --- a/subworkflows/local/cazyme.nf +++ b/subworkflows/local/cazyme.nf @@ -27,7 +27,7 @@ workflow CAZYME { // Prepare channel for database if (!params.cazyme_skip_dbcan && params.cazyme_dbcan_db) { ch_dbcan_db = Channel - .from(params.cazyme_dbcan_db, checkIfExists: true) + .fromPath(params.cazyme_dbcan_db, checkIfExists: true) .first() } else if (!params.cazyme_skip_dbcan && !params.cazyme_dbcan_db) { @@ -47,8 +47,8 @@ workflow CAZYME { ch_input_for_dbcan = ch_faas_for_rundbcan .join(ch_gffs_for_rundbcan) .filter { meta, faa, gff -> - if (meta.gff_type == null) { - log.warn "Skipping sample ${meta.id ?: 'unknown'} for dbcan cgc/substrate annotation due to null gff_type" + if (!gff || !meta.gff_type) { +
log.warn "Skipping sample: ${meta.id ?: 'unknown'} for dbcan cgc and substrate annotation due to empty gff or gff_type" return false } return true From d505ea6366f1f5f5f44e7c46c78c3dede5c2a335 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 28 Sep 2025 13:03:55 -0500 Subject: [PATCH 47/55] add gff_type in meta for cazyme screening --- workflows/funcscan.nf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 90ff8139..cb42b377 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -139,8 +139,18 @@ workflow FUNCSCAN { } // Mix back the preannotated samples with the newly annotated ones - ch_prepped_input = ch_new_annotation + ch_new_annotation_short = ch_new_annotation .filter { meta, fasta, faa, gff, gbk -> meta.category != 'long' } + + // Add gff_type to meta for cazyme screening + if ((params.run_cazyme_screening && !params.cazyme_skip_dbcan && (!params.dbcan_skip_cgc || !params.dbcan_skip_substrate)) && params.annotation_tool in ['pyrodigal', 'prodigal', 'prokka', 'bakta']) { + ch_new_annotation_short.map { meta, fasta, faa, gff, gbk -> + def new_meta = meta + [gff_type: 'prodigal'] // Only Use 'prodigal' as dbcan does not distinguish 'pyrodigal' and 'prodigal' + [new_meta, fasta, faa, gff, gbk] + }.set { ch_new_annotation_short } + } + + ch_prepped_input = ch_new_annotation_short .mix(ch_intermediate_input.preannotated) .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] From f5ed73e3bbc24e6e162ab1b2a39f9fe8e2813eba Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 8 Oct 2025 09:14:19 +0000 Subject: [PATCH 48/55] [automated] Fix code linting --- workflows/funcscan.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index cb42b377..da2d7310 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -145,7 +145,7 @@ workflow FUNCSCAN { // Add gff_type to meta for cazyme screening if 
((params.run_cazyme_screening && !params.cazyme_skip_dbcan && (!params.dbcan_skip_cgc || !params.dbcan_skip_substrate)) && params.annotation_tool in ['pyrodigal', 'prodigal', 'prokka', 'bakta']) { ch_new_annotation_short.map { meta, fasta, faa, gff, gbk -> - def new_meta = meta + [gff_type: 'prodigal'] // Only Use 'prodigal' as dbcan does not distinguish 'pyrodigal' and 'prodigal' + def new_meta = meta + [gff_type: 'prodigal'] // Only Use 'prodigal' as dbcan does not distinguish 'pyrodigal' and 'prodigal' [new_meta, fasta, faa, gff, gbk] }.set { ch_new_annotation_short } } From 0ef0adeab62bad628b1f1b099f70d153bb055d63 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 23 Nov 2025 19:46:39 -0800 Subject: [PATCH 49/55] rm `set` and use = --- workflows/funcscan.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index da2d7310..d7ca5228 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -144,13 +144,15 @@ workflow FUNCSCAN { // Add gff_type to meta for cazyme screening if ((params.run_cazyme_screening && !params.cazyme_skip_dbcan && (!params.dbcan_skip_cgc || !params.dbcan_skip_substrate)) && params.annotation_tool in ['pyrodigal', 'prodigal', 'prokka', 'bakta']) { - ch_new_annotation_short.map { meta, fasta, faa, gff, gbk -> + ch_new_annotation_for_mixing = ch_new_annotation_short.map { meta, fasta, faa, gff, gbk -> def new_meta = meta + [gff_type: 'prodigal'] // Only Use 'prodigal' as dbcan does not distinguish 'pyrodigal' and 'prodigal' [new_meta, fasta, faa, gff, gbk] - }.set { ch_new_annotation_short } + } + } else { + ch_new_annotation_for_mixing = ch_new_annotation_short } - ch_prepped_input = ch_new_annotation_short + ch_prepped_input = ch_new_annotation_for_mixing .mix(ch_intermediate_input.preannotated) .multiMap { meta, fasta, faa, gff, gbk -> fastas: [meta, fasta] From fdb238a9a557f3f432a3f50c3e81120d79c71cdb Mon Sep 17 00:00:00 2001 From: HD Yi Date: Sun, 23 Nov 2025 
19:48:13 -0800 Subject: [PATCH 50/55] Update subworkflows/local/cazyme.nf Co-authored-by: James A. Fellows Yates --- subworkflows/local/cazyme.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/cazyme.nf b/subworkflows/local/cazyme.nf index b957ea29..7c77e3ea 100644 --- a/subworkflows/local/cazyme.nf +++ b/subworkflows/local/cazyme.nf @@ -15,7 +15,6 @@ workflow CAZYME { gffs // tuple val(meta), path(ANNOTATION_ANNOTATION_TOOL.out.gff) main: - ch_versions = Channel.empty() // When adding new tool that requires FAA, make sure to update conditions From 6af2309b4549cd6b2cb7d170759b795433db7b97 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Sun, 23 Nov 2025 19:52:50 -0800 Subject: [PATCH 51/55] Update workflows/funcscan.nf Co-authored-by: James A. Fellows Yates --- workflows/funcscan.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index d7ca5228..05bbc5ab 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -388,6 +388,7 @@ workflow FUNCSCAN { }, ch_prepped_input.gffs ) + ch_versions = ch_versions.mix(CAZYME.out.versions) } // From e4a478459824a2bc6193445a39b879f0ecb64354 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Sun, 23 Nov 2025 20:00:46 -0800 Subject: [PATCH 52/55] Update nextflow.config Co-authored-by: James A. Fellows Yates --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index de530c14..8fb6e4f5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -425,7 +425,7 @@ profiles { test_cazyme_pyrodigal { includeConfig 'conf/test_cazyme_pyrodigal.config' } - test_preannotated_dbcan { + test_preannotated_cazyme { includeConfig 'conf/test_preannotated_dbcan.config' } } From 41e9466b3ebc6113045242a50d13f342a709d4c3 Mon Sep 17 00:00:00 2001 From: HD Yi Date: Sun, 23 Nov 2025 20:00:54 -0800 Subject: [PATCH 53/55] Update nextflow.config Co-authored-by: James A. 
Fellows Yates --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 8fb6e4f5..15fc082c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -426,7 +426,7 @@ profiles { includeConfig 'conf/test_cazyme_pyrodigal.config' } test_preannotated_cazyme { - includeConfig 'conf/test_preannotated_dbcan.config' + includeConfig 'conf/test_preannotated_cazyme.config' } } // Load nf-core custom profiles from different institutions From 6cb3a41ee38f7979c32782f7de78f6eb42674cf2 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 23 Nov 2025 20:20:49 -0800 Subject: [PATCH 54/55] update the name --- ...annotated_dbcan.config => test_preannotated_cazyme.config} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename conf/{test_preannotated_dbcan.config => test_preannotated_cazyme.config} (83%) diff --git a/conf/test_preannotated_dbcan.config b/conf/test_preannotated_cazyme.config similarity index 83% rename from conf/test_preannotated_dbcan.config rename to conf/test_preannotated_cazyme.config index fde7e1fd..3f21cd76 100644 --- a/conf/test_preannotated_dbcan.config +++ b/conf/test_preannotated_cazyme.config @@ -32,6 +32,6 @@ params { run_bgc_screening = false run_cazyme_screening = true - dbcan_skip_cgc = true // Skip cgc annotation as .gbk (not .gff) is provided in samplesheet - dbcan_skip_substrate = true // Skip substrate annotation as .gbk (not .gff) is provided in samplesheet + dbcan_skip_cgc = false // cgc annotation is not skipped; samples with an empty gff or gff_type are skipped with a warning in the CAZYME subworkflow + dbcan_skip_substrate = false // substrate annotation is not skipped; samples with an empty gff or gff_type are skipped with a warning in the CAZYME subworkflow } From 99c0c19085cb79d3933a6d996cdc42f4686075e0 Mon Sep 17 00:00:00 2001 From: HaidYi Date: Sun, 23 Nov 2025 20:30:59 -0800 Subject: [PATCH 55/55] add the icon --- nextflow_schema.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index c2c3f76e..a7b6d217 100644 ---
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1489,7 +1489,8 @@ "description": "Skip substrate during the dbCAN screening.", "fa_icon": "fas fa-ban" } - } + }, + "fa_icon": "fas fa-angle-double-right" }, "institutional_config_options": { "title": "Institutional config options",