diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 279a228c..01a8c576 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,7 +17,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/ebi-metageno - [ ] If you've fixed a bug or added code that should be tested, add tests! - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/ebi-metagenomics/mettannotator/tree/master/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. 
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e1876..1fcafe88 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875f..40acc23f 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: 
dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/CITATIONS.md b/CITATIONS.md index 1d48ce31..79879ee6 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -22,6 +22,10 @@ > Seemann T. Prokka: rapid prokaryotic genome annotation. Bioinformatics. 2014 Jul 15;30(14):2068-9. doi: 10.1093/bioinformatics/btu153. Epub 2014 Mar 18. PMID: 24642063. +- [Bakta](https://pubmed.ncbi.nlm.nih.gov/34739369/) + + > Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification. Microb Genom. 2021 Nov;7(11):000685. doi: 10.1099/mgen.0.000685. PMID: 34739369; PMCID: PMC8743544. + - [InterProScan](https://pubmed.ncbi.nlm.nih.gov/24451626/) > Jones P, Binns D, Chang HY, Fraser M, Li W, McAnulla C, McWilliam H, Maslen J, Mitchell A, Nuka G, Pesseat S, Quinn AF, Sangrador-Vegas A, Scheremetjew M, Yong SY, Lopez R, Hunter S. InterProScan 5: genome-scale protein function classification. Bioinformatics. 2014 May 1;30(9):1236-40. doi: 10.1093/bioinformatics/btu031. Epub 2014 Jan 21. PMID: 24451626; PMCID: PMC3998142. 
diff --git a/README.md b/README.md index 7cad46e0..cfe7a652 100644 --- a/README.md +++ b/README.md @@ -34,28 +34,31 @@ The workflow uses the following tools and databases: -| Tool/Database | Version | Purpose | -| ------------------------------------------------------------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------- | -| [Prokka](https://github.com/tseemann/prokka) | 1.14.6 | CDS calling and functional annotation | -| [InterProScan](https://www.ebi.ac.uk/interpro/about/interproscan/) | 5.62-94.0 | Protein annotation (InterPro, Pfam) | -| [eggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper) | 2.1.11 | Protein annotation (eggNOG, KEGG, COG, GO-terms) | -| [eggNOG DB](http://eggnog6.embl.de/download/) | 5.0.2 | Database for eggNOG-mapper | -| [UniFIRE](https://gitlab.ebi.ac.uk/uniprot-public/unifire) | 2023.4 | Protein annotation | -| [AMRFinderPlus](https://github.com/ncbi/amr) | 3.11.4 | Antimicrobial resistance gene annotation; virulence factors, biocide, heat, acid, and metal resistance gene annotation | -| [AMRFinderPlus DB](https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/) | 3.11 2023-02-23.1 | Database for AMRFinderPlus | -| [DefenseFinder](https://github.com/mdmparis/defense-finder) | 1.2.0 | Annotation of anti-phage systems | -| [DefenseFinder models](https://github.com/mdmparis/defense-finder-models) | 1.2.3 | Database for DefenseFinder | -| [GECCO](https://github.com/zellerlab/GECCO) | 0.9.8 | Biosynthetic gene cluster annotation | -| [antiSMASH](https://antismash.secondarymetabolites.org/#!/download) | 7.1.0 | Biosynthetic gene cluster annotation | -| [SanntiS](https://github.com/Finn-Lab/SanntiS) | 0.9.3.4 | Biosynthetic gene cluster annotation | -| [run_dbCAN](https://github.com/linnabrown/run_dbcan) | 4.1.2 | PUL prediction | -| [dbCAN DB](https://bcb.unl.edu/dbCAN2/download/Databases/) | V12 | 
Database for run_dbCAN | -| [CRISPRCasFinder](https://github.com/dcouvin/CRISPRCasFinder) | 4.3.2 | Annotation of CRISPR arrays | -| [cmscan](http://eddylab.org/infernal/) | 1.1.5 | ncRNA predictions | -| [Rfam](https://rfam.org/) | 14.9 | Identification of SSU/LSU rRNA and other ncRNAs | -| [tRNAscan-SE](https://github.com/UCSC-LoweLab/tRNAscan-SE) | 2.0.9 | tRNA predictions | -| [VIRify](https://github.com/EBI-Metagenomics/emg-viral-pipeline) | 2.0.0 | Viral sequence annotation (runs separately) | -| [Mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) | 2.0 | Mobilome annotation (runs separately) | +| Tool/Database | Version | Purpose | +| ------------------------------------------------------------------------------------------------ | --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| [Prokka](https://github.com/tseemann/prokka) | 1.14.6 | CDS calling and functional annotation (default) | +| [Bakta](https://github.com/oschwengers/bakta) | 1.9.3 | CDS calling and functional annotation (if --bakta flag is used) | +| [Bakta db](https://zenodo.org/record/10522951/) | 2024-01-19 with AMRFinderPlus DB 2024-01-31.1 | Bakta DB (when Bakta is used as the gene caller) | +| [InterProScan](https://www.ebi.ac.uk/interpro/about/interproscan/) | 5.62-94.0 | Protein annotation (InterPro, Pfam) | +| [eggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper) | 2.1.11 | Protein annotation (eggNOG, KEGG, COG, GO-terms) | +| [eggNOG DB](http://eggnog6.embl.de/download/) | 5.0.2 | Database for eggNOG-mapper | +| [UniFIRE](https://gitlab.ebi.ac.uk/uniprot-public/unifire) | 2023.4 | Protein annotation | +| [AMRFinderPlus](https://github.com/ncbi/amr) | 3.12.8 | Antimicrobial resistance gene annotation; virulence factors, biocide, heat, acid, and metal resistance gene annotation | +| [AMRFinderPlus 
DB](https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/) | 3.12 2024-01-31.1 | Database for AMRFinderPlus | +| [DefenseFinder](https://github.com/mdmparis/defense-finder) | 1.2.0 | Annotation of anti-phage systems | +| [DefenseFinder models](https://github.com/mdmparis/defense-finder-models) | 1.2.3 | Database for DefenseFinder | +| [GECCO](https://github.com/zellerlab/GECCO) | 0.9.8 | Biosynthetic gene cluster annotation | +| [antiSMASH](https://antismash.secondarymetabolites.org/#!/download) | 7.1.0 | Biosynthetic gene cluster annotation | +| [SanntiS](https://github.com/Finn-Lab/SanntiS) | 0.9.3.4 | Biosynthetic gene cluster annotation | +| [run_dbCAN](https://github.com/linnabrown/run_dbcan) | 4.1.2 | PUL prediction | +| [dbCAN DB](https://bcb.unl.edu/dbCAN2/download/Databases/) | V12 | Database for run_dbCAN | +| [CRISPRCasFinder](https://github.com/dcouvin/CRISPRCasFinder) | 4.3.2 | Annotation of CRISPR arrays | +| [cmscan](http://eddylab.org/infernal/) | 1.1.5 | ncRNA predictions | +| [Rfam](https://rfam.org/) | 14.9 | Identification of SSU/LSU rRNA and other ncRNAs | +| [tRNAscan-SE](https://github.com/UCSC-LoweLab/tRNAscan-SE) | 2.0.9 | tRNA predictions | +| [pyCirclize](https://github.com/moshi4/pyCirclize) | 1.4.0 | Visualise the merged GFF file | +| [VIRify](https://github.com/EBI-Metagenomics/emg-viral-pipeline) | 2.0.0 | Viral sequence annotation (runs separately) | +| [Mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) | 2.0 | Mobilome annotation (runs separately) | @@ -79,13 +82,14 @@ The pipeline needs reference databases in order to work, they take roughly 110G. 
| ------------------- | ---- | | amrfinder | 217M | | antismash | 9.4G | +| bakta | 71G | | dbcan | 7.5G | | defense_finder | 242M | | eggnog | 48G | | interproscan | 45G | | interpro_entry_list | 2.6M | | rfam_models | 637M | -| total | 110G | +| total | 180G | `mettannotator` has an automated mechanism to download the databases using the `--dbs ` flag. When this flag is provided, the pipeline inspects the folder to verify if the required databases are already present. If any of the databases are missing, the pipeline will automatically download them. @@ -112,14 +116,66 @@ EC_ASM584v2,/path/to/GCF_000005845.2.fna,562 Here, `prefix` is the prefix and the locus tag that will be assigned to output files and proteins during the annotation process; +maximum length is 24 characters; `assembly` is the path to where the assembly file in FASTA format is located; -`taxid` is the NCBI taxid (if the species-level taxid is not known, a taxid for a higher taxonomic level can be used). +`taxid` is the NCBI TaxId (if the species-level TaxId is not known, a TaxId for a higher taxonomic level can be used). If the taxonomy is known, look up the TaxID [here](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi). -### Command +#### Finding TaxIds -Running the tool with the `--help` option will pull the repository and display the help message: +If TaxIds for input genomes are not known, a tool such as [CAT/BAT](https://github.com/MGXlab/CAT_pack) can be used. +Follow the [instructions](https://github.com/MGXlab/CAT_pack?tab=readme-ov-file#installation) for getting the tool and downloading the NCBI nr database for it. 
+ +If using CAT/BAT, here is the suggested process for making the `mettannotator` input file: + +```bash +# Run BAT on each input genome, saving all results to the same folder +CAT bins -b ${genome_name}.fna -d ${path_to_CAT_database} -t ${path_to_CAT_tax_folder} -o BAT_results/${genome_name} + +# Optional: to check what taxa were assigned, you can add names to them +CAT add_names -i BAT_results/${genome_name}.bin2classification.txt -o BAT_results/${genome_name}.name.txt -t ${path_to_CAT_tax_folder} +``` + +To generate an input file for `mettannotator`, use [generate_input_file.py](preprocessing/generate_input_file.py): + +``` +python3 preprocessing/generate_input_file.py -h +usage: generate_input_file.py [-h] -i INFILE -d INPUT_DIR -b BAT_DIR -o OUTFILE [--no-prefix] + +The script takes a list of genomes and the taxonomy results generated by BAT and makes a +mettannotator input csv file. The user has the option to either use the genome file name +(minus the extension) as the prefix for mettannotator or leave the prefix off and fill it +out themselves after the script generates an input file with just the FASTA location and +the taxid. It is expected that for all genomes, BAT results are stored in the same folder +and are named as {fasta_base_name}.bin2classification.txt. The script will use the lowest- +level taxid without an asterisk as the taxid for the genome. + +optional arguments: + -h, --help show this help message and exit + -i INFILE A file containing a list of genome files to include (file name only, with file + extension, unzipped, one file per line). + -d INPUT_DIR Full path to the directory where the input FASTA files are located. + -b BAT_DIR Folder with BAT results. Results for all genomes should be in the same folder + and should be named {fasta_base_name}.bin2classification.txt + -o OUTFILE Path to the file where the output will be saved to. 
+ --no-prefix Skip prefix generation and leave the first column of the output file empty for + the user to fill out. Default: False +``` + +For example: + +```bash +python3 generate_input_file.py -i list_of_genome_fasta_files.txt -d /path/to/the/fasta/files/folder/ -b BAT_results/ -o mettannotator_input.csv +``` + +It is always best to check the outputs to ensure the results are as expected. Correct any wrongly detected taxa before starting `mettannotator`. + +Note that, by default, the script uses FASTA file names as prefixes and truncates them to 24 characters if they exceed the limit. + +### Running mettannotator + +Running `mettannotator` with the `--help` option will pull the repository and display the help message: ```angular2html nextflow run ebi-metagenomics/mettannotator/main.nf --help @@ -173,6 +229,9 @@ Reference databases Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. +Other parameters + --bakta [boolean] Use Bakta instead of Prokka for CDS annotation. Prokka will still be used for archaeal genomes. + !! Hiding 17 params, use --validationShowHiddenParams to show them !! ------------------------------------------------------ If you use ebi-metagenomics/mettannotator for your analysis please cite: @@ -201,6 +260,13 @@ nextflow run ebi-metagenomics/mettannotator \ > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +### Gene caller choice + +By default, `mettannotator` uses Prokka to identify protein-coding genes. Users can choose to use Bakta instead by +running `mettannotator` with the `--bakta` flag. `mettannotator` runs Bakta without ncRNA and CRISPR +annotation as these are produced by separate tools in the pipeline. Archaeal genomes will continue to be annotated using +Prokka as Bakta is only intended for annotation of bacterial genomes. 
+ ## Test @@ -277,6 +343,10 @@ The two main output files for each genome are located in `//func Both files include the genome sequence in the FASTA format at the bottom of the file. +Additionally, for genomes with no more than 50 annotated contigs, a Circos plot of the `_annotations.gff` file is generated and included in the same folder. An example of such plot is shown below: + + + #### Data sources Below is an explanation of how each field in column 3 and 9 of the final GFF file is populated. In most cases, information is taken as is from the reporting tool's output. @@ -288,7 +358,7 @@ Below is an explanation of how each field in column 3 and 9 of the final GFF fil | LeftFLANK, RightFLANK | all\* | CRISPRCasFinder | CRISPR array flanking sequence | | CRISPRdr | all\* | CRISPRCasFinder | Direct repeat region of a CRISPR array | | CRISPRspacer | all\* | CRISPRCasFinder | CRISPR spacer | -| CDS | `ID`, `eC_number`, `Name`, `Dbxref`, `gene`, `inference`, `locus_tag` | Prokka | Protein annotation | +| CDS | `ID`, `eC_number`, `Name`, `Dbxref`, `gene`, `inference`, `locus_tag` | Prokka/Bakta | Protein annotation | | CDS | `product` | mettannotator | Product assigned as described in [ Determining the product ](#product) | | CDS | `product_source` | mettannotator | Tool that reported the product chosen by mettannotator | | CDS | `eggNOG` | eggNOG-mapper | Seed ortholog from eggNOG | @@ -341,9 +411,64 @@ Note: if the pipeline completed without errors but some of the tool-specific out ## Mobilome annotation -The mobilome annotation workflow is not currently integrated into `mettannotator`. However, the outputs produced by `mettannotator` can be used to run [VIRify](https://github.com/EBI-Metagenomics/emg-viral-pipeline) and the [mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline). +The mobilome annotation workflow is not currently integrated into `mettannotator`. 
However, the outputs produced by `mettannotator` can be used to run [VIRify](https://github.com/EBI-Metagenomics/emg-viral-pipeline) and the [mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline) and the outputs of these tools can be integrated back into the GFF file produced by `mettannotator`. + +After installing both tools, follow these steps to add the mobilome annotation: + +1. Run the [viral annotation pipeline](https://github.com/EBI-Metagenomics/emg-viral-pipeline): -These post-processing steps will be better integrated in the next release. +```bash +nextflow run \ + emg-viral-pipeline/virify.nf \ + -profile \ + --fasta \ + --output +``` + +2. Run the [mobilome annotation pipeline](https://github.com/EBI-Metagenomics/mobilome-annotation-pipeline): + +```bash +nextflow run mobilome-annotation-pipeline/main.nf \ + --assembly \ + --user_genes true \ + --prot_gff /functional_annotation/merged_gff/_annotations.gff \ + --virify true # only if the next two VIRify files exist, otherwise skip this line \ + --vir_gff Virify_output_folder/08-final/gff/_virify.gff # only if file exists, otherwise skip this line \ + --vir_checkv Virify_output_folder/07-checkv/\*quality_summary.tsv # only if the GFF file above exists, otherwise skip this line \ + --outdir \ + --skip_crispr true \ + --skip_amr true \ + -profile " +``` + +3. 
Integrate the output into the `mettannotator` GFF + +```bash +# Add mobilome to the merged GFF produced by mettannotator +python3 postprocessing/add_mobilome_to_gff.py \ + -m /gff_output_files/mobilome_nogenes.gff \ + -i //functional_annotation/merged_gff/_annotations.gff \ + -o _annotations_with_mobilome.gff + +# Add mobilome to the GFF with descriptions produced by mettannotator +python3 postprocessing/add_mobilome_to_gff.py \ + -m /gff_output_files/mobilome_nogenes.gff \ + -i //functional_annotation/merged_gff/_annotations_with_descriptions.gff \ + -o _annotations_with_descriptions_with_mobilome.gff +``` + +4. Optional: regenerate the Circos plot with the mobilome track added + +```bash +pip install pycirclize +pip install matplotlib + +python3 bin/circos_plot.py \ + -i _annotations_with_mobilome.gff \ + -o plot.png \ + -p \ + --mobilome +``` diff --git a/bin/add_hypothetical_protein_descriptions.py b/bin/add_hypothetical_protein_descriptions.py index a4291fc6..a63d7e77 100755 --- a/bin/add_hypothetical_protein_descriptions.py +++ b/bin/add_hypothetical_protein_descriptions.py @@ -30,7 +30,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile) ipr_info, ipr_memberdb_only, ipr_leveled_info = load_ipr( ipr_file, ipr_types, levels ) - + gene_caller = "Prokka" fasta_flag = False with open(infile, "r") as file_in, open(outfile, "w") as file_out: for line in file_in: @@ -54,6 +54,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile) eggnog_info, ipr_info, ipr_memberdb_only, + gene_caller, ) if not function_source == "UniFIRE": @@ -64,6 +65,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile) found_function, function_source, attributes_dict, + gene_caller ) ) found_function = escape_reserved_characters(found_function) @@ -73,7 +75,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile) ) else: attributes_dict = insert_product_source( - 
attributes_dict, "Prokka" + attributes_dict, gene_caller ) col9_updated = update_col9(attributes_dict) file_out.write( @@ -96,11 +98,13 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile) file_out.write(line) else: file_out.write(line) + if "Bakta" in line: + gene_caller = "Bakta" else: file_out.write(line) -def keep_or_move_to_note(found_function, function_source, col9_dict): +def keep_or_move_to_note(found_function, function_source, col9_dict, gene_caller): """ Function aims to identify if a description is likely to be a sentence/paragraph rather than a succinct function description. If it's the former, move it to note and revert function to @@ -172,12 +176,12 @@ def keep_or_move_to_note(found_function, function_source, col9_dict): if move_to_note: col9_dict = move_function_to_note(found_function, col9_dict) found_function = "hypothetical protein" - function_source = "Prokka" + function_source = gene_caller else: # Product is too long, move to note col9_dict = move_function_to_note(found_function, col9_dict) found_function = "hypothetical protein" - function_source = "Prokka" + function_source = gene_caller return found_function, function_source, col9_dict @@ -248,7 +252,7 @@ def insert_product_source(my_dict, source): ) -def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only): +def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only, gene_caller): """ Identify function by carrying it over from a db match. 
The following priority is used: Priority 1: UniFIRE protein recommended full name @@ -306,7 +310,7 @@ def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only return func_description, source if acc in eggnog_annot: return eggnog_annot[acc], "eggNOG" - return "hypothetical protein", "Prokka" + return "hypothetical protein", gene_caller def get_description_and_source(my_dict, ipr_type): diff --git a/bin/annotate_gff.py b/bin/annotate_gff.py index 3fc367b5..4acf8476 100755 --- a/bin/annotate_gff.py +++ b/bin/annotate_gff.py @@ -456,9 +456,12 @@ def load_annotations( line = line.replace("db_xref", "Dbxref") cols = line.split("\t") if len(cols) == 9: - contig, feature, start, annot = cols[0], cols[2], cols[3], cols[8] + contig, caller, feature, start, annot = cols[0], cols[1], cols[2], cols[3], cols[8] if feature != "CDS": - continue + if caller == "Bakta" and feature == "region": + main_gff.setdefault(contig, dict()).setdefault(int(start), list()).append(line) + else: + continue protein = annot.split(";")[0].split("=")[-1] added_annot[protein] = {} try: diff --git a/bin/circos_plot.py b/bin/circos_plot.py new file mode 100755 index 00000000..981a3a20 --- /dev/null +++ b/bin/circos_plot.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +import argparse +import logging +import sys + +from pycirclize import Circos +from pycirclize.parser import Gff + +from matplotlib.patches import Patch + +logging.basicConfig(level=logging.INFO) + + +def main(infile, outfile, prefix, contig_num_limit, contig_trim, mobilome, dpi): + modified_infile = remove_escaped_characters(infile) + gff = Gff(modified_infile) + seqid2size = gff.get_seqid2size() + if len(seqid2size) > contig_num_limit: + logging.info( + "Skipping plot generation for file {} due to a large number of contigs: {}. 
" + "Plots are only generated for genomes with up to {} annotated contigs.".format( + infile, len(seqid2size), contig_num_limit + ) + ) + sys.exit() + + seqid2features = gff.get_seqid2features(feature_type=None) + + circos = Circos(seqid2size, space=1, start=1, end=358) + + circos.text("{}\n".format(prefix), size=15, r=30) + + # Skip printing contig names if the names are too long + print_contigs = True + if contig_trim == 500: # the user didn't choose truncation, check lengths + for sector in circos.sectors: + if len(sector.name[:contig_trim]) > 24: + print_contigs = False + if not print_contigs: + logging.info("Not printing contig labels because they are too long. Rerun the script with the " + "--contig-trim flag to truncate the labels if you would like them printed.") + for sector in circos.sectors: + if print_contigs: + # Plot contig labels + sector.text( + sector.name[:contig_trim], orientation="vertical", r=110, size=6, color="dimgrey" + ) + # Plot scale + position_track = sector.add_track((99, 100)) + position_track.axis(fc="none") + major_ticks_interval = 500000 + minor_ticks_interval = 50000 + if sector.size > minor_ticks_interval: + if sector.size >= minor_ticks_interval * 10: + position_track.xticks_by_interval( + major_ticks_interval, label_formatter=lambda v: f"{v / 10 ** 6:.1f} Mb" + ) + else: + position_track.xticks_by_interval( + major_ticks_interval, show_label=False + ) + position_track.xticks_by_interval( + minor_ticks_interval, tick_length=1, show_label=False + ) + + # Initiate feature tracks + f_cds_track = sector.add_track((93, 98), r_pad_ratio=0.1) + f_cds_track.axis(fc="none", ec="none") + r_cds_track = sector.add_track((88, 93), r_pad_ratio=0.1) + r_cds_track.axis(fc="none", ec="none") + rna_track = sector.add_track((83, 87), r_pad_ratio=0.1) + rna_track.axis(fc="none", ec="none") + bgc_track_antismash = sector.add_track((78, 80), r_pad_ratio=0.1) + bgc_track_antismash.axis(fc="none", ec="tomato", ls="dashdot", lw=0.15) + bgc_track_gecco = 
sector.add_track((76, 78), r_pad_ratio=0.1) + bgc_track_gecco.axis(fc="none", ec="lightsalmon", ls="dashdot", lw=0.15) + bgc_track_sanntis = sector.add_track((74, 76), r_pad_ratio=0.1) + bgc_track_sanntis.axis(fc="none", ec="firebrick", ls="dashdot", lw=0.15) + dbcan_track = sector.add_track((68, 70), r_pad_ratio=0.1) + dbcan_track.axis(fc="none", ec="forestgreen", ls="dashdot", lw=0.15) + amr_track = sector.add_track((62, 64), r_pad_ratio=0.1) + amr_track.axis(fc="none", ec="dodgerblue", ls="dashdot", lw=0.15) + antiphage_track = sector.add_track((56, 58), r_pad_ratio=0.1) + antiphage_track.axis(fc="none", ec="orchid", ls="dashdot", lw=0.15) + if mobilome: + mobilome_track = sector.add_track((50, 52), r_pad_ratio=0.1) + mobilome_track.axis(fc="none", ec="lightseagreen", ls="dashdot", lw=0.15) + + for feature in seqid2features[sector.name]: + if feature.type == "CDS": + if feature.strand == 1: + f_cds_track.genomic_features([feature], fc="hotpink") + else: + r_cds_track.genomic_features([feature], fc="steelblue") + if "antismash_bgc_function" in feature.qualifiers: + bgc_track_antismash.genomic_features([feature], fc="tomato") + if "gecco_bgc_type" in feature.qualifiers: + bgc_track_gecco.genomic_features([feature], fc="lightsalmon") + if "nearest_MiBIG" in feature.qualifiers: + bgc_track_sanntis.genomic_features([feature], fc="firebrick") + if "dbcan_prot_type" in feature.qualifiers: + dbcan_track.genomic_features([feature], fc="forestgreen") + if "amrfinderplus_scope" in feature.qualifiers: + amr_track.genomic_features([feature], fc="dodgerblue") + if "defense_finder_type" in feature.qualifiers: + antiphage_track.genomic_features([feature], fc="orchid") + + elif feature.type in ["tRNA", "ncRNA"]: + rna_track.genomic_features([feature], fc="darkmagenta") + elif mobilome and feature.type in [ + "mobility_island", + "cellular_recombinase", + "insertion_sequence", + "conjugative_element", + "conjugative_integron", + "integron", + "plasmid", + "nested_mobile_element", 
+ "terminal_inverted_repeat_element", + "direct_repeat_element", + "viral_sequence", + ]: + mobilome_track.genomic_features([feature], fc="lightseagreen") + elif mobilome and feature.type.lower() in ["phage", "prophage"]: + mobilome_track.genomic_features([feature], fc="blue") + + fig = circos.plotfig() + # Add legend + handles = [ + Patch(color="hotpink", label="Forward CDS"), + Patch(color="steelblue", label="Reverse CDS"), + Patch(color="darkmagenta", label="RNA"), + Patch(color="tomato", label="BGCs (antiSMASH)"), + Patch(color="lightsalmon", label="BGCs (GECCO)"), + Patch(color="firebrick", label="BGCs (SanntiS)"), + Patch(color="forestgreen", label="Predicted PULs"), + Patch(color="dodgerblue", label="AMR genes"), + Patch(color="orchid", label="Anti-phage defense genes"), + ] + if mobilome: + handles = handles + [ + Patch(color="blue", label="Mobilome (phage)"), + Patch(color="lightseagreen", label="Mobilome (other)"), + ] + + main_legend = circos.ax.legend( + handles=handles, bbox_to_anchor=(0.5, 0.475), loc="center", fontsize=8 + ) + circos.ax.add_artist(main_legend) + fig.savefig(outfile, dpi=dpi) + + +def remove_escaped_characters(infile): + outfile = infile + "_modified" + with open(infile, "r") as file_in: + content = file_in.read() + modified_content = content.replace("\\=", "") + + # Write the modified content into a file + with open(outfile, "w") as file_out: + file_out.write(modified_content) + return outfile + + +def parse_args(): + parser = argparse.ArgumentParser(description="Script for Circos plot generation.") + parser.add_argument( + "-i", "--infile", required=True, help="Path to the GFF file to plot" + ) + parser.add_argument( + "-o", "--outfile", required=True, help="Path to the output file" + ) + parser.add_argument( + "-p", "--prefix", required=True, help="Prefix to use for the genome" + ) + parser.add_argument( + "-l", + "--limit", + required=False, + type=int, + default=50, + help="Only generate a plot if the genome has no more than this 
number of contigs. Limit introduced because " + "highly fragmented genomes do not produce readable plots. Default: 50.", + ) + parser.add_argument( + "--contig-trim", + required=False, + default=500, + type=int, + help="If the contig length is over 24 characters long, contig names will not be printed on the plot. Specify " + "the length to trim the contig names down to if you would like the shorter names printed.", + ) + parser.add_argument( + "--mobilome", + required=False, + action="store_true", + default=False, + help="Plot the mobilome track. Default: False", + ) + parser.add_argument( + "--dpi", + required=False, + type=int, + default=600, + help="Specify the dpi for the plot. Default: 600", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args.infile, args.outfile, args.prefix, args.limit, args.contig_trim, args.mobilome, args.dpi) diff --git a/bin/identify_kingdom.py b/bin/identify_kingdom.py index 1df6c2ec..78403789 100755 --- a/bin/identify_kingdom.py +++ b/bin/identify_kingdom.py @@ -17,6 +17,7 @@ import argparse import logging +import os import sys import requests @@ -25,7 +26,7 @@ logging.basicConfig(level=logging.INFO) -def main(taxid, outfile): +def main(taxid, include_kingdom, outfile): kingdom = "Bacteria" if not taxid.isdigit(): sys.exit("Invalid format of taxid {}".format(taxid)) @@ -56,6 +57,10 @@ def main(taxid, outfile): ) logging.info("Reporting kingdom {} for taxid {}".format(kingdom, taxid)) if outfile: + if include_kingdom: + outfile_path, outfile_name = os.path.split(outfile) + outfile_new_name = "{}_{}".format(kingdom, outfile_name) + outfile = os.path.join(outfile_path, outfile_new_name) with open(outfile, "w") as file_out: file_out.write(kingdom) else: @@ -83,13 +88,19 @@ def parse_args(): required=True, help="Taxid to identify the kingdom for.", ) + parser.add_argument( + "--include-kingdom", + action='store_true', + help="Add the kingdom name at the beginning of the file name.", + ) 
parser.add_argument( "-o", dest="outfile", required=False, help=( - "Path to the output file where the result will be saved. If none specified, script will print the output " - "as STDOUT." + "Path to the output file (with file name) where the result will be saved. If the --include-kingdom option " + "was specified, the kingdom will be added to the beginning of the chosen output file name. " + "If the output file is not specified, the script will print the output as STDOUT." ), ) return parser.parse_args() @@ -99,5 +110,6 @@ def parse_args(): args = parse_args() main( args.taxid, + args.include_kingdom, args.outfile, ) diff --git a/bin/prepare_unifire_input.py b/bin/prepare_unifire_input.py index 06744251..56b56c5d 100755 --- a/bin/prepare_unifire_input.py +++ b/bin/prepare_unifire_input.py @@ -53,6 +53,7 @@ def check_dir(directory_path): def reformat_line(line, taxid): line = line.lstrip(">").strip() id, description = line.split(maxsplit=1) + description = description.replace("\"", "").replace("'", "").replace("‘", "").replace("’", "") formatted_line = ">tr|{id}|{description} OX={taxid}\n".format( id=id, description=description, taxid=taxid ) diff --git a/bin/process_unifire_output.py b/bin/process_unifire_output.py index 13899082..b4df861f 100755 --- a/bin/process_unifire_output.py +++ b/bin/process_unifire_output.py @@ -208,17 +208,17 @@ def load_unirule_arba(file): if not line.startswith("Evidence"): parts = line.strip().split("\t") if len(parts) == 6: - pass # skip feature lines with start and end coordinates + pass # don't do anything to feature lines with start and end coordinates else: evidence, protein_id, annot_type, value = line.strip().split("\t") - if ( - not annot_type.startswith("comment") - and not annot_type.startswith("protein.domain") - and not annot_type.startswith("protein.component") - ): - results_dict.setdefault(protein_id, dict()).setdefault( - annot_type, list() - ).append(value) + if ( + not annot_type.startswith("comment") + and not 
annot_type.startswith("protein.domain") + and not annot_type.startswith("protein.component") + ): + results_dict.setdefault(protein_id, dict()).setdefault( + annot_type, list() + ).append(value) for record in results_dict: if "keyword" in results_dict[record]: if len(results_dict[record]["keyword"]) > 1: diff --git a/conf/base.config b/conf/base.config index 5e4440cc..2f1672ec 100644 --- a/conf/base.config +++ b/conf/base.config @@ -79,8 +79,8 @@ process { memory = { check_max( 12.GB * task.attempt, 'memory' ) } } withName: UNIFIRE { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 20.GB * task.attempt, 'memory' ) } + cpus = { check_max( 8 , 'cpus' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } } withName: DBCAN { cpus = { check_max( 8 * task.attempt, 'cpus' ) } diff --git a/conf/ebi_codon.config b/conf/ebi_codon.config index 292b09b7..7224d68d 100644 --- a/conf/ebi_codon.config +++ b/conf/ebi_codon.config @@ -7,11 +7,13 @@ params { rfam_ncrna_models = "/hps/nobackup/rdf/metagenomics/service-team/production/ref-dbs/genomes-pipeline/rfam_14.9/ncrna_cms" - amrfinder_plus_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/amrfinderplus/3.11/2023-02-23.1" + amrfinder_plus_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/amrfinderplus/3.12/2024-01-31.1" defense_finder_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/defense-finder-models/1.2.3" antismash_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/antismash/7.1.x" dbcan_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/dbcan/4.1.3-V12" + + bakta_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bakta/2024-01-19" } diff --git a/conf/modules.config b/conf/modules.config index 71de031e..6b2e16a4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -194,6 +194,19 @@ process { ] } + withName: BAKTA_BAKTA { + publishDir = [ + path: { "${params.outdir}/${meta.prefix}/functional_annotation/bakta" }, + mode: 
params.publish_dir_mode, + saveAs: { filename -> + if ( filename.equals('versions.yml') ) { + return null; + } + return new File(filename).name; + } + ] + } + withName: DBCAN { publishDir = [ path: { "${params.outdir}/${meta.prefix}/functional_annotation/dbcan" }, @@ -220,6 +233,13 @@ process { ] } + withName: CIRCOS_PLOT { + publishDir = [ + path: { "${params.outdir}/${meta.prefix}/functional_annotation/merged_gff" }, + mode: params.publish_dir_mode, + ] + } + /* ~~~~~~~~~~~~~~~~~~~~~~ Antiphage defense diff --git a/media/circos-plot-example.png b/media/circos-plot-example.png new file mode 100644 index 00000000..648684d6 Binary files /dev/null and b/media/circos-plot-example.png differ diff --git a/media/mettannotator-product.png b/media/mettannotator-product.png index 1fd5a798..23d40d6f 100644 Binary files a/media/mettannotator-product.png and b/media/mettannotator-product.png differ diff --git a/media/mettannotator-schema.png b/media/mettannotator-schema.png index e3c90401..4f3f9169 100644 Binary files a/media/mettannotator-schema.png and b/media/mettannotator-schema.png differ diff --git a/modules.json b/modules.json index d8648f19..ec7c7e7f 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,12 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bakta/bakta": { + "branch": "master", + "git_sha": "9d0f89b445e1f5b2fb30476f4be9a8b519c07846", + "installed_by": ["modules"], + "patch": "modules/nf-core/bakta/bakta/bakta-bakta.diff" + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", diff --git a/modules/local/amrfinder_plus.nf b/modules/local/amrfinder_plus.nf index a8fab2ae..7d8d1832 100644 --- a/modules/local/amrfinder_plus.nf +++ b/modules/local/amrfinder_plus.nf @@ -2,7 +2,7 @@ process AMRFINDER_PLUS { tag "${meta.prefix}" - container 'quay.io/biocontainers/ncbi-amrfinderplus:3.11.4--h6e70893_0' + container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' 
input: tuple val(meta), path(fna), path(faa), path(gff) @@ -52,7 +52,7 @@ process AMRFINDER_PLUS_TO_GFF { process_amrfinderplus_results.py \\ -i ${amrfinder_tsv} \\ -o ${meta.prefix}_amrfinderplus.gff \\ - -v 3.11.4 + -v 3.12.8 cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/amrfinder_plus_getdb.nf b/modules/local/amrfinder_plus_getdb.nf index d961b503..42bc9712 100644 --- a/modules/local/amrfinder_plus_getdb.nf +++ b/modules/local/amrfinder_plus_getdb.nf @@ -1,24 +1,24 @@ process AMRFINDER_PLUS_GETDB { - tag "AMR Finder DB 2023-02-23.1" + tag "AMR Finder DB 2024-01-31.1" - container 'quay.io/biocontainers/ncbi-amrfinderplus:3.11.4--h6e70893_0' + container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' publishDir "$params.dbs", mode: 'copy' output: - tuple path("amrfinder", type: "dir"), val("2023-02-23.1"), emit: amrfinder_plus_db + tuple path("amrfinder", type: "dir"), val("2024-01-31.1"), emit: amrfinder_plus_db script: """ wget -r -nH --cut-dirs=5 \\ - ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.11/2023-02-23.1/ + ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.12/2024-01-31.1/ - amrfinder_index 2023-02-23.1 + amrfinder_index 2024-01-31.1 - mv 2023-02-23.1 amrfinder + mv 2024-01-31.1 amrfinder - echo "2023-02-23.1" > amrfinder/VERSION.txt + echo "2024-01-31.1" > amrfinder/VERSION.txt """ } diff --git a/modules/local/annotate_gff.nf b/modules/local/annotate_gff.nf index 62141f12..80f1d9cb 100644 --- a/modules/local/annotate_gff.nf +++ b/modules/local/annotate_gff.nf @@ -30,7 +30,6 @@ process ANNOTATE_GFF { tuple val(meta), path("*_annotations_with_descriptions.gff"), emit: annotated_desc_gff path "versions.yml", emit: versions - // For the version, I'm using the latest stable the genomes-annotation pipeline script: def sanntis_flag = ""; def crisprcas_flag = ""; diff --git a/modules/local/bakta_getdb.nf b/modules/local/bakta_getdb.nf 
new file mode 100644 index 00000000..bc219d4b --- /dev/null +++ b/modules/local/bakta_getdb.nf @@ -0,0 +1,30 @@ + +process BAKTA_GETDB { + + tag "BAKTA DB 2024-01-19" + + container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' + + publishDir "$params.dbs", mode: 'copy' + + output: + tuple path("bakta", type: "dir"), val("2024-01-19"), emit: bakta_db + + script: + """ + wget https://zenodo.org/record/10522951/files/db.tar.gz + tar -xzf db.tar.gz + rm db.tar.gz + mv db bakta + wget -r -nH --cut-dirs=5 \\ + ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.12/2024-01-31.1/ + rm -r bakta/amrfinderplus-db/* + mv 2024-01-31.1 bakta/amrfinderplus-db/ + + amrfinder_index bakta/amrfinderplus-db/2024-01-31.1 + + mv bakta/amrfinderplus-db/2024-01-31.1/ bakta/amrfinderplus-db/latest/ + + echo "2024-01-19" > bakta/VERSION.txt + """ +} diff --git a/modules/local/circos_plot.nf b/modules/local/circos_plot.nf new file mode 100644 index 00000000..36e85585 --- /dev/null +++ b/modules/local/circos_plot.nf @@ -0,0 +1,35 @@ +process CIRCOS_PLOT { + tag "${meta.prefix}" + + label 'process_nano' + + container 'quay.io/microbiome-informatics/pycirclize:1.4.0' + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.png"), emit: circos_plot, optional: true + path "versions.yml", emit: versions + + script: + """ + circos_plot.py -p ${meta.prefix} -i ${gff} -o ${meta.prefix}_plot.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pyCirclize: \$(python -c "import pycirclize; print(pycirclize.__version__)") + END_VERSIONS + """ + + stub: + """ + touch ${meta.prefix}_plot.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pyCirclize: \$(python -c "import pycirclize; print(pycirclize.__version__)") + END_VERSIONS + + """ +} diff --git a/modules/local/detect_trna.nf b/modules/local/detect_trna.nf index 4f9fc630..a4410604 100644 --- a/modules/local/detect_trna.nf +++ 
b/modules/local/detect_trna.nf @@ -8,7 +8,7 @@ process DETECT_TRNA { container 'quay.io/microbiome-informatics/genomes-pipeline.detect_rrna:v3.2' input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta), path(detected_kingdom) output: tuple val(meta), path('*_trna.out'), emit: trna_out @@ -28,9 +28,11 @@ process DETECT_TRNA { # bash trap to clean the tmp directory trap 'rm -r -- "\${PROCESSTMP}"' EXIT + kingdom_val="\$(head -c1 ${detected_kingdom})" + echo "[ Detecting tRNAs ]" - tRNAscan-SE -B -Q \ + tRNAscan-SE -\${kingdom_val} -Q \ -m ${meta.prefix}_stats.out \ -o ${meta.prefix}_trna.out \ --gff ${meta.prefix}_trna.gff \ diff --git a/modules/local/lookup_kingdom.nf b/modules/local/lookup_kingdom.nf index fa8e53c1..b6a2ed7f 100644 --- a/modules/local/lookup_kingdom.nf +++ b/modules/local/lookup_kingdom.nf @@ -10,17 +10,17 @@ process LOOKUP_KINGDOM { tuple val(meta), path(fasta) output: - tuple val(meta), path("${meta.prefix}_kingdom.txt"), emit: detected_kingdom + tuple val(meta), path("*.txt"), emit: detected_kingdom // For the version, I'm using the latest stable the genomes-annotation pipeline script: """ - identify_kingdom.py -t ${meta.taxid} -o ${meta.prefix}_kingdom.txt + identify_kingdom.py -t ${meta.taxid} --include-kingdom -o ${meta.prefix}_kingdom.txt """ stub: """ - touch ${meta.prefix}_kingdom.txt + touch Bacteria.txt """ } diff --git a/modules/local/prokka.nf b/modules/local/prokka.nf index 298c811f..8f9b1f09 100644 --- a/modules/local/prokka.nf +++ b/modules/local/prokka.nf @@ -5,7 +5,7 @@ process PROKKA { container "quay.io/biocontainers/prokka:1.14.6--pl526_0" input: - tuple val(meta), path(fasta), path(detected_kingdom) + tuple val(meta), path(fasta), val(detected_kingdom) output: tuple val(meta), file("${meta.prefix}_prokka/${meta.prefix}.gff"), emit: gff @@ -20,11 +20,9 @@ process PROKKA { """ cat ${fasta} | tr '-' ' ' > ${meta.prefix}_cleaned.fasta - kingdom_val="\$(cat ${detected_kingdom})" - prokka ${meta.prefix}_cleaned.fasta \ 
--cpus ${task.cpus} \ - --kingdom \${kingdom_val} \ + --kingdom ${detected_kingdom} \ --outdir ${meta.prefix}_prokka \ --prefix ${meta.prefix} \ --force \ diff --git a/modules/local/sanntis.nf b/modules/local/sanntis.nf index cd63fdf9..5aa83c1a 100644 --- a/modules/local/sanntis.nf +++ b/modules/local/sanntis.nf @@ -15,14 +15,17 @@ process SANNTIS { path "versions.yml" , emit: versions script: + def genbank_extension = prokka_gbk.extension if (interproscan_tsv.extension == "gz") { """ gunzip -c ${interproscan_tsv} > interproscan.tsv + # Workaround implemented for SanntiS due to discrepancies in Bakta's format, resulting in empty output files. + grep -v "/protein_id=" ${prokka_gbk} > ${meta.prefix}_prepped.${genbank_extension} sanntis \ --ip-file interproscan.tsv \ --outfile ${meta.prefix}_sanntis.gff \ --cpu ${task.cpus} \ - ${prokka_gbk} + ${meta.prefix}_prepped.${genbank_extension} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -31,10 +34,11 @@ process SANNTIS { """ } else { """ + grep -v "/protein_id=" ${prokka_gbk} > ${meta.prefix}_prepped.${genbank_extension} sanntis \ --ip-file ${interproscan_tsv} \ --outfile ${meta.prefix}_sanntis.gff \ - ${prokka_gbk} + ${meta.prefix}_prepped.${genbank_extension} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/unifire.nf b/modules/local/unifire.nf index 1e718eac..41da5673 100644 --- a/modules/local/unifire.nf +++ b/modules/local/unifire.nf @@ -2,6 +2,8 @@ process UNIFIRE { tag "${meta.prefix}" + label 'error_retry' + container "dockerhub.ebi.ac.uk/uniprot-public/unifire:2023.4" containerOptions "--bind unifire:/volume" diff --git a/modules/nf-core/bakta/bakta/bakta-bakta.diff b/modules/nf-core/bakta/bakta/bakta-bakta.diff new file mode 100644 index 00000000..0d2045fa --- /dev/null +++ b/modules/nf-core/bakta/bakta/bakta-bakta.diff @@ -0,0 +1,78 @@ +Changes in module 'nf-core/bakta/bakta' +--- /dev/null ++++ modules/nf-core/bakta/bakta/modules.json +@@ -0,0 +1,5 @@ ++{ ++ "name": 
"", ++ "homePage": "", ++ "repos": {} ++} + +--- modules/nf-core/bakta/bakta/main.nf ++++ modules/nf-core/bakta/bakta/main.nf +@@ -1,5 +1,5 @@ + process BAKTA_BAKTA { +- tag "$meta.id" ++ tag "$meta.prefix" + label 'process_medium' + + conda "${moduleDir}/environment.yml" +@@ -8,22 +8,22 @@ + 'biocontainers/bakta:1.9.3--pyhdfd78af_0' }" + + input: +- tuple val(meta), path(fasta) +- path db +- path proteins +- path prodigal_tf ++ tuple val(meta), path(fasta), path(detected_kingdom) ++ tuple path(db), val(db_version) + + output: + tuple val(meta), path("${prefix}.embl") , emit: embl + tuple val(meta), path("${prefix}.faa") , emit: faa + tuple val(meta), path("${prefix}.ffn") , emit: ffn + tuple val(meta), path("${prefix}.fna") , emit: fna +- tuple val(meta), path("${prefix}.gbff") , emit: gbff ++ tuple val(meta), path("${prefix}.gbff") , emit: gbk + tuple val(meta), path("${prefix}.gff3") , emit: gff + tuple val(meta), path("${prefix}.hypotheticals.tsv"), emit: hypotheticals_tsv + tuple val(meta), path("${prefix}.hypotheticals.faa"), emit: hypotheticals_faa + tuple val(meta), path("${prefix}.tsv") , emit: tsv + tuple val(meta), path("${prefix}.txt") , emit: txt ++ tuple val(meta), path("${prefix}.svg") , emit: svg ++ tuple val(meta), path("${prefix}.png") , emit: png + path "versions.yml" , emit: versions + + when: +@@ -31,18 +31,22 @@ + + script: + def args = task.ext.args ?: '' +- prefix = task.ext.prefix ?: "${meta.id}" +- def proteins_opt = proteins ? "--proteins ${proteins[0]}" : "" +- def prodigal_tf = prodigal_tf ? 
"--prodigal-tf ${prodigal_tf[0]}" : "" ++ prefix = task.ext.prefix ?: "${meta.prefix}" + """ + bakta \\ + $fasta \\ + $args \\ + --threads $task.cpus \\ + --prefix $prefix \\ +- $proteins_opt \\ +- $prodigal_tf \\ +- --db $db ++ --locus-tag $prefix \\ ++ --db $db \\ ++ --keep-contig-headers \\ ++ --skip-trna \\ ++ --skip-tmrna \\ ++ --skip-rrna \\ ++ --skip-ncrna \\ ++ --skip-ncrna-region \\ ++ --skip-crispr + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/bakta/bakta/environment.yml b/modules/nf-core/bakta/bakta/environment.yml new file mode 100644 index 00000000..efb92265 --- /dev/null +++ b/modules/nf-core/bakta/bakta/environment.yml @@ -0,0 +1,7 @@ +name: bakta_bakta +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bakta=1.9.3 diff --git a/modules/nf-core/bakta/bakta/main.nf b/modules/nf-core/bakta/bakta/main.nf new file mode 100644 index 00000000..546b9cea --- /dev/null +++ b/modules/nf-core/bakta/bakta/main.nf @@ -0,0 +1,76 @@ +process BAKTA_BAKTA { + tag "$meta.prefix" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bakta:1.9.3--pyhdfd78af_0' : + 'biocontainers/bakta:1.9.3--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta), val(detected_kingdom) + tuple path(db), val(db_version) + + output: + tuple val(meta), path("${prefix}.embl") , emit: embl + tuple val(meta), path("${prefix}.faa") , emit: faa + tuple val(meta), path("${prefix}.ffn") , emit: ffn + tuple val(meta), path("${prefix}.fna") , emit: fna + tuple val(meta), path("${prefix}.gbff") , emit: gbk + tuple val(meta), path("${prefix}.gff3") , emit: gff + tuple val(meta), path("${prefix}.hypotheticals.tsv"), emit: hypotheticals_tsv + tuple val(meta), path("${prefix}.hypotheticals.faa"), emit: hypotheticals_faa + tuple val(meta), path("${prefix}.tsv") , emit: tsv + tuple val(meta), path("${prefix}.txt") , emit: txt + tuple val(meta), path("${prefix}.svg") , emit: svg + tuple val(meta), path("${prefix}.png") , emit: png + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.prefix}" + """ + bakta \\ + $fasta \\ + $args \\ + --threads $task.cpus \\ + --prefix $prefix \\ + --locus-tag $prefix \\ + --db $db \\ + --keep-contig-headers \\ + --skip-trna \\ + --skip-tmrna \\ + --skip-rrna \\ + --skip-ncrna \\ + --skip-ncrna-region \\ + --skip-crispr + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bakta: \$(echo \$(bakta --version) 2>&1 | cut -f '2' -d ' ') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.embl + touch ${prefix}.faa + touch ${prefix}.ffn + touch ${prefix}.fna + touch ${prefix}.gbff + touch ${prefix}.gff3 + touch ${prefix}.hypotheticals.tsv + touch ${prefix}.hypotheticals.faa + touch ${prefix}.tsv + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bakta: \$(echo \$(bakta --version) 2>&1 | cut -f '2' -d ' ') + END_VERSIONS + """ +} diff --git 
a/modules/nf-core/bakta/bakta/meta.yml b/modules/nf-core/bakta/bakta/meta.yml new file mode 100644 index 00000000..c0e53e2a --- /dev/null +++ b/modules/nf-core/bakta/bakta/meta.yml @@ -0,0 +1,92 @@ +name: bakta_bakta +description: Annotation of bacterial genomes (isolates, MAGs) and plasmids +keywords: + - annotation + - fasta + - bacteria +tools: + - bakta: + description: Rapid & standardized annotation of bacterial genomes, MAGs & plasmids. + homepage: https://github.com/oschwengers/bakta + documentation: https://github.com/oschwengers/bakta + tool_dev_url: https://github.com/oschwengers/bakta + doi: "10.1099/mgen.0.000685" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + FASTA file to be annotated. Has to contain at least a non-empty string dummy value. + - db: + type: file + description: | + Path to the Bakta database. Must have amrfinderplus database directory already installed within it (in a directory called 'amrfinderplus-db/'). + - proteins: + type: file + description: FASTA/GenBank file of trusted proteins to first annotate from (optional) + - prodigal_tf: + type: file + description: Training file to use for Prodigal (optional) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: genome statistics and annotation summary + pattern: "*.txt" + - tsv: + type: file + description: annotations as simple human readble tab separated values + pattern: "*.tsv" + - gff: + type: file + description: annotations & sequences in GFF3 format + pattern: "*.gff3" + - gbff: + type: file + description: annotations & sequences in (multi) GenBank format + pattern: "*.gbff" + - embl: + type: file + description: annotations & sequences in (multi) EMBL format + pattern: "*.embl" + - fna: + type: file + description: replicon/contig DNA sequences as FASTA + pattern: "*.fna" + - faa: + type: file + description: CDS/sORF amino acid sequences as FASTA + pattern: "*.faa" + - ffn: + type: file + description: feature nucleotide sequences as FASTA + pattern: "*.ffn" + - hypotheticals_tsv: + type: file + description: additional information on hypothetical protein CDS as simple human readble tab separated values + pattern: "*.hypotheticals.tsv" + - hypotheticals_faa: + type: file + description: hypothetical protein CDS amino acid sequences as FASTA + pattern: "*.hypotheticals.faa" +authors: + - "@rpetit3" + - "@oschwengers" + - "@jfy133" +maintainers: + - "@rpetit3" + - "@oschwengers" + - "@jfy133" diff --git a/modules/nf-core/bakta/bakta/tests/main.nf.test b/modules/nf-core/bakta/bakta/tests/main.nf.test new file mode 100644 index 00000000..bdceb16e --- /dev/null +++ b/modules/nf-core/bakta/bakta/tests/main.nf.test @@ -0,0 +1,83 @@ +nextflow_process { + + name "Test Process BAKTA_BAKTA" + script "../main.nf" + config "./nextflow.config" + process "BAKTA_BAKTA" + + tag "modules" + tag "modules_nfcore" + tag "bakta" + tag "bakta/bakta" + tag "bakta/baktadbdownload" + + test("Bakta - bacteroides_fragilis - genome.fasta") { + + setup { + run("BAKTA_BAKTADBDOWNLOAD") { + script "../../baktadbdownload/main.nf" + 
process { + """ + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true) + ] + input[1] = BAKTA_BAKTADBDOWNLOAD.out.db + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.embl.get(0).get(1)).text.contains("/translation=\"MKNTLKIAILLIAIISMGHWMPVKQVCDLNSLSLQNVEALANGET") }, + { assert path(process.out.faa.get(0).get(1)).text.contains("MKNTLKIAILLIAIISMGHWMPVKQVCDLNSLSLQNVEALANGETPNYTFCIGAGSVDCPIQHDKVKYVSQGFSLDY") }, + { assert path(process.out.ffn.get(0).get(1)).text.contains("ATGAAAAACACTTTAAAAATAGCTATTCTTCTTATTGCTATTATTTCTATGGGGCATTGGATGCCTGTAAAACAAGT") }, + { assert path(process.out.fna.get(0).get(1)).text.contains("TCTTTTTACTCATAATCTACTTTTATGATGTTAATTATTTTTTCCGTGTCTCTCTTTCGG") }, + { assert path(process.out.gbff.get(0).get(1)).text.contains("/translation=\"MKNTLKIAILLIAIISMGHWMPVKQVCDLNSLSLQNVEALANGET") }, + { assert path(process.out.gff.get(0).get(1)).text.contains("##sequence-region contig_1 1 2926") }, + { assert path(process.out.hypotheticals_tsv.get(0).get(1)).text.contains("#Annotated with Bakta") }, + { assert path(process.out.hypotheticals_faa.get(0).get(1)).text.contains("MKNLILVLGCFFFLISCQQTEKEKLEELVKNWNGKEVLL") }, + { assert path(process.out.tsv.get(0).get(1)).text.contains("SO:0001217, UniRef:UniRef50_A0A0I9S7A3") }, + { assert path(process.out.txt.get(0).get(1)).text.contains("Length: 1739120") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("Bakta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[id: 'stub'],file('stub')] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git 
a/modules/nf-core/bakta/bakta/tests/main.nf.test.snap b/modules/nf-core/bakta/bakta/tests/main.nf.test.snap new file mode 100644 index 00000000..40e30c36 --- /dev/null +++ b/modules/nf-core/bakta/bakta/tests/main.nf.test.snap @@ -0,0 +1,191 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,f8b70ceb2a328c25a190699384e6152d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-14T09:11:06.657602394" + }, + "Bakta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "stub" + }, + "stub.embl:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "stub" + }, + "stub.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + "versions.yml:md5,f8b70ceb2a328c25a190699384e6152d" + ], + "2": [ + [ + { + "id": "stub" + }, + "stub.ffn:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "stub" + }, + "stub.fna:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "stub" + }, + "stub.gbff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "stub" + }, + "stub.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "stub" + }, + "stub.hypotheticals.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "stub" + }, + "stub.hypotheticals.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "stub" + }, + "stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + [ + { + "id": "stub" + }, + "stub.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "embl": [ + [ + { + "id": "stub" + }, + "stub.embl:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "faa": [ + [ + { + "id": "stub" + }, + "stub.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "ffn": [ + [ + { + "id": "stub" + }, + "stub.ffn:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fna": [ + [ + { + "id": "stub" + }, + "stub.fna:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gbff": [ + [ + { + "id": "stub" + }, + 
"stub.gbff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gff": [ + [ + { + "id": "stub" + }, + "stub.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "hypotheticals_faa": [ + [ + { + "id": "stub" + }, + "stub.hypotheticals.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "hypotheticals_tsv": [ + [ + { + "id": "stub" + }, + "stub.hypotheticals.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tsv": [ + [ + { + "id": "stub" + }, + "stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "txt": [ + [ + { + "id": "stub" + }, + "stub.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,f8b70ceb2a328c25a190699384e6152d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-14T09:11:15.532858932" + } +} \ No newline at end of file diff --git a/modules/nf-core/bakta/bakta/tests/nextflow.config b/modules/nf-core/bakta/bakta/tests/nextflow.config new file mode 100644 index 00000000..9af0dde1 --- /dev/null +++ b/modules/nf-core/bakta/bakta/tests/nextflow.config @@ -0,0 +1,11 @@ +process { + + withName: 'BAKTA_BAKTADBDOWNLOAD' { + ext.args = "--type light" + } + + withName: 'BAKTA_BAKTA' { + memory = 7.GB + } + +} diff --git a/modules/nf-core/bakta/bakta/tests/tags.yml b/modules/nf-core/bakta/bakta/tests/tags.yml new file mode 100644 index 00000000..ecb08c45 --- /dev/null +++ b/modules/nf-core/bakta/bakta/tests/tags.yml @@ -0,0 +1,2 @@ +bakta/bakta: + - "modules/nf-core/bakta/bakta/**" diff --git a/nextflow.config b/nextflow.config index 022f325d..d8ac7435 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,7 +26,7 @@ params { eggnog_db_version = "5.0.2" amrfinder_plus_db = null - amrfinder_plus_db_version = "2023-02-23.1" + amrfinder_plus_db_version = "2024-01-31.1" defense_finder_db = null defense_finder_db_version = "1.2.3" @@ -40,6 +40,12 @@ params { rfam_ncrna_models = null rfam_ncrna_models_rfam_version = "14.9" + bakta_db = null + bakta_db_version = "2024-01-19" + + 
// Tool options + bakta = false + // MultiQC options multiqc_config = null multiqc_title = null @@ -261,12 +267,12 @@ dag { manifest { name = 'ebi-metagenomics/mettannotator' - author = """@mberacochea""" + author = """@mberacochea,@tgurbich""" homePage = 'https://github.com/ebi-metagenomics/mettannotator' description = """ME TT assembly annotation pipeline""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0dev' + version = '1.1dev' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 2c190ead..5f90c718 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -108,7 +108,7 @@ }, "amrfinder_plus_db_version": { "type": "string", - "default": "2023-02-23.1", + "default": "2024-01-31.1", "description": "The AMRFinderPlus reference database version." }, "defense_finder_db": { @@ -143,6 +143,17 @@ "type": "string", "default": "4.1.3_V12", "description": "The dbCAN reference database version." + }, + "bakta_db": { + "type": "string", + "format": "directory-path", + "description": "Bakta reference database, please go to the documentation for the setup https://zenodo.org/records/10522951 and https://github.com/oschwengers/bakta?tab=readme-ov-file#database", + "help_text": "Set this variable to the path of the folder that contains the Bakta database directory with pre-indexed AMRFinderPlus db inside." + }, + "bakta_db_version": { + "type": "string", + "default": "2024-01-19", + "description": "The Bakta reference database version." } } }, @@ -305,5 +316,11 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "bakta": { + "type": "boolean", + "description": "Use Bakta instead of Prokka for CDS annotation. Prokka will still be used for archaeal genomes." 
+ } + } } diff --git a/postprocessing/add_mobilome_to_gff.py b/postprocessing/add_mobilome_to_gff.py index 24d60d06..aaa6ecec 100644 --- a/postprocessing/add_mobilome_to_gff.py +++ b/postprocessing/add_mobilome_to_gff.py @@ -157,7 +157,10 @@ def load_mobilome(infile): with open(infile, "r") as file_in: for line in file_in: if line.startswith("#"): - continue + if line.startswith("##FASTA"): + break + else: + continue contig, tool, feature, start, end, blank1, blank2, blank3, col9 = ( line.strip().split("\t") ) diff --git a/preprocessing/generate_input_file.py b/preprocessing/generate_input_file.py new file mode 100644 index 00000000..2b6215a8 --- /dev/null +++ b/preprocessing/generate_input_file.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import logging +import os +import sys + +logging.basicConfig(level=logging.INFO) + + +def main(infile, input_dir, bat_dir, outfile, no_prefix): + all_prefixes = list() + if not os.path.exists(infile): + sys.exit("Input file {} does not exist!".format(infile)) + if not os.path.exists(bat_dir): + sys.exit("BAT directory {} does not exist!".format(bat_dir)) + if not os.path.exists(input_dir): + sys.exit("Input directory {} does not exist!".format(input_dir)) + + with open(infile, "r") as f_in, open(outfile, "w") as f_out: + f_out.write("prefix,assembly,taxid\n") + for line in f_in: + line = line.strip() + if line.endswith(("gz", "zip")): + sys.exit( + "Fasta file {} is archived. Please provide only unarchived files.".format( + line + ) + ) + taxid = parse_taxid(line, bat_dir) + if not no_prefix: + prefix = os.path.splitext(line)[0] + if len(prefix) > 24: + prefix = prefix[:24] + if prefix in all_prefixes: + sys.exit( + "Prefix {} is being used multiple times. Cannot generate output. Either rename the FASTA " + "files or run the script with the --no-prefix option to fill them out manually.".format(prefix) + ) + all_prefixes.append(prefix) + full_path = os.path.join(input_dir, line) + if no_prefix: + f_out.write(",{},{}\n".format(full_path, taxid)) + else: + f_out.write("{},{},{}\n".format(prefix, full_path, taxid)) + logging.info("Saved results to {}".format(outfile)) + + +def parse_taxid(fasta_file, bat_dir): + bat_filename = os.path.splitext(fasta_file)[0] + ".bin2classification.txt" + bat_filepath = os.path.join(bat_dir, bat_filename) + if not os.path.exists(bat_filepath): + sys.exit("Bat file {} does not exist!".format(bat_filepath)) + with open(bat_filepath, "r") as f_in: + # skip header + f_in.readline() + taxid_string = f_in.readline().strip().split("\t")[3] + taxid = find_taxid_without_asterisk(taxid_string) + if not taxid: + taxid = "FILL IN MANUALLY" + logging.warning( + "No taxid found for taxid {}. 
If taxid is known, fill it out manually in the generated " + "output file.".format(taxid) + ) + return taxid + + +def find_taxid_without_asterisk(input_string): + elements = input_string.split(";") + elements.reverse() + for element in elements: + if not element.endswith("*"): + return element + return None + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + "The script takes a list of genomes and the taxonomy results generated by BAT and makes a mettannotator " + "input csv file. " + "The user has the option to either use the genome file name (minus the extension) as the " + "prefix for mettannotator or leave the prefix off and fill it out themselves after the script generates " + "an input file with just the FASTA location and the taxid. It is expected that for all genomes, BAT " + "results are stored in the same folder and are named as {fasta_base_name}.bin2classification.txt. " + "The script will use the lowest-level taxid without an asterisk as the taxid for the genome." + ) + ) + parser.add_argument( + "-i", + dest="infile", + required=True, + help="A file containing a list of genome files to include (file name only, with file extension, unzipped, " + "one file per line). ", + ) + parser.add_argument( + "-d", + dest="input_dir", + required=True, + help="Full path to the directory where the input FASTA files are located.", + ) + parser.add_argument( + "-b", + dest="bat_dir", + required=True, + help="Folder with BAT results. Results for all genomes should be in the same folder and should be named " + "{fasta_base_name}.bin2classification.txt", + ) + parser.add_argument( + "-o", + dest="outfile", + required=True, + help="Path to the file where the output will be saved to.", + ) + parser.add_argument( + "--no-prefix", + dest="no_prefix", + required=False, + action="store_true", + default=False, + help="Skip prefix generation and leave the first column of the output file empty for the user to fill out. 
" + "Defaule: False", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main( + args.infile, + args.input_dir, + args.bat_dir, + args.outfile, + args.no_prefix, + ) diff --git a/subworkflows/download_databases.nf b/subworkflows/download_databases.nf index 8960e5a8..699c2284 100644 --- a/subworkflows/download_databases.nf +++ b/subworkflows/download_databases.nf @@ -11,6 +11,7 @@ include { EGGNOG_MAPPER_GETDB } from '../modules/local/eggnog_getdb' include { INTEPROSCAN_GETDB } from '../modules/local/interproscan_getdb' include { INTEPRO_ENTRY_LIST_GETDB } from '../modules/local/interpro_list_getdb' include { RFAM_GETMODELS } from '../modules/local/rfam_getmodels' +include { BAKTA_GETDB } from '../modules/local/bakta_getdb' workflow DOWNLOAD_DATABASES { @@ -24,6 +25,7 @@ workflow DOWNLOAD_DATABASES { interproscan_db = channel.empty() interpro_entry_list = channel.empty() eggnog_db = channel.empty() + bakta_db = channel.empty() amrfinder_plus_dir = file("$params.dbs/amrfinder/") antismash_dir = file("$params.dbs/antismash") @@ -33,6 +35,7 @@ workflow DOWNLOAD_DATABASES { interpro_entry_list_dir = file("$params.dbs/interpro_entry_list/") eggnog_data_dir = file("$params.dbs/eggnog") rfam_ncrna_models = file("$params.dbs/rfam_models/rfam_ncrna_cms") + bakta_dir = file("$params.dbs/bakta") if (amrfinder_plus_dir.exists()) { amrfinder_plus_db = tuple( @@ -122,6 +125,17 @@ workflow DOWNLOAD_DATABASES { file("${rfam_ncrna_models}/VERSION.txt", checkIfExists: true).text ) } + + if (bakta_db.exists()) { + log.info("Bakta database exists, or at least the expected folders.") + bakta_db = tuple( + bakta_dir, + file("${bakta_dir}/VERSION.txt", checkIfExists: true).text + ) + } else { + BAKTA_GETDB() + bakta_db = BAKTA_GETDB.out.bakta_db.first() + } emit: amrfinder_plus_db = amrfinder_plus_db antismash_db = antismash_db @@ -131,5 +145,6 @@ workflow DOWNLOAD_DATABASES { interpro_entry_list = interpro_entry_list eggnog_db = eggnog_db 
rfam_ncrna_models = rfam_ncrna_models + bakta_db = bakta_db } diff --git a/workflows/mettannotator.nf b/workflows/mettannotator.nf index fe8ad259..bdf8d1bb 100644 --- a/workflows/mettannotator.nf +++ b/workflows/mettannotator.nf @@ -44,6 +44,7 @@ include { UNIFIRE } from '../modules/local/un include { ANNOTATE_GFF } from '../modules/local/annotate_gff' include { ANTISMASH } from '../modules/local/antismash' include { DBCAN } from '../modules/local/dbcan' +include { CIRCOS_PLOT } from '../modules/local/circos_plot' include { DOWNLOAD_DATABASES } from '../subworkflows/download_databases' @@ -52,6 +53,7 @@ include { DOWNLOAD_DATABASES } from '../subworkflows/dow IMPORT NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { BAKTA_BAKTA } from '../modules/nf-core/bakta/bakta/main' include { GECCO_RUN } from '../modules/nf-core/gecco/run/main' include { QUAST } from '../modules/nf-core/quast/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -98,7 +100,7 @@ workflow METTANNOTATOR { if (params.dbs) { // Download databases (if needed) // - DOWNLOAD_DATABASES() + DOWNLOAD_DATABASES(params.bakta) amrfinder_plus_db = DOWNLOAD_DATABASES.out.amrfinder_plus_db @@ -115,6 +117,10 @@ workflow METTANNOTATOR { eggnog_db = DOWNLOAD_DATABASES.out.eggnog_db rfam_ncrna_models = DOWNLOAD_DATABASES.out.rfam_ncrna_models + + if (params.bakta) { + bakta_db = DOWNLOAD_DATABASES.out.bakta_db + } } else { // Use the parametrized folders and files for the databases // amrfinder_plus_db = tuple( @@ -156,20 +162,66 @@ workflow METTANNOTATOR { file(params.rfam_ncrna_models, checkIfExists: true), params.rfam_ncrna_models_rfam_version ) + if (params.bakta) { + bakta_db = tuple( + file(params.bakta_db, checkIfExists: true), + params.bakta_db_version + ) + } } ch_versions = Channel.empty() assemblies = Channel.fromSamplesheet("input") + annotations_fna = channel.empty() + annotations_gbk = channel.empty() + 
annotations_faa = channel.empty() + annotations_gff = channel.empty() LOOKUP_KINGDOM( assemblies ) - PROKKA( assemblies.join( LOOKUP_KINGDOM.out.detected_kingdom )) + assemblies_with_kingdom = assemblies.join( LOOKUP_KINGDOM.out.detected_kingdom ).map{ meta, file1, file2 -> { + def parts = file2.toString().split('/') + def filename = parts[-1] + def name_parts = filename.split('_') + def kingdom_name = name_parts[0] + return [meta, file1, kingdom_name] + } + } + + + if ( params.bakta ) { + assemblies_with_kingdom.branch { + bacteria: it[2] == "Bacteria" + archaea: it[2] == "Archaea" + }.set { assemblies_to_annotate } + + BAKTA_BAKTA( assemblies_to_annotate.bacteria, bakta_db ) + + PROKKA( assemblies_to_annotate.archaea ) - ch_versions = ch_versions.mix(PROKKA.out.versions.first()) + ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions.first()) + ch_versions = ch_versions.mix(PROKKA.out.versions.first()) + + annotations_fna = annotations_fna.mix( BAKTA_BAKTA.out.fna ).mix( PROKKA.out.fna ) + annotations_gbk = annotations_gbk.mix( BAKTA_BAKTA.out.gbk ).mix( PROKKA.out.gbk ) + annotations_faa = annotations_faa.mix( BAKTA_BAKTA.out.faa ).mix( PROKKA.out.faa ) + annotations_gff = annotations_gff.mix( BAKTA_BAKTA.out.gff ).mix( PROKKA.out.gff ) + + } else { + + PROKKA( assemblies_with_kingdom ) + + ch_versions = ch_versions.mix(PROKKA.out.versions.first()) + + annotations_fna = PROKKA.out.fna + annotations_gbk = PROKKA.out.gbk + annotations_faa = PROKKA.out.faa + annotations_gff = PROKKA.out.gff + } assemblies_for_quast = assemblies.join( - PROKKA.out.gff + annotations_gff ).map { it -> tuple(it[0], it[1], it[2]) } QUAST( @@ -184,7 +236,7 @@ workflow METTANNOTATOR { ch_versions = ch_versions.mix(CRISPRCAS_FINDER.out.versions.first()) // EGGNOG_MAPPER_ORTHOLOGS - needs a third empty file in mode=mapper - proteins_for_emapper_orth = PROKKA.out.faa.map { it -> tuple( it[0], file(it[1]), file("NO_FILE") ) } + proteins_for_emapper_orth = annotations_faa.map { it -> tuple( 
it[0], file(it[1]), file("NO_FILE") ) } EGGNOG_MAPPER_ORTHOLOGS( proteins_for_emapper_orth, @@ -210,16 +262,16 @@ workflow METTANNOTATOR { ch_versions = ch_versions.mix(EGGNOG_MAPPER_ANNOTATIONS.out.versions.first()) INTERPROSCAN( - PROKKA.out.faa, + annotations_faa, interproscan_db ) ch_versions = ch_versions.mix(INTERPROSCAN.out.versions.first()) assemblies_plus_faa_and_gff = assemblies.join( - PROKKA.out.faa + annotations_faa ).join( - PROKKA.out.gff + annotations_gff ) AMRFINDER_PLUS( @@ -234,57 +286,57 @@ workflow METTANNOTATOR { ch_versions = ch_versions.mix(AMRFINDER_PLUS_TO_GFF.out.versions.first()) DEFENSE_FINDER ( - PROKKA.out.faa.join( PROKKA.out.gff ), + annotations_faa.join( annotations_gff ), defense_finder_db ) ch_versions = ch_versions.mix(DEFENSE_FINDER.out.versions.first()) - UNIFIRE ( PROKKA.out.faa ) + UNIFIRE ( annotations_faa ) ch_versions = ch_versions.mix(UNIFIRE.out.versions.first()) DETECT_TRNA( - PROKKA.out.fna, + annotations_fna.join( LOOKUP_KINGDOM.out.detected_kingdom ) ) ch_versions = ch_versions.mix(DETECT_TRNA.out.versions.first()) DETECT_NCRNA( - PROKKA.out.fna, + annotations_fna, rfam_ncrna_models ) ch_versions = ch_versions.mix(DETECT_NCRNA.out.versions.first()) SANNTIS( - INTERPROSCAN.out.ips_annotations.join(PROKKA.out.gbk) + INTERPROSCAN.out.ips_annotations.join(annotations_gbk) ) ch_versions = ch_versions.mix(SANNTIS.out.versions.first()) GECCO_RUN( - PROKKA.out.gbk.map { meta, gbk -> [meta, gbk, []] }, [] + annotations_gbk.map { meta, gbk -> [meta, gbk, []] }, [] ) ch_versions = ch_versions.mix(GECCO_RUN.out.versions.first()) ANTISMASH( - PROKKA.out.gbk, + annotations_gbk, antismash_db ) ch_versions = ch_versions.mix(ANTISMASH.out.versions.first()) DBCAN( - PROKKA.out.faa.join( PROKKA.out.gff ), + annotations_faa.join( annotations_gff ), dbcan_db ) ch_versions = ch_versions.mix(DBCAN.out.versions.first()) ANNOTATE_GFF( - PROKKA.out.gff.join( + annotations_gff.join( INTERPROSCAN.out.ips_annotations ).join( 
EGGNOG_MAPPER_ANNOTATIONS.out.annotations @@ -318,6 +370,12 @@ workflow METTANNOTATOR { ch_versions = ch_versions.mix(ANNOTATE_GFF.out.versions.first()) + CIRCOS_PLOT ( + ANNOTATE_GFF.out.annotated_gff + ) + + ch_versions = ch_versions.mix(CIRCOS_PLOT.out.versions.first()) + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') )