Merge pull request #33 from EBI-Metagenomics/dev

Dev
EBI-Metagenomics · Jun 5, 2024 · 455afd8 · 455afd8
2 parents df77457 + 05f68a4
commit 455afd8
Show file tree

Hide file tree

Showing 42 changed files with 1,358 additions and 109 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -17,7 +17,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/ebi-metageno
 - [ ] If you've fixed a bug or added code that should be tested, add tests!
 - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/ebi-metagenomics/mettannotator/tree/master/.github/CONTRIBUTING.md)
 - [ ] Make sure your code lints (`nf-core lint`).
-- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`).
+- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`).
 - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`).
 - [ ] Usage Documentation in `docs/usage.md` is updated.
 - [ ] Output Documentation in `docs/output.md` is updated.

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -14,13 +14,12 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
+      - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
 
-      - name: Set up Python 3.11
-        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
+      - name: Set up Python 3.12
+        uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
         with:
-          python-version: 3.11
-          cache: "pip"
+          python-version: "3.12"
 
       - name: Install pre-commit
         run: pip install pre-commit
@@ -32,14 +31,14 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
 
       - name: Install Nextflow
-        uses: nf-core/setup-nextflow@v1
+        uses: nf-core/setup-nextflow@v2
 
-      - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
+      - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
         with:
-          python-version: "3.11"
+          python-version: "3.12"
           architecture: "x64"
 
       - name: Install dependencies
@@ -60,7 +59,7 @@ jobs:
 
       - name: Upload linting log file artifact
         if: ${{ always() }}
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4
         with:
           name: linting-logs
           path: |

diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3
+        uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
         with:
           workflow: linting.yml
           workflow_conclusion: completed

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -22,6 +22,10 @@
 
   > Seemann T. Prokka: rapid prokaryotic genome annotation. Bioinformatics. 2014 Jul 15;30(14):2068-9. doi: 10.1093/bioinformatics/btu153. Epub 2014 Mar 18. PMID: 24642063.
 
+- [Bakta](https://pubmed.ncbi.nlm.nih.gov/34739369/)
+
+  > Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A. Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification. Microb Genom. 2021 Nov;7(11):000685. doi: 10.1099/mgen.0.000685. PMID: 34739369; PMCID: PMC8743544.
+
 - [InterProScan](https://pubmed.ncbi.nlm.nih.gov/24451626/)
 
   > Jones P, Binns D, Chang HY, Fraser M, Li W, McAnulla C, McWilliam H, Maslen J, Mitchell A, Nuka G, Pesseat S, Quinn AF, Sangrador-Vegas A, Scheremetjew M, Yong SY, Lopez R, Hunter S. InterProScan 5: genome-scale protein function classification. Bioinformatics. 2014 May 1;30(9):1236-40. doi: 10.1093/bioinformatics/btu031. Epub 2014 Jan 21. PMID: 24451626; PMCID: PMC3998142.

diff --git a/README.md b/README.md
diff --git a/bin/add_hypothetical_protein_descriptions.py b/bin/add_hypothetical_protein_descriptions.py
@@ -30,7 +30,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile)
     ipr_info, ipr_memberdb_only, ipr_leveled_info = load_ipr(
         ipr_file, ipr_types, levels
     )
-
+    gene_caller = "Prokka"
     fasta_flag = False
     with open(infile, "r") as file_in, open(outfile, "w") as file_out:
         for line in file_in:
@@ -54,6 +54,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile)
                                 eggnog_info,
                                 ipr_info,
                                 ipr_memberdb_only,
+                                gene_caller,
                             )
 
                             if not function_source == "UniFIRE":
@@ -64,6 +65,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile)
                                             found_function,
                                             function_source,
                                             attributes_dict,
+                                            gene_caller
                                         )
                                     )
                             found_function = escape_reserved_characters(found_function)
@@ -73,7 +75,7 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile)
                             )
                         else:
                             attributes_dict = insert_product_source(
-                                attributes_dict, "Prokka"
+                                attributes_dict, gene_caller
                             )
                         col9_updated = update_col9(attributes_dict)
                         file_out.write(
@@ -96,11 +98,13 @@ def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile)
                         file_out.write(line)
                 else:
                     file_out.write(line)
+                    if "Bakta" in line:
+                        gene_caller = "Bakta"
             else:
                 file_out.write(line)
 
 
-def keep_or_move_to_note(found_function, function_source, col9_dict):
+def keep_or_move_to_note(found_function, function_source, col9_dict, gene_caller):
     """
     Function aims to identify if a description is likely to be a sentence/paragraph rather than
     a succinct function description. If it's the former, move it to note and revert function to
@@ -172,12 +176,12 @@ def keep_or_move_to_note(found_function, function_source, col9_dict):
         if move_to_note:
             col9_dict = move_function_to_note(found_function, col9_dict)
             found_function = "hypothetical protein"
-            function_source = "Prokka"
+            function_source = gene_caller
     else:
         # Product is too long, move to note
         col9_dict = move_function_to_note(found_function, col9_dict)
         found_function = "hypothetical protein"
-        function_source = "Prokka"
+        function_source = gene_caller
     return found_function, function_source, col9_dict
 
 
@@ -248,7 +252,7 @@ def insert_product_source(my_dict, source):
     )
 
 
-def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only):
+def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only, gene_caller):
     """
     Identify function by carrying it over from a db match. The following priority is used:
     Priority 1: UniFIRE protein recommended full name
@@ -306,7 +310,7 @@ def get_function(acc, attributes_dict, eggnog_annot, ipr_info, ipr_memberdb_only
             return func_description, source
     if acc in eggnog_annot:
         return eggnog_annot[acc], "eggNOG"
-    return "hypothetical protein", "Prokka"
+    return "hypothetical protein", gene_caller
 
 
 def get_description_and_source(my_dict, ipr_type):

diff --git a/bin/annotate_gff.py b/bin/annotate_gff.py
@@ -456,9 +456,12 @@ def load_annotations(
                 line = line.replace("db_xref", "Dbxref")
                 cols = line.split("\t")
                 if len(cols) == 9:
-                    contig, feature, start, annot = cols[0], cols[2], cols[3], cols[8]
+                    contig, caller, feature, start, annot = cols[0], cols[1], cols[2], cols[3], cols[8]
                     if feature != "CDS":
-                        continue
+                        if caller == "Bakta" and feature == "region":
+                            main_gff.setdefault(contig, dict()).setdefault(int(start), list()).append(line)
+                        else:
+                            continue
                     protein = annot.split(";")[0].split("=")[-1]
                     added_annot[protein] = {}
                     try:

diff --git a/bin/circos_plot.py b/bin/circos_plot.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+import argparse
+import logging
+import sys
+
+from pycirclize import Circos
+from pycirclize.parser import Gff
+
+from matplotlib.patches import Patch
+
+# Configure the root logger so that INFO-level messages (used below to explain
+# why a plot or its contig labels were skipped) are emitted.
+logging.basicConfig(level=logging.INFO)
+
+
+def main(infile, outfile, prefix, contig_num_limit, contig_trim, mobilome, dpi):
+    """
+    Draw a circular plot of an annotated GFF file and save it to disk.
+
+    One sector is drawn per sequence ID found in the GFF. Each sector gets a scale
+    track plus tracks for forward/reverse CDS, RNA, BGCs (antiSMASH, GECCO, SanntiS),
+    dbCAN, AMR, anti-phage defense and, optionally, the mobilome.
+
+    :param infile: path to the GFF file to plot
+    :param outfile: path the figure is saved to
+    :param prefix: text written onto the plot as a label
+    :param contig_num_limit: maximum number of sequence IDs for which a plot is
+        generated; if the GFF has more, a message is logged and the script exits
+    :param contig_trim: number of characters contig names are truncated to when
+        printed; the value 500 is treated as "no truncation requested"
+    :param mobilome: if True, add the mobilome track and its legend entries
+    :param dpi: resolution passed to ``fig.savefig``
+    """
+    # pycirclize parses a copy of the input from which escaped "\=" sequences were removed
+    modified_infile = remove_escaped_characters(infile)
+    gff = Gff(modified_infile)
+    seqid2size = gff.get_seqid2size()
+    if len(seqid2size) > contig_num_limit:
+        logging.info(
+            "Skipping plot generation for file {} due to a large number of contigs: {}. "
+            "Plots are only generated for genomes with up to {} annotated contigs.".format(
+                infile, len(seqid2size), contig_num_limit
+            )
+        )
+        # Called without an argument, so the script stops without signalling an error
+        sys.exit()
+
+    # NOTE(review): feature_type=None presumably returns features of every type - confirm
+    # against the pycirclize Gff documentation
+    seqid2features = gff.get_seqid2features(feature_type=None)
+
+    # NOTE(review): start=1/end=358 presumably leave a small gap in the circle - confirm
+    circos = Circos(seqid2size, space=1, start=1, end=358)
+
+    circos.text("{}\n".format(prefix), size=15, r=30)
+
+    # Skip printing contig names if the names are too long
+    print_contigs = True
+    # 500 is the command-line default for --contig-trim, so it doubles as the
+    # "user did not ask for truncation" marker
+    if contig_trim == 500:  # the user didn't choose truncation, check lengths
+        for sector in circos.sectors:
+            # A single name longer than 24 characters disables the labels for all contigs
+            if len(sector.name[:contig_trim]) > 24:
+                print_contigs = False
+    if not print_contigs:
+        logging.info("Not printing contig labels because they are too long. Rerun the script with the "
+                     "--contig-trim flag to truncate the labels if you would like them printed.")
+    for sector in circos.sectors:
+        if print_contigs:
+            # Plot contig labels
+            sector.text(
+                sector.name[:contig_trim], orientation="vertical", r=110, size=6, color="dimgrey"
+            )
+        # Plot scale
+        position_track = sector.add_track((99, 100))
+        position_track.axis(fc="none")
+        major_ticks_interval = 500000
+        minor_ticks_interval = 50000
+        # Sectors no larger than the minor interval get no ticks at all
+        if sector.size > minor_ticks_interval:
+            # Major ticks are labelled (in Mb) only when the sector is at least
+            # ten minor intervals long; shorter sectors get unlabelled ticks
+            if sector.size >= minor_ticks_interval * 10:
+                position_track.xticks_by_interval(
+                    major_ticks_interval, label_formatter=lambda v: f"{v / 10 ** 6:.1f} Mb"
+                )
+            else:
+                position_track.xticks_by_interval(
+                    major_ticks_interval, show_label=False
+                )
+            position_track.xticks_by_interval(
+                minor_ticks_interval, tick_length=1, show_label=False
+            )
+
+        # Initiate feature tracks
+        # Each tuple is the (inner, outer) radius range of the track; the axis
+        # edge colour of each annotation track matches the colour of its features
+        f_cds_track = sector.add_track((93, 98), r_pad_ratio=0.1)
+        f_cds_track.axis(fc="none", ec="none")
+        r_cds_track = sector.add_track((88, 93), r_pad_ratio=0.1)
+        r_cds_track.axis(fc="none", ec="none")
+        rna_track = sector.add_track((83, 87), r_pad_ratio=0.1)
+        rna_track.axis(fc="none", ec="none")
+        bgc_track_antismash = sector.add_track((78, 80), r_pad_ratio=0.1)
+        bgc_track_antismash.axis(fc="none", ec="tomato", ls="dashdot", lw=0.15)
+        bgc_track_gecco = sector.add_track((76, 78), r_pad_ratio=0.1)
+        bgc_track_gecco.axis(fc="none", ec="lightsalmon", ls="dashdot", lw=0.15)
+        bgc_track_sanntis = sector.add_track((74, 76), r_pad_ratio=0.1)
+        bgc_track_sanntis.axis(fc="none", ec="firebrick", ls="dashdot", lw=0.15)
+        dbcan_track = sector.add_track((68, 70), r_pad_ratio=0.1)
+        dbcan_track.axis(fc="none", ec="forestgreen", ls="dashdot", lw=0.15)
+        amr_track = sector.add_track((62, 64), r_pad_ratio=0.1)
+        amr_track.axis(fc="none", ec="dodgerblue", ls="dashdot", lw=0.15)
+        antiphage_track = sector.add_track((56, 58), r_pad_ratio=0.1)
+        antiphage_track.axis(fc="none", ec="orchid", ls="dashdot", lw=0.15)
+        # The mobilome track only exists when requested; the branches below that
+        # use it check the mobilome flag first
+        if mobilome:
+            mobilome_track = sector.add_track((50, 52), r_pad_ratio=0.1)
+            mobilome_track.axis(fc="none", ec="lightseagreen", ls="dashdot", lw=0.15)
+
+        for feature in seqid2features[sector.name]:
+            if feature.type == "CDS":
+                # Strand 1 goes to the forward track, anything else to the reverse track
+                if feature.strand == 1:
+                    f_cds_track.genomic_features([feature], fc="hotpink")
+                else:
+                    r_cds_track.genomic_features([feature], fc="steelblue")
+                # A CDS is additionally drawn on every annotation track whose
+                # qualifier key is present in its attributes
+                if "antismash_bgc_function" in feature.qualifiers:
+                    bgc_track_antismash.genomic_features([feature], fc="tomato")
+                if "gecco_bgc_type" in feature.qualifiers:
+                    bgc_track_gecco.genomic_features([feature], fc="lightsalmon")
+                if "nearest_MiBIG" in feature.qualifiers:
+                    bgc_track_sanntis.genomic_features([feature], fc="firebrick")
+                if "dbcan_prot_type" in feature.qualifiers:
+                    dbcan_track.genomic_features([feature], fc="forestgreen")
+                if "amrfinderplus_scope" in feature.qualifiers:
+                    amr_track.genomic_features([feature], fc="dodgerblue")
+                if "defense_finder_type" in feature.qualifiers:
+                    antiphage_track.genomic_features([feature], fc="orchid")
+
+            elif feature.type in ["tRNA", "ncRNA"]:
+                rna_track.genomic_features([feature], fc="darkmagenta")
+            elif mobilome and feature.type in [
+                "mobility_island",
+                "cellular_recombinase",
+                "insertion_sequence",
+                "conjugative_element",
+                "conjugative_integron",
+                "integron",
+                "plasmid",
+                "nested_mobile_element",
+                "terminal_inverted_repeat_element",
+                "direct_repeat_element",
+                "viral_sequence",
+            ]:
+                mobilome_track.genomic_features([feature], fc="lightseagreen")
+            # Phage features share the mobilome track but use a different colour;
+            # the type is compared case-insensitively
+            elif mobilome and feature.type.lower() in ["phage", "prophage"]:
+                mobilome_track.genomic_features([feature], fc="blue")
+
+    fig = circos.plotfig()
+    # Add legend
+    # Legend colours mirror the fc values used for the features above
+    handles = [
+        Patch(color="hotpink", label="Forward CDS"),
+        Patch(color="steelblue", label="Reverse CDS"),
+        Patch(color="darkmagenta", label="RNA"),
+        Patch(color="tomato", label="BGCs (antiSMASH)"),
+        Patch(color="lightsalmon", label="BGCs (GECCO)"),
+        Patch(color="firebrick", label="BGCs (SanntiS)"),
+        Patch(color="forestgreen", label="Predicted PULs"),
+        Patch(color="dodgerblue", label="AMR genes"),
+        Patch(color="orchid", label="Anti-phage defense genes"),
+    ]
+    if mobilome:
+        handles = handles + [
+            Patch(color="blue", label="Mobilome (phage)"),
+            Patch(color="lightseagreen", label="Mobilome (other)"),
+        ]
+
+    # The legend is centred on the anchor point (0.5, 0.475)
+    main_legend = circos.ax.legend(
+        handles=handles, bbox_to_anchor=(0.5, 0.475), loc="center", fontsize=8
+    )
+    circos.ax.add_artist(main_legend)
+    fig.savefig(outfile, dpi=dpi)
+
+
+def remove_escaped_characters(infile):
+    """
+    Write a copy of a file with every escaped equals sign (backslash followed by "=")
+    removed and return the path of that copy.
+
+    The copy is placed next to the input and named by appending "_modified" to the
+    input path. The input file itself is left untouched.
+
+    :param infile: path of the file to clean up
+    :return: path of the newly written file
+    """
+    cleaned_path = infile + "_modified"
+
+    # Read everything first, then drop each "\=" by splitting on it and re-joining
+    with open(infile, "r") as source_handle:
+        cleaned_text = "".join(source_handle.read().split("\\="))
+
+    with open(cleaned_path, "w") as target_handle:
+        target_handle.write(cleaned_text)
+
+    return cleaned_path
+
+
+def parse_args():
+    """
+    Parse the command-line arguments of the Circos plotting script.
+
+    Required options: -i/--infile, -o/--outfile and -p/--prefix.
+    Optional options: -l/--limit (int, default 50), --contig-trim (int, default 500),
+    --mobilome (flag, default False) and --dpi (int, default 600).
+
+    :return: argparse.Namespace with the attributes infile, outfile, prefix, limit,
+        contig_trim, mobilome and dpi
+    """
+    parser = argparse.ArgumentParser(description="Script for Circos plot generation.")
+    parser.add_argument(
+        "-i", "--infile", required=True, help="Path to the GFF file to plot"
+    )
+    parser.add_argument(
+        "-o", "--outfile", required=True, help="Path to the output file"
+    )
+    parser.add_argument(
+        "-p", "--prefix", required=True, help="Prefix to use for the genome"
+    )
+    parser.add_argument(
+        "-l",
+        "--limit",
+        required=False,
+        type=int,
+        default=50,
+        help="Only generate a plot if the genome has no more than this number of contigs. Limit introduced because "
+             "highly fragmented genomes do not produce readable plots. Default: 50.",
+    )
+    # The help text refers to the contig *name*: it is the length of the name, not
+    # of the contig sequence, that decides whether labels are printed.
+    parser.add_argument(
+        "--contig-trim",
+        required=False,
+        default=500,
+        type=int,
+        help="If a contig name is over 24 characters long, contig names will not be printed on the plot. Specify "
+             "the length to trim the contig names down to if you would like the shorter names printed.",
+    )
+    parser.add_argument(
+        "--mobilome",
+        required=False,
+        action="store_true",
+        default=False,
+        help="Plot the mobilome track. Default: False",
+    )
+    parser.add_argument(
+        "--dpi",
+        required=False,
+        type=int,
+        default=600,
+        help="Specify the dpi for the plot. Default: 600",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    # Script entry point: hand every parsed command-line option to main() by name
+    cli_options = parse_args()
+    main(
+        infile=cli_options.infile,
+        outfile=cli_options.outfile,
+        prefix=cli_options.prefix,
+        contig_num_limit=cli_options.limit,
+        contig_trim=cli_options.contig_trim,
+        mobilome=cli_options.mobilome,
+        dpi=cli_options.dpi,
+    )