From 94d6d1f2a0f1421ca18bfdfd05fd811825a18f8f Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:26:10 +0200
Subject: [PATCH 01/11] Added additional_annot param

---
 nextflow.config      | 1 +
 nextflow_schema.json | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 812a2dfd..4e9f0abf 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -22,6 +22,7 @@ params {
     primer_left_suffix         = '_LEFT'
     primer_right_suffix        = '_RIGHT'
     save_reference             = false
+    additional_annot           = false
 
     // Nanopore options
     fastq_dir                  = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 083172b5..c4500c72 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -78,6 +78,14 @@
                     "description": "Full path to GFF annotation file.",
                     "fa_icon": "fas fa-file-invoice"
                 },
+                "additional_annot": {
+                    "type": "string",
+                    "format": "file-path",
+                    "mimetype": "text/plain",
+                    "pattern": "^\\S+(\\.gff|\\.gtf)(\\.gz)?$",
+                    "description": "Full path to additional annotation file in GTF or GFF format.",
+                    "fa_icon": "fas fa-file-invoice"
+                },
                 "bowtie2_index": {
                     "type": "string",
                     "format": "path",

From b7cc3161417e76d9bb5a7bd4a97ddd64f7f195b7 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:49:20 +0200
Subject: [PATCH 02/11] Added param additional_annot channel

---
 workflows/illumina.nf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index e256043d..0ae219b4 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -35,12 +35,13 @@ def checkPathParamList = [
     params.input, params.fasta, params.gff, params.bowtie2_index,
     params.kraken2_db, params.primer_bed, params.primer_fasta,
     params.blast_db, params.spades_hmm, params.multiqc_config,
-    params.freyja_barcodes, params.freyja_lineages
+    params.freyja_barcodes, params.freyja_lineages, params.additional_annot
 ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-if (params.input)      { ch_input      = file(params.input)      } else { exit 1, 'Input samplesheet file not specified!' }
-if (params.spades_hmm) { ch_spades_hmm = file(params.spades_hmm) } else { ch_spades_hmm = []                              }
+if (params.input)            { ch_input          = file(params.input)            } else { exit 1, 'Input samplesheet file not specified!' }
+if (params.spades_hmm)       { ch_spades_hmm     = file(params.spades_hmm)       } else { ch_spades_hmm = []                              }
+if (params.additional_annot) { ch_additional_gtf = file(params.additional_annot) } else { additional_annot = []                           }
 
 def assemblers = params.assemblers ? params.assemblers.split(',').collect{ it.trim().toLowerCase() } : []
 

From 2fdd8aa450086316b976e69e432d3f57d6300ffd Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:50:30 +0200
Subject: [PATCH 03/11] Added ADDITIONAL_ANNOT process

---
 workflows/illumina.nf | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/workflows/illumina.nf b/workflows/illumina.nf
index 0ae219b4..b38fe9b6 100644
--- a/workflows/illumina.nf
+++ b/workflows/illumina.nf
@@ -85,6 +85,7 @@ include { VARIANTS_BCFTOOLS       } from '../subworkflows/local/variants_bcftool
 include { CONSENSUS_IVAR          } from '../subworkflows/local/consensus_ivar'
 include { CONSENSUS_BCFTOOLS      } from '../subworkflows/local/consensus_bcftools'
 include { VARIANTS_LONG_TABLE     } from '../subworkflows/local/variants_long_table'
+include { ADDITIONAL_ANNOT        } from '../subworkflows/local/additional_annot'
 include { ASSEMBLY_SPADES         } from '../subworkflows/local/assembly_spades'
 include { ASSEMBLY_UNICYCLER      } from '../subworkflows/local/assembly_unicycler'
 include { ASSEMBLY_MINIA          } from '../subworkflows/local/assembly_minia'
@@ -559,6 +560,21 @@ workflow ILLUMINA {
         ch_versions = ch_versions.mix(VARIANTS_LONG_TABLE.out.versions)
     }
 
+    //
+    // SUBWORKFLOW: Create variants long table report for additional annotation file
+    //
+    if (params.additional_annot) {
+        ADDITIONAL_ANNOT (
+            ch_vcf,
+            ch_tbi,
+            PREPARE_GENOME.out.fasta,
+            ch_additional_gtf,
+            ch_pangolin_multiqc
+
+        )
+        ch_versions = ch_versions.mix(ADDITIONAL_ANNOT.out.versions)
+    }
+
     //
     // MODULE: Primer trimming with Cutadapt
     //

From 97401619fb26c6e5dd9cb9ad42988d107f9d3157 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:50:53 +0200
Subject: [PATCH 04/11] Added additional_annot subworkflow

---
 subworkflows/local/additional_annot.nf | 79 ++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 subworkflows/local/additional_annot.nf

diff --git a/subworkflows/local/additional_annot.nf b/subworkflows/local/additional_annot.nf
new file mode 100644
index 00000000..cde476e2
--- /dev/null
+++ b/subworkflows/local/additional_annot.nf
@@ -0,0 +1,79 @@
+//
+// Run snpEff, bgzip, tabix, stats and SnpSift commands
+//
+
+include { SNPEFF_BUILD                                                    } from '../../modules/local/snpeff_build'
+include { SNPEFF_ANN                                                      } from '../../modules/local/snpeff_ann'
+include { SNPSIFT_EXTRACTFIELDS                                           } from '../../modules/local/snpsift_extractfields'
+include { VCF_BGZIP_TABIX_STATS                                           } from './vcf_bgzip_tabix_stats'
+include { BCFTOOLS_QUERY                                                  } from '../../modules/nf-core/bcftools/query/main'
+include { MAKE_VARIANTS_LONG_TABLE as MAKE_VARIANTS_LONG_TABLE_ADDITIONAL } from '../../modules/local/make_variants_long_table'
+
+
+workflow ADDITIONAL_ANNOT {
+    take:
+    vcf      // channel: [ val(meta), [ vcf ] ]
+    tbi      // channel: [ val(meta), [ tbi ] ]
+    fasta    // path   : genome.fasta
+    annot    // path   : additional_annot
+    pangolin // channel: [ val(meta), [ csv ] ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    //
+    // Make snpEff database
+    //
+    ch_snpeff_db     = Channel.empty()
+    ch_snpeff_config = Channel.empty()
+
+    SNPEFF_BUILD (
+        fasta,
+        annot
+    )
+    ch_snpeff_db     = SNPEFF_BUILD.out.db
+    ch_snpeff_config = SNPEFF_BUILD.out.config
+    ch_versions      = ch_versions.mix(SNPEFF_BUILD.out.versions)
+
+    SNPEFF_ANN (
+        vcf,
+        ch_snpeff_db,
+        ch_snpeff_config,
+        fasta
+    )
+    ch_versions = ch_versions.mix(SNPEFF_ANN.out.versions.first())
+
+    VCF_BGZIP_TABIX_STATS (
+        SNPEFF_ANN.out.vcf,
+        [],
+        [],
+        []
+    )
+    ch_versions = ch_versions.mix(VCF_BGZIP_TABIX_STATS.out.versions)
+
+    SNPSIFT_EXTRACTFIELDS (
+        VCF_BGZIP_TABIX_STATS.out.vcf
+    )
+    ch_versions = ch_versions.mix(SNPSIFT_EXTRACTFIELDS.out.versions.first())
+
+    BCFTOOLS_QUERY (
+        vcf.join(tbi, by: [0]),
+        [],
+        [],
+        []
+    )
+    ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first())
+
+    MAKE_VARIANTS_LONG_TABLE_ADDITIONAL (
+        BCFTOOLS_QUERY.out.txt.collect{it[1]},
+        SNPSIFT_EXTRACTFIELDS.out.txt.collect{it[1]}.ifEmpty([]),
+        pangolin.collect{it[1]}.ifEmpty([])
+    )
+    ch_versions = ch_versions.mix(MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.versions)
+
+    emit:
+    long_table  = MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.csv // channel: [ val(meta), [ csv ] ]
+
+    versions    = ch_versions    // channel: [ versions.yml ]
+}

From cafa3b491ee48f6db24e44a00046952b250075f8 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:51:20 +0200
Subject: [PATCH 05/11] Added option to detec annotation format in snpeff build

---
 modules/local/snpeff_build.nf | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/modules/local/snpeff_build.nf b/modules/local/snpeff_build.nf
index e1ab367f..b9c875ba 100644
--- a/modules/local/snpeff_build.nf
+++ b/modules/local/snpeff_build.nf
@@ -20,7 +20,14 @@ process SNPEFF_BUILD {
     task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args ?: ''
     def basename = fasta.baseName
+    def extension = gff.getExtension()
+    if (extension == "gtf") {
+        format = "gtf22"
+    } else {
+        format = "gff3"
+    }
 
     def avail_mem = 4
     if (!task.memory) {
@@ -36,7 +43,7 @@ process SNPEFF_BUILD {
     cd ../../
     mkdir -p snpeff_db/${basename}/
     cd snpeff_db/${basename}/
-    ln -s ../../$gff genes.gff
+    ln -s ../../$gff genes.$extension
 
     cd ../../
     echo "${basename}.genome : ${basename}" > snpeff.config
@@ -46,7 +53,8 @@ process SNPEFF_BUILD {
         build \\
         -config snpeff.config \\
         -dataDir ./snpeff_db \\
-        -gff3 \\
+        -${format} \\
+        $args \\
         -v \\
         ${basename}
 

From c750f3810721e832d3c066a2ce6007d0b3ab8030 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:53:08 +0200
Subject: [PATCH 06/11] variants long table more than one annotation, remove
 upstream and downstream

---
 bin/make_variants_long_table.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py
index f0bad221..19e1d0ed 100755
--- a/bin/make_variants_long_table.py
+++ b/bin/make_variants_long_table.py
@@ -236,11 +236,7 @@ def snpsift_to_table(snpsift_file):
     new_colnames = [x.replace("ANN[*].", "") for x in old_colnames]
     table.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True)
     table = table.loc[:, ["CHROM", "POS", "REF", "ALT", "GENE", "EFFECT", "HGVS_C", "HGVS_P"]]
-
-    ## Split by comma and get first value in cols = ['ALT','GENE','EFFECT','HGVS_C','HGVS_P']
-    for i in range(len(table)):
-        for j in range(3, 8):
-            table.iloc[i, j] = str(table.iloc[i, j]).split(",")[0]
+    table = one_effect_per_line(table)
 
     ## Amino acid substitution
     aa = []
@@ -251,6 +247,28 @@ def snpsift_to_table(snpsift_file):
 
     return table
 
+def one_effect_per_line(table):
+    one_effect_per_line_table = pd.DataFrame()
+    for i in range(len(table)):
+        gene_list = table.iloc[i,4].split(",")
+        effect_list = table.iloc[i,5].split(",")
+        hgvs_c_list = table.iloc[i,6].split(",")
+        hgvs_p_list = table.iloc[i,7].split(",")
+        
+        count = 0
+        for j in range(len(gene_list)):
+            if "upstream" in effect_list[j] or "downstream" in effect_list[j]:
+                count += 1
+        for j in range(len(gene_list)):
+            if len(effect_list) == count:
+                row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[0], 'EFFECT':effect_list[0], 'HGVS_C':hgvs_c_list[0], 'HGVS_P':hgvs_p_list[0],}
+                one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True)
+            else:
+                if not "upstream" in effect_list[j] and not "downstream" in effect_list[j]:
+                    row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[j], 'EFFECT':effect_list[j], 'HGVS_C':hgvs_c_list[j], 'HGVS_P':hgvs_p_list[j],}
+                    one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True)
+    return(one_effect_per_line_table)
+
 
 def main(args=None):
     args = parser_args(args)

From 15d99197654e5b7eece80352a2ff05c3742c76ab Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 12:53:23 +0200
Subject: [PATCH 07/11] Rename output file for additional annotation variants
 long table

---
 conf/modules_illumina.config | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
index 2cc48bea..4c1c4c22 100644
--- a/conf/modules_illumina.config
+++ b/conf/modules_illumina.config
@@ -564,6 +564,14 @@ if (!params.skip_variants) {
                         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
                     ]
                 }
+                withName: 'MAKE_VARIANTS_LONG_TABLE_ADDITIONAL' {
+                    ext.args = "--variant_caller ${variant_caller} --output_file 'additional_variants_long_table.csv'"
+                    publishDir = [
+                        path: { "${params.outdir}/variants/${variant_caller}" },
+                        mode: params.publish_dir_mode,
+                        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                    ]
+                }
             }
         }
     }

From 6d834f2589abee5a56d5a356ce0db8e7ba33444e Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 13:22:05 +0200
Subject: [PATCH 08/11] Fixed black linting

---
 bin/make_variants_long_table.py | 43 +++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py
index 19e1d0ed..dbcfc277 100755
--- a/bin/make_variants_long_table.py
+++ b/bin/make_variants_long_table.py
@@ -247,27 +247,50 @@ def snpsift_to_table(snpsift_file):
 
     return table
 
+
 def one_effect_per_line(table):
     one_effect_per_line_table = pd.DataFrame()
     for i in range(len(table)):
-        gene_list = table.iloc[i,4].split(",")
-        effect_list = table.iloc[i,5].split(",")
-        hgvs_c_list = table.iloc[i,6].split(",")
-        hgvs_p_list = table.iloc[i,7].split(",")
-        
+        gene_list = table.iloc[i, 4].split(",")
+        effect_list = table.iloc[i, 5].split(",")
+        hgvs_c_list = table.iloc[i, 6].split(",")
+        hgvs_p_list = table.iloc[i, 7].split(",")
+
         count = 0
         for j in range(len(gene_list)):
             if "upstream" in effect_list[j] or "downstream" in effect_list[j]:
                 count += 1
         for j in range(len(gene_list)):
             if len(effect_list) == count:
-                row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[0], 'EFFECT':effect_list[0], 'HGVS_C':hgvs_c_list[0], 'HGVS_P':hgvs_p_list[0],}
-                one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True)
+                row = {
+                    "CHROM": table.iloc[i, 0],
+                    "POS": table.iloc[i, 1],
+                    "REF": table.iloc[i, 2],
+                    "ALT": table.iloc[i, 3],
+                    "GENE": gene_list[0],
+                    "EFFECT": effect_list[0],
+                    "HGVS_C": hgvs_c_list[0],
+                    "HGVS_P": hgvs_p_list[0],
+                }
+                one_effect_per_line_table = pd.concat(
+                    [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True
+                )
             else:
                 if not "upstream" in effect_list[j] and not "downstream" in effect_list[j]:
-                    row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[j], 'EFFECT':effect_list[j], 'HGVS_C':hgvs_c_list[j], 'HGVS_P':hgvs_p_list[j],}
-                    one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True)
-    return(one_effect_per_line_table)
+                    row = {
+                        "CHROM": table.iloc[i, 0],
+                        "POS": table.iloc[i, 1],
+                        "REF": table.iloc[i, 2],
+                        "ALT": table.iloc[i, 3],
+                        "GENE": gene_list[j],
+                        "EFFECT": effect_list[j],
+                        "HGVS_C": hgvs_c_list[j],
+                        "HGVS_P": hgvs_p_list[j],
+                    }
+                    one_effect_per_line_table = pd.concat(
+                        [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True
+                    )
+    return one_effect_per_line_table
 
 
 def main(args=None):

From 90a9f93fa14a537f4b7b6bebdc78998051d3a5f0 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 13:24:05 +0200
Subject: [PATCH 09/11] changed default value for additional_annot path to null

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 4e9f0abf..561769ff 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -22,7 +22,7 @@ params {
     primer_left_suffix         = '_LEFT'
     primer_right_suffix        = '_RIGHT'
     save_reference             = false
-    additional_annot           = false
+    additional_annot           = null
 
     // Nanopore options
     fastq_dir                  = null

From 27f4614e78e965e199b7d6685153c11be7eaf027 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 15:36:53 +0200
Subject: [PATCH 10/11] Added changes to changelog

---
 CHANGELOG.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52ae7f0e..0d3d42c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,16 +19,18 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [[#299](https://github.com/nf-core/viralrecon/issues/299)] - Add the freyja pipeline as a subworkflow
 - [[PR #387](https://github.com/nf-core/viralrecon/pull/387)] - Software closes gracefully when encountering an error
 - [[PR #395](https://github.com/nf-core/viralrecon/pull/395)] - Remove minia from default assemblers because it is unreliable
+- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation
 
 ### Parameters
 
-| Old parameter       | New parameter |
-| ------------------- | ------------- |
-| `--skip_freyja`     |               |
-| `--freyja_repeats`  |               |
-| `--freyja_db_name`  |               |
-| `--freyja_barcodes` |               |
-| `--freyja_lineages` |               |
+| Old parameter       | New parameter        |
+| ------------------- | -------------------- |
+| `--skip_freyja`     |                      |
+| `--freyja_repeats`  |                      |
+| `--freyja_db_name`  |                      |
+| `--freyja_barcodes` |                      |
+| `--freyja_lineages` |                      |
+|                     | `--additional_annot` |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present.
 > **NB:** Parameter has been **added** if just the new parameter information is present.

From cbab17b2a8ce7d7dfebd2a97e0e385a621814b67 Mon Sep 17 00:00:00 2001
From: svarona <sarai.var.fer@gmail.com>
Date: Mon, 23 Oct 2023 15:54:12 +0200
Subject: [PATCH 11/11] added description for additional_variants_long_table

---
 docs/output.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/output.md b/docs/output.md
index 536a20d3..0287e986 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -289,6 +289,7 @@ As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs
 
 - `<CALLER>/`
   - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis.
+  - `additional_variants_long_table.csv`: Long format table similar to `variants_long_table.csv` for additional annotation file with overlapping annotation features.
 
 **NB:** The value of `<CALLER>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').