Merge branch 'custom_gtf' into dev

BU-ISCIII · Jan 17, 2024 · b57d083 · b57d083
2 parents ce72f48 + cbab17b
commit b57d083
Show file tree

Hide file tree

Showing 9 changed files with 182 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,16 +20,18 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements
 - [[PR #387](https://github.com/nf-core/viralrecon/pull/387)] - Software closes gracefully when encountering an error
 - [[PR #395](https://github.com/nf-core/viralrecon/pull/395)] - Remove minia from default assemblers because it is unreliable
 - [[PR #393](https://github.com/nf-core/viralrecon/pull/393)] - Changed primer set to params
+- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation
 
 ### Parameters
 
-| Old parameter       | New parameter |
-| ------------------- | ------------- |
-| `--skip_freyja`     |               |
-| `--freyja_repeats`  |               |
-| `--freyja_db_name`  |               |
-| `--freyja_barcodes` |               |
-| `--freyja_lineages` |               |
+| Old parameter       | New parameter        |
+| ------------------- | -------------------- |
+| `--skip_freyja`     |                      |
+| `--freyja_repeats`  |                      |
+| `--freyja_db_name`  |                      |
+| `--freyja_barcodes` |                      |
+| `--freyja_lineages` |                      |
+|                     | `--additional_annot` |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present.
 > **NB:** Parameter has been **added** if just the new parameter information is present.

diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py
@@ -236,11 +236,7 @@ def snpsift_to_table(snpsift_file):
     new_colnames = [x.replace("ANN[*].", "") for x in old_colnames]
     table.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True)
     table = table.loc[:, ["CHROM", "POS", "REF", "ALT", "GENE", "EFFECT", "HGVS_C", "HGVS_P"]]
-
-    ## Split by comma and get first value in cols = ['ALT','GENE','EFFECT','HGVS_C','HGVS_P']
-    for i in range(len(table)):
-        for j in range(3, 8):
-            table.iloc[i, j] = str(table.iloc[i, j]).split(",")[0]
+    table = one_effect_per_line(table)
 
     ## Amino acid substitution
     aa = []
@@ -252,6 +248,51 @@ def snpsift_to_table(snpsift_file):
     return table
 
 
def _split_annotations(value, size=None):
    """Return the comma-separated items of *value* as a list.

    Non-string values (for example NaN for a field SnpSift left empty) are
    kept as a single item and repeated *size* times. When *size* is given, a
    string field is padded with empty strings or truncated so that the result
    always has exactly *size* items.
    """
    if not isinstance(value, str):
        return [value] * (size or 1)
    items = value.split(",")
    if size is not None:
        items = (items + [""] * size)[:size]
    return items


def _is_flanking(effect):
    """Tell whether *effect* is an upstream/downstream annotation."""
    return isinstance(effect, str) and ("upstream" in effect or "downstream" in effect)


def one_effect_per_line(table):
    """Expand comma-separated annotations into one row per reported effect.

    The first eight columns of *table* are read by position as CHROM, POS,
    REF, ALT, GENE, EFFECT, HGVS_C and HGVS_P. GENE, EFFECT, HGVS_C and HGVS_P
    may each hold several comma-separated values, one per annotation of the
    variant.

    For every variant:

    * annotations whose effect mentions "upstream" or "downstream" are
      dropped when at least one other annotation exists;
    * when *all* annotations are upstream/downstream, only the first one is
      kept, exactly once.

    Args:
        table: ``pandas.DataFrame`` with the eight columns listed above.

    Returns:
        A new ``pandas.DataFrame`` with columns CHROM, POS, REF, ALT, GENE,
        EFFECT, HGVS_C and HGVS_P and one row per retained annotation. The
        columns are present even when *table* has no rows.
    """
    columns = ["CHROM", "POS", "REF", "ALT", "GENE", "EFFECT", "HGVS_C", "HGVS_P"]
    rows = []
    for record in table.itertuples(index=False, name=None):
        variant = tuple(record[:4])

        # The gene list defines how many annotations the variant has; the
        # other fields are aligned to it so positional pairing cannot fail.
        genes = _split_annotations(record[4])
        n_annotations = len(genes)
        effects = _split_annotations(record[5], n_annotations)
        hgvs_c = _split_annotations(record[6], n_annotations)
        hgvs_p = _split_annotations(record[7], n_annotations)

        annotations = list(zip(genes, effects, hgvs_c, hgvs_p))
        informative = [ann for ann in annotations if not _is_flanking(ann[1])]

        # Only upstream/downstream hits: report the variant once, using its
        # first annotation, instead of repeating that row per annotation.
        kept = informative if informative else annotations[:1]
        rows.extend(variant + ann for ann in kept)

    # Build the frame in one go: concatenating inside the loop is quadratic.
    return pd.DataFrame(rows, columns=columns)
+
+
 def main(args=None):
     args = parser_args(args)
 

diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
@@ -564,6 +564,14 @@ if (!params.skip_variants) {
                         saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
                     ]
                 }
+                withName: 'MAKE_VARIANTS_LONG_TABLE_ADDITIONAL' {
+                    ext.args = "--variant_caller ${variant_caller} --output_file 'additional_variants_long_table.csv'"
+                    publishDir = [
+                        path: { "${params.outdir}/variants/${variant_caller}" },
+                        mode: params.publish_dir_mode,
+                        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                    ]
+                }
             }
         }
     }

diff --git a/docs/output.md b/docs/output.md
@@ -289,6 +289,7 @@ As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs
 
 - `<CALLER>/`
   - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis.
+  - `additional_variants_long_table.csv`: Long format table, analogous to `variants_long_table.csv`, generated from the additional annotation file supplied with `--additional_annot`, which may contain overlapping annotation features.
 
 **NB:** The value of `<CALLER>` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').
 

diff --git a/modules/local/snpeff_build.nf b/modules/local/snpeff_build.nf
@@ -20,7 +20,14 @@ process SNPEFF_BUILD {
     task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args ?: ''
     def basename = fasta.baseName
+    def extension = gff.getExtension()
+    if (extension == "gtf") {
+        format = "gtf22"
+    } else {
+        format = "gff3"
+    }
 
     def avail_mem = 4
     if (!task.memory) {
@@ -36,7 +43,7 @@ process SNPEFF_BUILD {
     cd ../../
     mkdir -p snpeff_db/${basename}/
     cd snpeff_db/${basename}/
-    ln -s ../../$gff genes.gff
+    ln -s ../../$gff genes.$extension
 
     cd ../../
     echo "${basename}.genome : ${basename}" > snpeff.config
@@ -46,7 +53,8 @@ process SNPEFF_BUILD {
         build \\
         -config snpeff.config \\
         -dataDir ./snpeff_db \\
-        -gff3 \\
+        -${format} \\
+        $args \\
         -v \\
         ${basename}
 

diff --git a/nextflow.config b/nextflow.config
@@ -22,6 +22,7 @@ params {
     primer_left_suffix         = '_LEFT'
     primer_right_suffix        = '_RIGHT'
     save_reference             = false
+    additional_annot           = null
 
     // Nanopore options
     fastq_dir                  = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -78,6 +78,14 @@
                     "description": "Full path to GFF annotation file.",
                     "fa_icon": "fas fa-file-invoice"
                 },
+                "additional_annot": {
+                    "type": "string",
+                    "format": "file-path",
+                    "mimetype": "text/plain",
+                    "pattern": "^\\S+(\\.gff|\\.gtf)(\\.gz)?$",
+                    "description": "Full path to additional annotation file in GTF or GFF format.",
+                    "fa_icon": "fas fa-file-invoice"
+                },
                 "bowtie2_index": {
                     "type": "string",
                     "format": "path",

diff --git a/subworkflows/local/additional_annot.nf b/subworkflows/local/additional_annot.nf
@@ -0,0 +1,79 @@
+//
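+// Build a snpEff database from an additional annotation file, run snpEff, bgzip, tabix, stats and SnpSift commands, and create the additional variants long table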
+//
+
+include { SNPEFF_BUILD                                                    } from '../../modules/local/snpeff_build'
+include { SNPEFF_ANN                                                      } from '../../modules/local/snpeff_ann'
+include { SNPSIFT_EXTRACTFIELDS                                           } from '../../modules/local/snpsift_extractfields'
+include { VCF_BGZIP_TABIX_STATS                                           } from './vcf_bgzip_tabix_stats'
+include { BCFTOOLS_QUERY                                                  } from '../../modules/nf-core/bcftools/query/main'
+include { MAKE_VARIANTS_LONG_TABLE as MAKE_VARIANTS_LONG_TABLE_ADDITIONAL } from '../../modules/local/make_variants_long_table'
+
+
//
// Annotate variant calls against an additional annotation file and collate
// the results into a separate variants long table.
//
workflow ADDITIONAL_ANNOT {
    take:
    vcf      // channel: [ val(meta), [ vcf ] ]
    tbi      // channel: [ val(meta), [ tbi ] ]
    fasta    // path   : genome.fasta
    annot    // path   : additional_annot
    pangolin // channel: [ val(meta), [ csv ] ]

    main:

    ch_versions = Channel.empty()

    //
    // Make snpEff database from the reference genome and the additional annotation
    //
    SNPEFF_BUILD (
        fasta,
        annot
    )
    ch_snpeff_db     = SNPEFF_BUILD.out.db
    ch_snpeff_config = SNPEFF_BUILD.out.config
    ch_versions      = ch_versions.mix(SNPEFF_BUILD.out.versions)

    //
    // Annotate the variant calls with the database built above
    //
    SNPEFF_ANN (
        vcf,
        ch_snpeff_db,
        ch_snpeff_config,
        fasta
    )
    ch_versions = ch_versions.mix(SNPEFF_ANN.out.versions.first())

    //
    // Compress, index and compute stats for the annotated VCF
    //
    VCF_BGZIP_TABIX_STATS (
        SNPEFF_ANN.out.vcf,
        [],
        [],
        []
    )
    ch_versions = ch_versions.mix(VCF_BGZIP_TABIX_STATS.out.versions)

    //
    // Extract the annotation fields from the annotated VCF
    //
    SNPSIFT_EXTRACTFIELDS (
        VCF_BGZIP_TABIX_STATS.out.vcf
    )
    ch_versions = ch_versions.mix(SNPSIFT_EXTRACTFIELDS.out.versions.first())

    //
    // Query the input (un-annotated) VCF; each VCF is paired with its index
    // by joining on the first tuple element
    //
    BCFTOOLS_QUERY (
        vcf.join(tbi, by: [0]),
        [],
        [],
        []
    )
    ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first())

    //
    // Build the long table from the second tuple element of every emission;
    // the SnpSift and pangolin inputs fall back to an empty list when their
    // channel emits nothing
    //
    MAKE_VARIANTS_LONG_TABLE_ADDITIONAL (
        BCFTOOLS_QUERY.out.txt.collect{it[1]},
        SNPSIFT_EXTRACTFIELDS.out.txt.collect{it[1]}.ifEmpty([]),
        pangolin.collect{it[1]}.ifEmpty([])
    )
    ch_versions = ch_versions.mix(MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.versions)

    emit:
    // NOTE(review): the inputs above carry no meta map - confirm the shape stated below
    long_table  = MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.csv // channel: [ val(meta), [ csv ] ]

    versions    = ch_versions    // channel: [ versions.yml ]
}
diff --git a/workflows/illumina.nf b/workflows/illumina.nf
@@ -35,12 +35,13 @@ def checkPathParamList = [
     params.input, params.fasta, params.gff, params.bowtie2_index,
     params.kraken2_db, params.primer_bed, params.primer_fasta,
     params.blast_db, params.spades_hmm, params.multiqc_config,
-    params.freyja_barcodes, params.freyja_lineages
+    params.freyja_barcodes, params.freyja_lineages, params.additional_annot
 ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-if (params.input)      { ch_input      = file(params.input)      } else { exit 1, 'Input samplesheet file not specified!' }
-if (params.spades_hmm) { ch_spades_hmm = file(params.spades_hmm) } else { ch_spades_hmm = []                              }
// Stage the files given as parameters. The samplesheet is mandatory; the optional
// inputs fall back to an empty list so their variables are always defined.
if (params.input)            { ch_input          = file(params.input)            } else { exit 1, 'Input samplesheet file not specified!' }
if (params.spades_hmm)       { ch_spades_hmm     = file(params.spades_hmm)       } else { ch_spades_hmm = []                              }
if (params.additional_annot) { ch_additional_gtf = file(params.additional_annot) } else { ch_additional_gtf = []                          }
 
 def assemblers = params.assemblers ? params.assemblers.split(',').collect{ it.trim().toLowerCase() } : []
 
@@ -84,6 +85,7 @@ include { VARIANTS_BCFTOOLS       } from '../subworkflows/local/variants_bcftool
 include { CONSENSUS_IVAR          } from '../subworkflows/local/consensus_ivar'
 include { CONSENSUS_BCFTOOLS      } from '../subworkflows/local/consensus_bcftools'
 include { VARIANTS_LONG_TABLE     } from '../subworkflows/local/variants_long_table'
+include { ADDITIONAL_ANNOT        } from '../subworkflows/local/additional_annot'
 include { ASSEMBLY_SPADES         } from '../subworkflows/local/assembly_spades'
 include { ASSEMBLY_UNICYCLER      } from '../subworkflows/local/assembly_unicycler'
 include { ASSEMBLY_MINIA          } from '../subworkflows/local/assembly_minia'
@@ -560,6 +562,21 @@ workflow ILLUMINA {
         ch_versions = ch_versions.mix(VARIANTS_LONG_TABLE.out.versions)
     }
 
+    //
+    // SUBWORKFLOW: Create variants long table report for additional annotation file
+    //
+    if (params.additional_annot) {
+        ADDITIONAL_ANNOT (
+            ch_vcf,
+            ch_tbi,
+            PREPARE_GENOME.out.fasta,
+            ch_additional_gtf,
+            ch_pangolin_multiqc
+
+        )
+        ch_versions = ch_versions.mix(ADDITIONAL_ANNOT.out.versions)
+    }
+
     //
     // MODULE: Primer trimming with Cutadapt
     //