From 94d6d1f2a0f1421ca18bfdfd05fd811825a18f8f Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:26:10 +0200 Subject: [PATCH 01/11] Added additional_annot param --- nextflow.config | 1 + nextflow_schema.json | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/nextflow.config b/nextflow.config index 812a2dfd..4e9f0abf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,7 @@ params { primer_left_suffix = '_LEFT' primer_right_suffix = '_RIGHT' save_reference = false + additional_annot = false // Nanopore options fastq_dir = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 083172b5..c4500c72 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -78,6 +78,14 @@ "description": "Full path to GFF annotation file.", "fa_icon": "fas fa-file-invoice" }, + "additional_annot": { + "type": "string", + "format": "file-path", + "mimetype": "text/plain", + "pattern": "^\\S+(\\.gff|\\.gtf)(\\.gz)?$", + "description": "Full path to additional annotation file in GTF or GFF format.", + "fa_icon": "fas fa-file-invoice" + }, "bowtie2_index": { "type": "string", "format": "path", From b7cc3161417e76d9bb5a7bd4a97ddd64f7f195b7 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:49:20 +0200 Subject: [PATCH 02/11] Added param additional_annot channel --- workflows/illumina.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index e256043d..0ae219b4 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -35,12 +35,13 @@ def checkPathParamList = [ params.input, params.fasta, params.gff, params.bowtie2_index, params.kraken2_db, params.primer_bed, params.primer_fasta, params.blast_db, params.spades_hmm, params.multiqc_config, - params.freyja_barcodes, params.freyja_lineages + params.freyja_barcodes, params.freyja_lineages, params.additional_annot ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet file not specified!' } -if (params.spades_hmm) { ch_spades_hmm = file(params.spades_hmm) } else { ch_spades_hmm = [] } +if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet file not specified!' } +if (params.spades_hmm) { ch_spades_hmm = file(params.spades_hmm) } else { ch_spades_hmm = [] } +if (params.additional_annot) { ch_additional_gtf = file(params.additional_annot) } else { additional_annot = [] } def assemblers = params.assemblers ? params.assemblers.split(',').collect{ it.trim().toLowerCase() } : [] From 2fdd8aa450086316b976e69e432d3f57d6300ffd Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:50:30 +0200 Subject: [PATCH 03/11] Added ADDITIONAL_ANNOT process --- workflows/illumina.nf | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/workflows/illumina.nf b/workflows/illumina.nf index 0ae219b4..b38fe9b6 100644 --- a/workflows/illumina.nf +++ b/workflows/illumina.nf @@ -85,6 +85,7 @@ include { VARIANTS_BCFTOOLS } from '../subworkflows/local/variants_bcftool include { CONSENSUS_IVAR } from '../subworkflows/local/consensus_ivar' include { CONSENSUS_BCFTOOLS } from '../subworkflows/local/consensus_bcftools' include { VARIANTS_LONG_TABLE } from '../subworkflows/local/variants_long_table' +include { ADDITIONAL_ANNOT } from '../subworkflows/local/additional_annot' include { ASSEMBLY_SPADES } from '../subworkflows/local/assembly_spades' include { ASSEMBLY_UNICYCLER } from '../subworkflows/local/assembly_unicycler' include { ASSEMBLY_MINIA } from '../subworkflows/local/assembly_minia' @@ -559,6 +560,21 @@ workflow ILLUMINA { ch_versions = ch_versions.mix(VARIANTS_LONG_TABLE.out.versions) } + // + // SUBWORKFLOW: Create variants long table report for additional annotation file + // + if (params.additional_annot) { + ADDITIONAL_ANNOT ( + ch_vcf, + ch_tbi, + PREPARE_GENOME.out.fasta, + ch_additional_gtf, + ch_pangolin_multiqc + + ) + ch_versions = ch_versions.mix(ADDITIONAL_ANNOT.out.versions) + } + // // MODULE: Primer trimming with Cutadapt // From 97401619fb26c6e5dd9cb9ad42988d107f9d3157 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:50:53 +0200 Subject: [PATCH 04/11] Added additional_annot subworkflow --- subworkflows/local/additional_annot.nf | 79 ++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 subworkflows/local/additional_annot.nf diff --git a/subworkflows/local/additional_annot.nf b/subworkflows/local/additional_annot.nf new file mode 100644 index 00000000..cde476e2 --- /dev/null +++ b/subworkflows/local/additional_annot.nf @@ -0,0 +1,79 @@ +// +// Run snpEff, bgzip, tabix, stats and SnpSift commands +// + +include { SNPEFF_BUILD } from '../../modules/local/snpeff_build' +include { SNPEFF_ANN } from '../../modules/local/snpeff_ann' +include { SNPSIFT_EXTRACTFIELDS } from '../../modules/local/snpsift_extractfields' +include { VCF_BGZIP_TABIX_STATS } from './vcf_bgzip_tabix_stats' +include { BCFTOOLS_QUERY } from '../../modules/nf-core/bcftools/query/main' +include { MAKE_VARIANTS_LONG_TABLE as MAKE_VARIANTS_LONG_TABLE_ADDITIONAL } from '../../modules/local/make_variants_long_table' + + +workflow ADDITIONAL_ANNOT { + take: + vcf // channel: [ val(meta), [ vcf ] ] + tbi // channel: [ val(meta), [ tbi ] ] + fasta // path : genome.fasta + annot // path : additional_annot + pangolin // channel: [ val(meta), [ csv ] ] + + main: + + ch_versions = Channel.empty() + + // + // Make snpEff database + // + ch_snpeff_db = Channel.empty() + ch_snpeff_config = Channel.empty() + + SNPEFF_BUILD ( + fasta, + annot + ) + ch_snpeff_db = SNPEFF_BUILD.out.db + ch_snpeff_config = SNPEFF_BUILD.out.config + ch_versions = ch_versions.mix(SNPEFF_BUILD.out.versions) + + SNPEFF_ANN ( + vcf, + ch_snpeff_db, + ch_snpeff_config, + fasta + ) + ch_versions = ch_versions.mix(SNPEFF_ANN.out.versions.first()) + + VCF_BGZIP_TABIX_STATS ( + SNPEFF_ANN.out.vcf, + [], + [], + [] + ) + ch_versions = ch_versions.mix(VCF_BGZIP_TABIX_STATS.out.versions) + + SNPSIFT_EXTRACTFIELDS ( + VCF_BGZIP_TABIX_STATS.out.vcf + ) + ch_versions = ch_versions.mix(SNPSIFT_EXTRACTFIELDS.out.versions.first()) + + BCFTOOLS_QUERY ( + vcf.join(tbi, by: [0]), + [], + [], + [] + ) + ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) + + MAKE_VARIANTS_LONG_TABLE_ADDITIONAL ( + BCFTOOLS_QUERY.out.txt.collect{it[1]}, + SNPSIFT_EXTRACTFIELDS.out.txt.collect{it[1]}.ifEmpty([]), + pangolin.collect{it[1]}.ifEmpty([]) + ) + ch_versions = ch_versions.mix(MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.versions) + + emit: + long_table = MAKE_VARIANTS_LONG_TABLE_ADDITIONAL.out.csv // channel: [ val(meta), [ csv ] ] + + versions = ch_versions // channel: [ versions.yml ] +} From cafa3b491ee48f6db24e44a00046952b250075f8 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:51:20 +0200 Subject: [PATCH 05/11] Added option to detec annotation format in snpeff build --- modules/local/snpeff_build.nf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modules/local/snpeff_build.nf b/modules/local/snpeff_build.nf index e1ab367f..b9c875ba 100644 --- a/modules/local/snpeff_build.nf +++ b/modules/local/snpeff_build.nf @@ -20,7 +20,14 @@ process SNPEFF_BUILD { task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: '' def basename = fasta.baseName + def extension = gff.getExtension() + if (extension == "gtf") { + format = "gtf22" + } else { + format = "gff3" + } def avail_mem = 4 if (!task.memory) { @@ -36,7 +43,7 @@ process SNPEFF_BUILD { cd ../../ mkdir -p snpeff_db/${basename}/ cd snpeff_db/${basename}/ - ln -s ../../$gff genes.gff + ln -s ../../$gff genes.$extension cd ../../ echo "${basename}.genome : ${basename}" > snpeff.config @@ -46,7 +53,8 @@ process SNPEFF_BUILD { build \\ -config snpeff.config \\ -dataDir ./snpeff_db \\ - -gff3 \\ + -${format} \\ + $args \\ -v \\ ${basename} From c750f3810721e832d3c066a2ce6007d0b3ab8030 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:53:08 +0200 Subject: [PATCH 06/11] variants long table more than one annotation, remove upstream and downstream --- bin/make_variants_long_table.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py index f0bad221..19e1d0ed 100755 --- a/bin/make_variants_long_table.py +++ b/bin/make_variants_long_table.py @@ -236,11 +236,7 @@ def snpsift_to_table(snpsift_file): new_colnames = [x.replace("ANN[*].", "") for x in old_colnames] table.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True) table = table.loc[:, ["CHROM", "POS", "REF", "ALT", "GENE", "EFFECT", "HGVS_C", "HGVS_P"]] - - ## Split by comma and get first value in cols = ['ALT','GENE','EFFECT','HGVS_C','HGVS_P'] - for i in range(len(table)): - for j in range(3, 8): - table.iloc[i, j] = str(table.iloc[i, j]).split(",")[0] + table = one_effect_per_line(table) ## Amino acid substitution aa = [] @@ -251,6 +247,28 @@ def snpsift_to_table(snpsift_file): return table +def one_effect_per_line(table): + one_effect_per_line_table = pd.DataFrame() + for i in range(len(table)): + gene_list = table.iloc[i,4].split(",") + effect_list = table.iloc[i,5].split(",") + hgvs_c_list = table.iloc[i,6].split(",") + hgvs_p_list = table.iloc[i,7].split(",") + + count = 0 + for j in range(len(gene_list)): + if "upstream" in effect_list[j] or "downstream" in effect_list[j]: + count += 1 + for j in range(len(gene_list)): + if len(effect_list) == count: + row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[0], 'EFFECT':effect_list[0], 'HGVS_C':hgvs_c_list[0], 'HGVS_P':hgvs_p_list[0],} + one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True) + else: + if not "upstream" in effect_list[j] and not "downstream" in effect_list[j]: + row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[j], 'EFFECT':effect_list[j], 'HGVS_C':hgvs_c_list[j], 'HGVS_P':hgvs_p_list[j],} + one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True) + return(one_effect_per_line_table) + def main(args=None): args = parser_args(args) From 15d99197654e5b7eece80352a2ff05c3742c76ab Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 12:53:23 +0200 Subject: [PATCH 07/11] Rename output file for additional annotation variants long table --- conf/modules_illumina.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config index 2cc48bea..4c1c4c22 100644 --- a/conf/modules_illumina.config +++ b/conf/modules_illumina.config @@ -564,6 +564,14 @@ if (!params.skip_variants) { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'MAKE_VARIANTS_LONG_TABLE_ADDITIONAL' { + ext.args = "--variant_caller ${variant_caller} --output_file 'additional_variants_long_table.csv'" + publishDir = [ + path: { "${params.outdir}/variants/${variant_caller}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } } } } From 6d834f2589abee5a56d5a356ce0db8e7ba33444e Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 13:22:05 +0200 Subject: [PATCH 08/11] Fixed black linting --- bin/make_variants_long_table.py | 43 +++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/bin/make_variants_long_table.py b/bin/make_variants_long_table.py index 19e1d0ed..dbcfc277 100755 --- a/bin/make_variants_long_table.py +++ b/bin/make_variants_long_table.py @@ -247,27 +247,50 @@ def snpsift_to_table(snpsift_file): return table + def one_effect_per_line(table): one_effect_per_line_table = pd.DataFrame() for i in range(len(table)): - gene_list = table.iloc[i,4].split(",") - effect_list = table.iloc[i,5].split(",") - hgvs_c_list = table.iloc[i,6].split(",") - hgvs_p_list = table.iloc[i,7].split(",") - + gene_list = table.iloc[i, 4].split(",") + effect_list = table.iloc[i, 5].split(",") + hgvs_c_list = table.iloc[i, 6].split(",") + hgvs_p_list = table.iloc[i, 7].split(",") + count = 0 for j in range(len(gene_list)): if "upstream" in effect_list[j] or "downstream" in effect_list[j]: count += 1 for j in range(len(gene_list)): if len(effect_list) == count: - row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[0], 'EFFECT':effect_list[0], 'HGVS_C':hgvs_c_list[0], 'HGVS_P':hgvs_p_list[0],} - one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True) + row = { + "CHROM": table.iloc[i, 0], + "POS": table.iloc[i, 1], + "REF": table.iloc[i, 2], + "ALT": table.iloc[i, 3], + "GENE": gene_list[0], + "EFFECT": effect_list[0], + "HGVS_C": hgvs_c_list[0], + "HGVS_P": hgvs_p_list[0], + } + one_effect_per_line_table = pd.concat( + [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True + ) else: if not "upstream" in effect_list[j] and not "downstream" in effect_list[j]: - row = {'CHROM':table.iloc[i,0], 'POS':table.iloc[i,1], 'REF':table.iloc[i,2], 'ALT':table.iloc[i,3], 'GENE':gene_list[j], 'EFFECT':effect_list[j], 'HGVS_C':hgvs_c_list[j], 'HGVS_P':hgvs_p_list[j],} - one_effect_per_line_table = pd.concat([one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True) - return(one_effect_per_line_table) + row = { + "CHROM": table.iloc[i, 0], + "POS": table.iloc[i, 1], + "REF": table.iloc[i, 2], + "ALT": table.iloc[i, 3], + "GENE": gene_list[j], + "EFFECT": effect_list[j], + "HGVS_C": hgvs_c_list[j], + "HGVS_P": hgvs_p_list[j], + } + one_effect_per_line_table = pd.concat( + [one_effect_per_line_table, pd.DataFrame([row])], ignore_index=True + ) + return one_effect_per_line_table def main(args=None): From 90a9f93fa14a537f4b7b6bebdc78998051d3a5f0 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 13:24:05 +0200 Subject: [PATCH 09/11] changed default value for additional_annot path to null --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 4e9f0abf..561769ff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,7 +22,7 @@ params { primer_left_suffix = '_LEFT' primer_right_suffix = '_RIGHT' save_reference = false - additional_annot = false + additional_annot = null // Nanopore options fastq_dir = null From 27f4614e78e965e199b7d6685153c11be7eaf027 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 15:36:53 +0200 Subject: [PATCH 10/11] Added changes to changelog --- CHANGELOG.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52ae7f0e..0d3d42c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,16 +19,18 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [[#299](https://github.com/nf-core/viralrecon/issues/299)] - Add the freyja pipeline as a subworkflow - [[PR #387](https://github.com/nf-core/viralrecon/pull/387)] - Software closes gracefully when encountering an error - [[PR #395](https://github.com/nf-core/viralrecon/pull/395)] - Remove minia from default assemblers because it is unreliable +- [[PR #401](https://github.com/nf-core/viralrecon/pull/401)] - Added option to add a custom annotation ### Parameters -| Old parameter | New parameter | -| ------------------- | ------------- | -| `--skip_freyja` | | -| `--freyja_repeats` | | -| `--freyja_db_name` | | -| `--freyja_barcodes` | | -| `--freyja_lineages` | | +| Old parameter | New parameter | +| ------------------- | -------------------- | +| `--skip_freyja` | | +| `--freyja_repeats` | | +| `--freyja_db_name` | | +| `--freyja_barcodes` | | +| `--freyja_lineages` | | +| | `--additional_annot` | > **NB:** Parameter has been **updated** if both old and new parameter information is present. > **NB:** Parameter has been **added** if just the new parameter information is present. From cbab17b2a8ce7d7dfebd2a97e0e385a621814b67 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 23 Oct 2023 15:54:12 +0200 Subject: [PATCH 11/11] added description for additional_variants_long_table --- docs/output.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/output.md b/docs/output.md index 536a20d3..0287e986 100644 --- a/docs/output.md +++ b/docs/output.md @@ -289,6 +289,7 @@ As described in the documentation, [ASCIIGenome](https://asciigenome.readthedocs - `/` - `variants_long_table.csv`: Long format table collating per-sample information for individual variants, functional effect prediction and lineage analysis. + - `additional_variants_long_table.csv`: Long format table similar to `variants_long_table.csv` for additional annotation file with overlapping annotation features. **NB:** The value of `` in the output directory name above is determined by the `--artic_minion_caller` parameter (Default: 'nanopolish').