Update to Nextclade v3 and Enhancement of Coverage and Depth Analysis #7

Status: Merged (18 commits, Mar 15, 2024)
3 changes: 3 additions & 0 deletions README.md
@@ -49,6 +49,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> **Clean read data undergo assembly and influenza typing and subtyping. Based on the subtype information, Nextclade variables are gathered.**

* Assembly of influenza gene segments with (`IRMA`) using the built-in FLU module, which also performs influenza typing and H/N subtype classification.
* Calculate the reference length, sequence length, and percent_coverage for segments assembled by IRMA with (`IRMA_SEGMENT_COVERAGE`)
* Calculate the number of mapped reads and mean depth for segments assembled by IRMA with (`SAMTOOLS_MAPPED_READS`); see the sketch after this list.
* QC of consensus assembly (`IRMA_Consensus_QC`).
* Generate IRMA consensus QC report (`IRMA_Consensus_QC_Reportsheet`)
* Annotation of IRMA consensus sequences with (`VADR`)
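As a rough illustration of where the mapped-read and mean-depth numbers come from, here is a minimal sketch using standard `samtools` flags on one segment BAM. The module's actual commands are not shown in this diff, so the file name (`A_MP.bam`) and the exact approach are assumptions.

```python
# Hedged sketch: derive mapped reads and mean depth for one IRMA segment BAM.
import subprocess

bam = "A_MP.bam"  # hypothetical IRMA segment BAM

# Count mapped reads (exclude FLAG 0x4 = unmapped).
mapped = int(
    subprocess.run(
        ["samtools", "view", "-c", "-F", "4", bam],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
)

# Mean depth over positions; -a also reports zero-coverage sites.
lines = subprocess.run(
    ["samtools", "depth", "-a", bam],
    capture_output=True, text=True, check=True,
).stdout.splitlines()
depths = [int(line.split("\t")[2]) for line in lines]
mean_depth = round(sum(depths) / len(depths), 2) if depths else 0.0

print(f"{mapped}\t{mean_depth}")
```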
@@ -69,6 +71,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> **Compiles report sheets from modules and outputs a pipeline summary report tsv file.**

* The (`Summary_Report`) consolidates and merges multiple report sheets into a single comprehensive summary report.
* The (`merged_bam_coverage_results`) merges the gene segment report sheets detailing mapped reads, mean depth, reference length, sequence length, and percent_coverage.

## Quick Start

149 changes: 149 additions & 0 deletions bin/calc_percent_cov.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python

# version = '1.0.0'

# Modified from script https://github.com/CDPHE-bioinformatics/CDPHE-influenza/blob/main/scripts/calc_percent_cov.py

# import python modules
import argparse

import pandas as pd
from Bio import SeqIO

### Expected reference length (nt) for each influenza segment, used as the
### denominator when calculating percent coverage
ref_len_dict = {
"A_MP": 982,
"A_NP": 1497,
"A_NS": 863,
"A_PA": 2151,
"A_PB1": 2274,
"A_PB2": 2280,
"A_HA_H1": 1704,
"A_HA_H10": 1686,
"A_HA_H11": 1698,
"A_HA_H12": 1695,
"A_HA_H13": 1701,
"A_HA_H14": 1707,
"A_HA_H15": 1713,
"A_HA_H16": 1698,
"A_HA_H2": 1689,
"A_HA_H3": 1704,
"A_HA_H4": 1695,
"A_HA_H5": 1707,
"A_HA_H6": 1704,
"A_HA_H7": 1713,
"A_HA_H8": 1701,
"A_HA_H9": 1683,
"A_NA_N1": 1413,
"A_NA_N2": 1410,
"A_NA_N3": 1410,
"A_NA_N4": 1413,
"A_NA_N5": 1422,
"A_NA_N6": 1413,
"A_NA_N7": 1416,
"A_NA_N8": 1413,
"A_NA_N9": 1413,
"B_HA": 1758,
"B_MP": 1139,
"B_NA": 1408,
"B_NP": 1683,
"B_NS": 1034,
"B_PA": 2181,
"B_PB1": 2263,
"B_PB2": 2313,
}


#### FUNCTIONS #####
def getOptions():
    parser = argparse.ArgumentParser(description="Calculate percent coverage for an IRMA-assembled segment fasta.")
    parser.add_argument("fasta_files", help="Path to the segment fasta file")
parser.add_argument("meta_id", help="Meta ID")
options = parser.parse_args()
return options


def get_fasta_file_basename(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
return basename


def get_segment_name(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
return segment_name


def get_gene_name(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
gene_name = segment_name.split("_")[1] # extract gene name
return gene_name


def get_seq_length(fasta_file_path):
# read in fasta file
record = SeqIO.read(fasta_file_path, "fasta")

    # get length of unambiguous bases (A/C/G/T; assumes an uppercase sequence)
seq = record.seq
seq_length = seq.count("A") + seq.count("C") + seq.count("G") + seq.count("T")

return seq_length


def calc_percent_cov(seq_length, ref_len_dict, segment_name):
    # calculate percent coverage based on the expected reference length
expected_length = ref_len_dict[segment_name]
percent_coverage = round(((seq_length / expected_length) * 100), 2)
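    # e.g. seq_length=900 for segment "A_MP" (expected 982) -> 91.65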

return percent_coverage


def create_output(meta_id, segment_name, seq_length, percent_coverage, reference_length):
df = pd.DataFrame()
df["Sample"] = [meta_id]
df["segment_name"] = [segment_name]
df["reference_length"] = [reference_length]
df["seq_length"] = [seq_length]
df["percent_coverage"] = [percent_coverage]

    # Construct the output header
output_header = "Sample\tsegment_name\treference_length\tseq_length\tpercent_coverage"

# Construct the output filename
output_filename = f"{meta_id}.{segment_name}.perc_cov_results.tsv"

# Write the dataframe to the output file
with open(output_filename, "w") as f:
f.write(output_header + "\n")
df.to_csv(f, sep="\t", index=False, header=False)


#### MAIN ####
if __name__ == "__main__":
options = getOptions()
fasta_file_path = options.fasta_files
meta_id = options.meta_id

basename = get_fasta_file_basename(fasta_file_path=fasta_file_path)

segment_name = get_segment_name(fasta_file_path=fasta_file_path)
gene_name = get_gene_name(fasta_file_path=fasta_file_path)

seq_length = get_seq_length(fasta_file_path=fasta_file_path)
percent_coverage = calc_percent_cov(seq_length=seq_length, ref_len_dict=ref_len_dict, segment_name=segment_name)

reference_length = ref_len_dict[segment_name]

create_output(
meta_id=meta_id,
segment_name=segment_name,
seq_length=seq_length,
percent_coverage=percent_coverage,
reference_length=reference_length,
)
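For reference, a hedged sketch of invoking this script and the per-segment TSV it should write. The fasta name follows IRMA's `<segment>.fasta` convention; the sample ID and base count are made up.

```python
# Hypothetical invocation of bin/calc_percent_cov.py (paths are illustrative).
import subprocess

subprocess.run(
    ["python", "bin/calc_percent_cov.py", "A_MP.fasta", "sample01"],
    check=True,
)

# Assuming the consensus holds 900 unambiguous bases,
# sample01.A_MP.perc_cov_results.tsv should then contain (tab-separated):
#
# Sample    segment_name    reference_length    seq_length    percent_coverage
# sample01  A_MP            982                 900           91.65
```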
42 changes: 13 additions & 29 deletions bin/flu_nextclade_variables.py
@@ -4,31 +4,13 @@
import os
from os.path import exists
import argparse
import pandas as pd

# Dictionary containing data for various flu subtypes.
# Each subtype has associated Nextclade dataset, reference, and tag variables.
# Dictionary containing data for various flu subtypes with their associated datasets.
flu_subtypes = {
"H1N1": {
"dataset": "flu_h1n1pdm_ha",
"reference": "CY121680",
"tag": "2023-08-10T12:00:00Z",
},
"H3N2": {
"dataset": "flu_h3n2_ha",
"reference": "CY163680",
"tag": "2023-08-10T12:00:00Z",
},
"Victoria": {
"dataset": "flu_vic_ha",
"reference": "KX058884",
"tag": "2023-08-10T12:00:00Z",
},
"Yamagata": {
"dataset": "flu_yam_ha",
"reference": "JN993010",
"tag": "2022-07-27T12:00:00Z",
},
"H1N1": {"dataset": "flu_h1n1pdm_ha"},
"H3N2": {"dataset": "flu_h3n2_ha"},
"Victoria": {"dataset": "flu_vic_ha"},
"Yamagata": {"dataset": "flu_yam_ha"},
}


@@ -57,12 +39,14 @@ def main():
print(f"Error: Invalid flu subtype '{flu_subtype}' for sample '{args.sample}'")
return

# For each variables (dataset, reference, tag) of the identified subtype, write it to a separate file and print variables.
for item in ["dataset", "reference", "tag"]:
file_path = flu_subtypes[flu_subtype][item]
with open(file_path, "w") as f:
f.write(f"{flu_subtypes[flu_subtype][item]}\n")
print(f" {item}: {flu_subtypes[flu_subtype][item]} (output to {file_path})")
# Prepare the dataset and file_path
dataset = flu_subtypes[flu_subtype]["dataset"]
file_path = dataset

# Write to the file and print information
with open(file_path, "w") as f:
f.write(f"{dataset}\n")
print(f" {dataset}: {dataset} (output to {file_path})")


if __name__ == "__main__":
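With the reference and tag entries gone, downstream steps only need the dataset name; Nextclade v3 resolves the reference and tag from the dataset itself. Below is a hedged sketch of how the written file might be consumed. The pipeline's actual Nextclade invocation is not shown in this diff, so the file name and command wiring here are assumptions (though `nextclade dataset get --name` is a real Nextclade CLI subcommand).

```python
# Hypothetical downstream consumption of the dataset-name file written above.
import subprocess

with open("flu_h1n1pdm_ha") as f:  # the file is named after the dataset
    dataset = f.read().strip()

# Fetch the matching Nextclade v3 dataset; reference and tag are implied.
subprocess.run(
    ["nextclade", "dataset", "get", "--name", dataset, "--output-dir", "nextclade_dataset"],
    check=True,
)
```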
14 changes: 7 additions & 7 deletions bin/irma_consensus_qc.py
@@ -86,13 +86,13 @@ def main(consensus_fasta, meta_id):
writer.writerow(
[
"Sample",
"IRMA consensus ACTG Count",
"IRMA consensus Degenerate Count",
"IRMA consensus N Count",
"IRMA consensus Total Count",
"IRMA consensus Segment Count",
"IRMA consensus N50",
"IRMA consensus GC Content",
"IRMA_consensus_ACTG_count",
"IRMA_consensus_degenerate_count",
"IRMA_consensus_N_count",
"IRMA_consensus_total_count",
"IRMA_consensus_segment_count",
"IRMA_consensus_N50",
"IRMA_consensus_GC_content",
]
)

36 changes: 36 additions & 0 deletions bin/merge_bam_coverage.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import pandas as pd
import sys


def main():
if len(sys.argv) != 4:
print("Usage: python script.py <bam_results> <coverage_results> <output_file>")
sys.exit(1)

bam_file_path = sys.argv[1]
coverage_file_path = sys.argv[2]
output_file_path = sys.argv[3]

# Read the TSV files into DataFrames
bam_df = pd.read_csv(bam_file_path, sep="\t")
coverage_df = pd.read_csv(coverage_file_path, sep="\t")

# Make sure 'Sample' is the first column in both DataFrames
bam_df = bam_df[["Sample"] + [col for col in bam_df.columns if col != "Sample"]]
coverage_df = coverage_df[["Sample"] + [col for col in coverage_df.columns if col != "Sample"]]

# Merge the DataFrames on the 'Sample' column
merged_df = pd.merge(bam_df, coverage_df, on="Sample", how="outer")

# Round all numeric columns to 2 decimal places
merged_df = merged_df.round(2)

# Make sure 'Sample' is the first column in the merged DataFrame
merged_df = merged_df[["Sample"] + [col for col in merged_df.columns if col != "Sample"]]

# Write the merged DataFrame to a TSV file
merged_df.to_csv(output_file_path, sep="\t", index=False)


if __name__ == "__main__":
main()
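A toy illustration of the merge this script performs; the values and the bam-sheet column names (`mapped_reads`, `mean_depth`) are made up, while `reference_length`, `seq_length`, and `percent_coverage` match the coverage sheet written by `calc_percent_cov.py`.

```python
# Hedged sketch of merge_bam_coverage.py's merge-and-round behavior.
import pandas as pd

bam_df = pd.DataFrame(
    {"Sample": ["sample01"], "mapped_reads": [48213], "mean_depth": [512.3456]}
)
coverage_df = pd.DataFrame(
    {"Sample": ["sample01"], "reference_length": [982],
     "seq_length": [900], "percent_coverage": [91.65]}
)

merged = pd.merge(bam_df, coverage_df, on="Sample", how="outer").round(2)
print(merged.to_csv(sep="\t", index=False))
# Sample    mapped_reads    mean_depth    reference_length    seq_length    percent_coverage
# sample01  48213           512.35        982                 900           91.65
```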
3 changes: 3 additions & 0 deletions bin/merge_reports.py
@@ -11,6 +11,9 @@ def merge_tsvs(files):
temp_df = pd.read_csv(file, sep="\t")
df = pd.merge(df, temp_df, on="Sample", how="outer")

# Round all numeric columns to 2 decimal places
df = df.round(2)

return df


54 changes: 52 additions & 2 deletions conf/modules.config
@@ -290,13 +290,63 @@ process {
pattern: "*"
]
}
withName: 'IRMA_SEGMENT_COVERAGE' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/irma_segment_coverage/${meta.id}" },
pattern: "*"
]
}
withName: 'MERGE_COVERAGE_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/irma_segment_coverage/" },
pattern: "*"
]
}
withName: 'SAMTOOLS_MAPPED_READS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/samtools_mapped_reads/${meta.id}" },
pattern: "*"
]
}
withName: 'MERGE_BAM_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/samtools_mapped_reads/" },
pattern: "*"
]
}
withName: 'MERGE_BAM_COVERAGE_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
withName: COMBINED_SUMMARY_REPORT {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORT" },
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
@@ -306,7 +356,7 @@
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORT" },
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
5 changes: 4 additions & 1 deletion modules/local/irma.nf
@@ -2,14 +2,16 @@ process IRMA {
tag "$meta.id"
label 'process_high'

container 'quay.io/staphb/irma:1.1.3'
container 'quay.io/staphb/irma:1.1.4'

input:
tuple val(meta), path(reads)
val(irma_module)

output:
tuple val(meta), path("${meta.id}/") , emit: irma
tuple val(meta), path("${meta.id}/*.bam") , emit: irma_bam
tuple val(meta), path("${meta.id}/*.fasta") , emit: irma_fasta
tuple val(meta), path("*.irma.consensus.fasta") , optional:true, emit: assembly
tuple val(meta), path("*_LOW_ABUNDANCE.txt") , optional:true, emit: failed_assembly
tuple val(meta), path("*_HA.fasta") , optional:true, emit: HA
@@ -110,3 +112,4 @@ process IRMA {
"""
}

