Update to Nextclade v3 and Enhancement of Coverage and Depth Analysis #7

Status: Merged (18 commits, Mar 15, 2024)
3 changes: 3 additions & 0 deletions README.md
@@ -49,6 +49,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> **Clean read data undergo assembly and influenza typing and subtyping. Based on the subtype information, Nextclade variables are gathered.**

* Assembly of influenza gene segments with (`IRMA`) using the built-in FLU module, which also performs influenza typing and H/N subtype classification.
* Calculate the reference length, sequence length, and percent_coverage for segments assembled by IRMA with (`IRMA_SEGMENT_COVERAGE`)
* Calculate the number of mapped reads and mean depth for segments assembled by IRMA with (`SAMTOOLS_MAPPED_READS`); see the sketch after this list.
* QC of consensus assembly (`IRMA_Consensus_QC`).
* Generate IRMA consensus QC report (`IRMA_Consensus_QC_Reportsheet`)
* Annotation of IRMA consensus sequences with (`VADR`)
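As a rough illustration of where the mapped-read and mean-depth numbers come from, here is a minimal sketch using standard `samtools` flags on one segment BAM. The module's actual commands are not shown in this diff, so the file name (`A_MP.bam`) and the exact approach are assumptions.

```python
# Hedged sketch: derive mapped reads and mean depth for one IRMA segment BAM.
import subprocess

bam = "A_MP.bam"  # hypothetical IRMA segment BAM

# Count mapped reads (exclude FLAG 0x4 = unmapped).
mapped = int(
    subprocess.run(
        ["samtools", "view", "-c", "-F", "4", bam],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
)

# Mean depth over positions; -a also reports zero-coverage sites.
lines = subprocess.run(
    ["samtools", "depth", "-a", bam],
    capture_output=True, text=True, check=True,
).stdout.splitlines()
depths = [int(line.split("\t")[2]) for line in lines]
mean_depth = round(sum(depths) / len(depths), 2) if depths else 0.0

print(f"{mapped}\t{mean_depth}")
```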
@@ -69,6 +71,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> **Compiles report sheets from modules and outputs a pipeline summary report tsv file.**

* The (`Summary_Report`) consolidates and merges multiple report sheets into a single comprehensive summary report.
* The (`merged_bam_coverage_results`) merges the gene segment report sheets detailing mapped reads, mean depth, reference length, sequence length, and percent_coverage.

## Quick Start

149 changes: 149 additions & 0 deletions bin/calc_percent_cov.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python

# version = '1.0.0'

# Modified from script https://github.com/CDPHE-bioinformatics/CDPHE-influenza/blob/main/scripts/calc_percent_cov.py

# import python modules
import argparse

import pandas as pd
from Bio import SeqIO

### Expected reference length (nt) for each influenza segment, used as the
### denominator when calculating percent coverage
ref_len_dict = {
"A_MP": 982,
"A_NP": 1497,
"A_NS": 863,
"A_PA": 2151,
"A_PB1": 2274,
"A_PB2": 2280,
"A_HA_H1": 1704,
"A_HA_H10": 1686,
"A_HA_H11": 1698,
"A_HA_H12": 1695,
"A_HA_H13": 1701,
"A_HA_H14": 1707,
"A_HA_H15": 1713,
"A_HA_H16": 1698,
"A_HA_H2": 1689,
"A_HA_H3": 1704,
"A_HA_H4": 1695,
"A_HA_H5": 1707,
"A_HA_H6": 1704,
"A_HA_H7": 1713,
"A_HA_H8": 1701,
"A_HA_H9": 1683,
"A_NA_N1": 1413,
"A_NA_N2": 1410,
"A_NA_N3": 1410,
"A_NA_N4": 1413,
"A_NA_N5": 1422,
"A_NA_N6": 1413,
"A_NA_N7": 1416,
"A_NA_N8": 1413,
"A_NA_N9": 1413,
"B_HA": 1758,
"B_MP": 1139,
"B_NA": 1408,
"B_NP": 1683,
"B_NS": 1034,
"B_PA": 2181,
"B_PB1": 2263,
"B_PB2": 2313,
}


#### FUNCTIONS #####
def getOptions():
    parser = argparse.ArgumentParser(description="Calculate percent coverage for an IRMA-assembled segment fasta.")
    parser.add_argument("fasta_files", help="Path to the segment fasta file")
parser.add_argument("meta_id", help="Meta ID")
options = parser.parse_args()
return options


def get_fasta_file_basename(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
return basename


def get_segment_name(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
return segment_name


def get_gene_name(fasta_file_path):
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
gene_name = segment_name.split("_")[1] # extract gene name
return gene_name


def get_seq_length(fasta_file_path):
# read in fasta file
record = SeqIO.read(fasta_file_path, "fasta")

    # get length of unambiguous bases (A/C/G/T; assumes an uppercase sequence)
seq = record.seq
seq_length = seq.count("A") + seq.count("C") + seq.count("G") + seq.count("T")

return seq_length


def calc_percent_cov(seq_length, ref_len_dict, segment_name):
    # calculate percent coverage based on the expected reference length
expected_length = ref_len_dict[segment_name]
percent_coverage = round(((seq_length / expected_length) * 100), 2)
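    # e.g. seq_length=900 for segment "A_MP" (expected 982) -> 91.65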

return percent_coverage


def create_output(meta_id, segment_name, seq_length, percent_coverage, reference_length):
df = pd.DataFrame()
df["Sample"] = [meta_id]
df["segment_name"] = [segment_name]
df["reference_length"] = [reference_length]
df["seq_length"] = [seq_length]
df["percent_coverage"] = [percent_coverage]

    # Construct the output header
output_header = "Sample\tsegment_name\treference_length\tseq_length\tpercent_coverage"

# Construct the output filename
output_filename = f"{meta_id}.{segment_name}.perc_cov_results.tsv"

# Write the dataframe to the output file
with open(output_filename, "w") as f:
f.write(output_header + "\n")
df.to_csv(f, sep="\t", index=False, header=False)


#### MAIN ####
if __name__ == "__main__":
options = getOptions()
fasta_file_path = options.fasta_files
meta_id = options.meta_id

basename = get_fasta_file_basename(fasta_file_path=fasta_file_path)

segment_name = get_segment_name(fasta_file_path=fasta_file_path)
gene_name = get_gene_name(fasta_file_path=fasta_file_path)

seq_length = get_seq_length(fasta_file_path=fasta_file_path)
percent_coverage = calc_percent_cov(seq_length=seq_length, ref_len_dict=ref_len_dict, segment_name=segment_name)

reference_length = ref_len_dict[segment_name]

create_output(
meta_id=meta_id,
segment_name=segment_name,
seq_length=seq_length,
percent_coverage=percent_coverage,
reference_length=reference_length,
)
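For reference, a hedged sketch of invoking this script and the per-segment TSV it should write. The fasta name follows IRMA's `<segment>.fasta` convention; the sample ID and base count are made up.

```python
# Hypothetical invocation of bin/calc_percent_cov.py (paths are illustrative).
import subprocess

subprocess.run(
    ["python", "bin/calc_percent_cov.py", "A_MP.fasta", "sample01"],
    check=True,
)

# Assuming the consensus holds 900 unambiguous bases,
# sample01.A_MP.perc_cov_results.tsv should then contain (tab-separated):
#
# Sample    segment_name    reference_length    seq_length    percent_coverage
# sample01  A_MP            982                 900           91.65
```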
42 changes: 13 additions & 29 deletions bin/flu_nextclade_variables.py
@@ -4,31 +4,13 @@
import os
from os.path import exists
import argparse
import pandas as pd

# Dictionary containing data for various flu subtypes.
# Each subtype has associated Nextclade dataset, reference, and tag variables.
# Dictionary containing data for various flu subtypes with their associated datasets.
flu_subtypes = {
"H1N1": {
"dataset": "flu_h1n1pdm_ha",
"reference": "CY121680",
"tag": "2023-08-10T12:00:00Z",
},
"H3N2": {
"dataset": "flu_h3n2_ha",
"reference": "CY163680",
"tag": "2023-08-10T12:00:00Z",
},
"Victoria": {
"dataset": "flu_vic_ha",
"reference": "KX058884",
"tag": "2023-08-10T12:00:00Z",
},
"Yamagata": {
"dataset": "flu_yam_ha",
"reference": "JN993010",
"tag": "2022-07-27T12:00:00Z",
},
"H1N1": {"dataset": "flu_h1n1pdm_ha"},
"H3N2": {"dataset": "flu_h3n2_ha"},
"Victoria": {"dataset": "flu_vic_ha"},
"Yamagata": {"dataset": "flu_yam_ha"},
}


@@ -57,12 +39,14 @@ def main():
print(f"Error: Invalid flu subtype '{flu_subtype}' for sample '{args.sample}'")
return

# For each variables (dataset, reference, tag) of the identified subtype, write it to a separate file and print variables.
for item in ["dataset", "reference", "tag"]:
file_path = flu_subtypes[flu_subtype][item]
with open(file_path, "w") as f:
f.write(f"{flu_subtypes[flu_subtype][item]}\n")
print(f" {item}: {flu_subtypes[flu_subtype][item]} (output to {file_path})")
# Prepare the dataset and file_path
dataset = flu_subtypes[flu_subtype]["dataset"]
file_path = dataset

# Write to the file and print information
with open(file_path, "w") as f:
f.write(f"{dataset}\n")
print(f" {dataset}: {dataset} (output to {file_path})")


if __name__ == "__main__":
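With the reference and tag entries gone, downstream steps only need the dataset name; Nextclade v3 resolves the reference and tag from the dataset itself. Below is a hedged sketch of how the written file might be consumed. The pipeline's actual Nextclade invocation is not shown in this diff, so the file name and command wiring here are assumptions (though `nextclade dataset get --name` is a real Nextclade CLI subcommand).

```python
# Hypothetical downstream consumption of the dataset-name file written above.
import subprocess

with open("flu_h1n1pdm_ha") as f:  # the file is named after the dataset
    dataset = f.read().strip()

# Fetch the matching Nextclade v3 dataset; reference and tag are implied.
subprocess.run(
    ["nextclade", "dataset", "get", "--name", dataset, "--output-dir", "nextclade_dataset"],
    check=True,
)
```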
14 changes: 7 additions & 7 deletions bin/irma_consensus_qc.py
@@ -86,13 +86,13 @@ def main(consensus_fasta, meta_id):
writer.writerow(
[
"Sample",
"IRMA consensus ACTG Count",
"IRMA consensus Degenerate Count",
"IRMA consensus N Count",
"IRMA consensus Total Count",
"IRMA consensus Segment Count",
"IRMA consensus N50",
"IRMA consensus GC Content",
"IRMA_consensus_ACTG_count",
"IRMA_consensus_degenerate_count",
"IRMA_consensus_N_count",
"IRMA_consensus_total_count",
"IRMA_consensus_segment_count",
"IRMA_consensus_N50",
"IRMA_consensus_GC_content",
]
)

36 changes: 36 additions & 0 deletions bin/merge_bam_coverage.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python

import pandas as pd
import sys


def main():
if len(sys.argv) != 4:
print("Usage: python script.py <bam_results> <coverage_results> <output_file>")
sys.exit(1)

bam_file_path = sys.argv[1]
coverage_file_path = sys.argv[2]
output_file_path = sys.argv[3]

# Read the TSV files into DataFrames
bam_df = pd.read_csv(bam_file_path, sep="\t")
coverage_df = pd.read_csv(coverage_file_path, sep="\t")

# Make sure 'Sample' is the first column in both DataFrames
bam_df = bam_df[["Sample"] + [col for col in bam_df.columns if col != "Sample"]]
coverage_df = coverage_df[["Sample"] + [col for col in coverage_df.columns if col != "Sample"]]

# Merge the DataFrames on the 'Sample' column
merged_df = pd.merge(bam_df, coverage_df, on="Sample", how="outer")

# Round all numeric columns to 2 decimal places
merged_df = merged_df.round(2)

# Make sure 'Sample' is the first column in the merged DataFrame
merged_df = merged_df[["Sample"] + [col for col in merged_df.columns if col != "Sample"]]

# Write the merged DataFrame to a TSV file
merged_df.to_csv(output_file_path, sep="\t", index=False)


if __name__ == "__main__":
main()
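A toy illustration of the merge this script performs; the values and the bam-sheet column names (`mapped_reads`, `mean_depth`) are made up, while `reference_length`, `seq_length`, and `percent_coverage` match the coverage sheet written by `calc_percent_cov.py`.

```python
# Hedged sketch of merge_bam_coverage.py's merge-and-round behavior.
import pandas as pd

bam_df = pd.DataFrame(
    {"Sample": ["sample01"], "mapped_reads": [48213], "mean_depth": [512.3456]}
)
coverage_df = pd.DataFrame(
    {"Sample": ["sample01"], "reference_length": [982],
     "seq_length": [900], "percent_coverage": [91.65]}
)

merged = pd.merge(bam_df, coverage_df, on="Sample", how="outer").round(2)
print(merged.to_csv(sep="\t", index=False))
# Sample    mapped_reads    mean_depth    reference_length    seq_length    percent_coverage
# sample01  48213           512.35        982                 900           91.65
```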
3 changes: 3 additions & 0 deletions bin/merge_reports.py
@@ -11,6 +11,9 @@ def merge_tsvs(files):
temp_df = pd.read_csv(file, sep="\t")
df = pd.merge(df, temp_df, on="Sample", how="outer")

# Round all numeric columns to 2 decimal places
df = df.round(2)

return df


54 changes: 52 additions & 2 deletions conf/modules.config
@@ -290,13 +290,63 @@ process {
pattern: "*"
]
}
withName: 'IRMA_SEGMENT_COVERAGE' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/irma_segment_coverage/${meta.id}" },
pattern: "*"
]
}
withName: 'MERGE_COVERAGE_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/irma_segment_coverage/" },
pattern: "*"
]
}
withName: 'SAMTOOLS_MAPPED_READS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/samtools_mapped_reads/${meta.id}" },
pattern: "*"
]
}
withName: 'MERGE_BAM_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/samtools_mapped_reads/" },
pattern: "*"
]
}
withName: 'MERGE_BAM_COVERAGE_RESULTS' {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
withName: COMBINED_SUMMARY_REPORT {
ext.args = { "" }
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORT" },
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
@@ -306,7 +356,7 @@
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/SUMMARY_REPORT" },
path: { "${params.outdir}/SUMMARY_REPORTS" },
pattern: "*"
]
}
5 changes: 4 additions & 1 deletion modules/local/irma.nf
@@ -2,14 +2,16 @@ process IRMA {
tag "$meta.id"
label 'process_high'

container 'quay.io/staphb/irma:1.1.3'
container 'quay.io/staphb/irma:1.1.4'

input:
tuple val(meta), path(reads)
val(irma_module)

output:
tuple val(meta), path("${meta.id}/") , emit: irma
tuple val(meta), path("${meta.id}/*.bam") , emit: irma_bam
tuple val(meta), path("${meta.id}/*.fasta") , emit: irma_fasta
tuple val(meta), path("*.irma.consensus.fasta") , optional:true, emit: assembly
tuple val(meta), path("*_LOW_ABUNDANCE.txt") , optional:true, emit: failed_assembly
tuple val(meta), path("*_HA.fasta") , optional:true, emit: HA
@@ -110,3 +112,4 @@ process IRMA {
"""
}

