Skip to content

Commit

Permalink
reformatted merge_bam_coverage.py and calc_percent_cov.py with python…
Browse files Browse the repository at this point in the history
… black
  • Loading branch information
tives82 committed Mar 14, 2024
1 parent 494f854 commit 2fa0e60
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 50 deletions.
122 changes: 76 additions & 46 deletions bin/calc_percent_cov.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

#version = '1.0.0'
# version = '1.0.0'

# Modified from script https://github.com/CDPHE-bioinformatics/CDPHE-influenza/blob/main/scripts/calc_percent_cov.py

Expand All @@ -15,13 +15,48 @@
import subprocess

### Segment length dictionary
ref_len_dict = {'A_MP': 982,'A_NP': 1497,'A_NS': 863,'A_PA': 2151,'A_PB1': 2274,'A_PB2': 2280,
'A_HA_H1': 1704,'A_HA_H10': 1686,'A_HA_H11': 1698,'A_HA_H12': 1695,'A_HA_H13': 1701,'A_HA_H14': 1707,
'A_HA_H15': 1713,'A_HA_H16': 1698,'A_HA_H2': 1689,'A_HA_H3': 1704,'A_HA_H4': 1695,'A_HA_H5': 1707,
'A_HA_H6': 1704,'A_HA_H7': 1713,'A_HA_H8': 1701,'A_HA_H9': 1683,'A_NA_N1': 1413,'A_NA_N2': 1410,
'A_NA_N3': 1410,'A_NA_N4': 1413,'A_NA_N5': 1422,'A_NA_N6': 1413,'A_NA_N7': 1416,
'A_NA_N8': 1413,'A_NA_N9': 1413,'B_HA': 1758,'B_MP': 1139,'B_NA': 1408,'B_NP': 1683,
'B_NS': 1034,'B_PA': 2181,'B_PB1': 2263,'B_PB2': 2313}
ref_len_dict = {
"A_MP": 982,
"A_NP": 1497,
"A_NS": 863,
"A_PA": 2151,
"A_PB1": 2274,
"A_PB2": 2280,
"A_HA_H1": 1704,
"A_HA_H10": 1686,
"A_HA_H11": 1698,
"A_HA_H12": 1695,
"A_HA_H13": 1701,
"A_HA_H14": 1707,
"A_HA_H15": 1713,
"A_HA_H16": 1698,
"A_HA_H2": 1689,
"A_HA_H3": 1704,
"A_HA_H4": 1695,
"A_HA_H5": 1707,
"A_HA_H6": 1704,
"A_HA_H7": 1713,
"A_HA_H8": 1701,
"A_HA_H9": 1683,
"A_NA_N1": 1413,
"A_NA_N2": 1410,
"A_NA_N3": 1410,
"A_NA_N4": 1413,
"A_NA_N5": 1422,
"A_NA_N6": 1413,
"A_NA_N7": 1416,
"A_NA_N8": 1413,
"A_NA_N9": 1413,
"B_HA": 1758,
"B_MP": 1139,
"B_NA": 1408,
"B_NP": 1683,
"B_NS": 1034,
"B_PA": 2181,
"B_PB1": 2263,
"B_PB2": 2313,
}


#### FUNCTIONS #####
def getOptions():
Expand All @@ -33,44 +68,49 @@ def getOptions():


def get_fasta_file_basename(fasta_file_path):
basename = fasta_file_path.split('/')[-1] # strip directories
basename = fasta_file_path.split("/")[-1] # strip directories
return basename


def get_segment_name(fasta_file_path):
basename = fasta_file_path.split('/')[-1] # strip directories
segment_name = basename.split('.')[0] # remove file extension
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
return segment_name


def get_gene_name(fasta_file_path):
basename = fasta_file_path.split('/')[-1] # strip directories
segment_name = basename.split('.')[0] # remove file extension
gene_name = segment_name.split('_')[1] # extract gene name
basename = fasta_file_path.split("/")[-1] # strip directories
segment_name = basename.split(".")[0] # remove file extension
gene_name = segment_name.split("_")[1] # extract gene name
return gene_name


def get_seq_length(fasta_file_path):
# read in fasta file
record = SeqIO.read(fasta_file_path, 'fasta')
record = SeqIO.read(fasta_file_path, "fasta")

# get length of non ambigous bases
seq = record.seq
seq_length = (seq.count('A') + seq.count('C') + seq.count('G') + seq.count('T'))
seq_length = seq.count("A") + seq.count("C") + seq.count("G") + seq.count("T")

return seq_length


def calc_percent_cov(seq_length, ref_len_dict, segment_name):
# calcuate per cov based on expected ref length
expected_length = ref_len_dict[segment_name]
percent_coverage = round(((seq_length/expected_length)*100), 2)
percent_coverage = round(((seq_length / expected_length) * 100), 2)

return percent_coverage


def create_output(meta_id, segment_name, seq_length, percent_coverage, reference_length):
df = pd.DataFrame()
df['Sample'] = [meta_id]
df['segment_name'] = [segment_name]
df['reference_length'] = [reference_length]
df['seq_length'] = [seq_length]
df['percent_coverage'] = [percent_coverage]
df["Sample"] = [meta_id]
df["segment_name"] = [segment_name]
df["reference_length"] = [reference_length]
df["seq_length"] = [seq_length]
df["percent_coverage"] = [percent_coverage]

# Construct the output filename header
output_header = "Sample\tsegment_name\treference_length\tseq_length\tpercent_coverage"
Expand All @@ -79,41 +119,31 @@ def create_output(meta_id, segment_name, seq_length, percent_coverage, reference
output_filename = f"{meta_id}.{segment_name}.perc_cov_results.tsv"

# Write the dataframe to the output file
with open(output_filename, 'w') as f:
f.write(output_header + '\n')
df.to_csv(f, sep='\t', index=False, header=False)
with open(output_filename, "w") as f:
f.write(output_header + "\n")
df.to_csv(f, sep="\t", index=False, header=False)

#### MAIN ####
if __name__ == '__main__':

#### MAIN ####
if __name__ == "__main__":
options = getOptions()
fasta_file_path = options.fasta_files
meta_id = options.meta_id

basename = get_fasta_file_basename(fasta_file_path = fasta_file_path)
basename = get_fasta_file_basename(fasta_file_path=fasta_file_path)

segment_name = get_segment_name(fasta_file_path=fasta_file_path)
gene_name = get_gene_name(fasta_file_path=fasta_file_path)

seq_length = get_seq_length(fasta_file_path=fasta_file_path)
percent_coverage = calc_percent_cov(seq_length=seq_length,
ref_len_dict=ref_len_dict,
segment_name=segment_name)
percent_coverage = calc_percent_cov(seq_length=seq_length, ref_len_dict=ref_len_dict, segment_name=segment_name)

reference_length = ref_len_dict[segment_name]

create_output(meta_id=meta_id,
segment_name=segment_name,
seq_length=seq_length,
percent_coverage=percent_coverage,
reference_length=reference_length)










create_output(
meta_id=meta_id,
segment_name=segment_name,
seq_length=seq_length,
percent_coverage=percent_coverage,
reference_length=reference_length,
)
10 changes: 6 additions & 4 deletions bin/merge_bam_coverage.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import pandas as pd
import sys


def main():
if len(sys.argv) != 4:
print("Usage: python script.py <bam_results> <coverage_results> <output_file>")
sys.exit(1)

bam_file_path = sys.argv[1]
coverage_file_path = sys.argv[2]
output_file_path = sys.argv[3]
Expand All @@ -15,8 +16,8 @@ def main():
coverage_df = pd.read_csv(coverage_file_path, sep="\t")

# Make sure 'Sample' is the first column in both DataFrames
bam_df = bam_df[['Sample'] + [col for col in bam_df.columns if col != 'Sample']]
coverage_df = coverage_df[['Sample'] + [col for col in coverage_df.columns if col != 'Sample']]
bam_df = bam_df[["Sample"] + [col for col in bam_df.columns if col != "Sample"]]
coverage_df = coverage_df[["Sample"] + [col for col in coverage_df.columns if col != "Sample"]]

# Merge the DataFrames on the 'Sample' column
merged_df = pd.merge(bam_df, coverage_df, on="Sample", how="outer")
Expand All @@ -25,10 +26,11 @@ def main():
merged_df = merged_df.round(2)

# Make sure 'Sample' is the first column in the merged DataFrame
merged_df = merged_df[['Sample'] + [col for col in merged_df.columns if col != 'Sample']]
merged_df = merged_df[["Sample"] + [col for col in merged_df.columns if col != "Sample"]]

# Write the merged DataFrame to a TSV file
merged_df.to_csv(output_file_path, sep="\t", index=False)


if __name__ == "__main__":
main()

0 comments on commit 2fa0e60

Please sign in to comment.