Skip to content

Commit

Permalink
Dump bam stats (#18)
Browse files Browse the repository at this point in the history
* added bam stat dump and fixed minor coverage issues

* updated version

* small fix

* deleted useless line and added % to recovery
  • Loading branch information
jonas-fuchs authored Aug 6, 2024
1 parent e8a9ef1 commit cb4d1f4
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 35 deletions.
2 changes: 1 addition & 1 deletion bamdash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""interactively visualize coverage and tracks"""
_program = "bamdash"
__version__ = "0.2.4"
__version__ = "0.3"
32 changes: 17 additions & 15 deletions bamdash/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def main(sysargs=sys.argv[1:]):
args = get_args(sysargs)

# define subplot number, track heights and parse data
coverage_df, title = data.bam_to_coverage_df(args.bam, args.reference, args.coverage, args.quality_threshold)
coverage_df, title, stat_dict = data.bam_to_coverage_df(args.bam, args.reference, args.coverage, args.quality_threshold)
track_heights = [1]
track_data = []
# extract data and check if ref was found
Expand Down Expand Up @@ -299,17 +299,19 @@ def main(sysargs=sys.argv[1:]):

# dump track data
vcf_track_count, bed_track_count, gb_track_count = 0, 0, 0
if args.dump and track_data:
for track in track_data:
if track[1] == "vcf":
track[0].to_csv(f"{args.reference}_vcf_data_{vcf_track_count}.tabular", sep="\t", header=True, index=False)
vcf_track_count += 1
elif track[1] == "bed":
bed_df = pd.DataFrame.from_dict(track[0]["bed annotations"], orient="index")
bed_df.drop("track", axis=1, inplace=True)
bed_df.to_csv(f"{args.reference}_bed_data_{bed_track_count}.tabular", sep="\t", header=True, index=False)
bed_track_count += 1
elif track[1] == "gb":
with open(f"{args.reference}_gb_data_{gb_track_count}.json", "w") as fp:
json.dump(track[0], fp)
gb_track_count += 1
if args.dump:
pd.DataFrame.from_dict(stat_dict, orient="index").to_csv("bam_stats.tabular", sep="\t", header=False, index=True)
if track_data:
for track in track_data:
if track[1] == "vcf":
track[0].to_csv(f"{args.reference}_vcf_data_{vcf_track_count}.tabular", sep="\t", header=True, index=False)
vcf_track_count += 1
elif track[1] == "bed":
bed_df = pd.DataFrame.from_dict(track[0]["bed annotations"], orient="index")
bed_df.drop("track", axis=1, inplace=True)
bed_df.to_csv(f"{args.reference}_bed_data_{bed_track_count}.tabular", sep="\t", header=True, index=False)
bed_track_count += 1
elif track[1] == "gb":
with open(f"{args.reference}_gb_data_{gb_track_count}.json", "w") as fp:
json.dump(track[0], fp)
gb_track_count += 1
47 changes: 28 additions & 19 deletions bamdash/scripts/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,17 @@ def get_coverage_stats(coverage_df, start, stop, min_cov):
:param min_cov: min coverage to consider covered
:return: stats
"""
df_subset = coverage_df[(coverage_df["position"] >= start) &
(coverage_df["position"] <= stop) &
(coverage_df["coverage"] > min_cov)]
if df_subset.empty:
df_subset_cov = coverage_df[(coverage_df["position"] >= start) & (coverage_df["position"] <= stop)]
df_subset_rec = coverage_df[(coverage_df["position"] >= start) & (coverage_df["position"] <= stop) & (coverage_df["coverage"] > min_cov)]
if df_subset_cov.empty:
mean_coverage = 0
recovery = 0
elif df_subset_rec.empty and not df_subset_cov.empty:
mean_coverage = sum(df_subset_cov["coverage"])/(stop-start+1)
recovery = 0
else:
mean_coverage = sum(df_subset["coverage"])/(stop-start)
recovery = len(df_subset["coverage"])/(stop-start+1)*100
mean_coverage = sum(df_subset_cov["coverage"])/(stop-start+1)
recovery = len(df_subset_rec["coverage"])/(stop-start+1)*100

return round(mean_coverage), round(recovery, 2)

Expand All @@ -46,21 +48,25 @@ def make_title_string(parsed_bam, coverage_df, reference, min_cov):
:param min_cov: min coverage to consider covered
:return: string for header
"""
stat_dict = {}
# get bam stats for correct chrom
bam_stats = parsed_bam.get_index_statistics()[0]
mean, rec = get_coverage_stats(coverage_df, min(coverage_df["position"]), max(coverage_df["position"]), min_cov)
# pop dict
stat_dict["reference"] = bam_stats[0]
stat_dict["reference length (bp)"] = parsed_bam.get_reference_length(reference)
for stat_type, bam_stat in zip(["mapped", "unmapped", "total"], bam_stats[1:]):
stat_dict[stat_type] = bam_stat
stat_dict["mean coverage"], stat_dict[f"% recovery >= {min_cov}x"] = get_coverage_stats(coverage_df, min(coverage_df["position"]), max(coverage_df["position"]), min_cov)
stat_dict["gc content (%)"] = round((sum(coverage_df["C"]) + sum(coverage_df["G"])) / len(coverage_df), 2)
# format title string
stat_string = ""
stat_string = make_stat_substring(stat_string, "reference", bam_stats[0])
stat_string = make_stat_substring(stat_string, "reference length", f"{parsed_bam.get_reference_length(reference)} bp")
gc_content = round((sum(coverage_df["C"])+sum(coverage_df["G"]))/len(coverage_df), 2)
for bam_stat, stat_type in zip(bam_stats[1:], ["mapped", "unmapped", "total"]):
stat_string = make_stat_substring(stat_string, stat_type, bam_stat)
stat_string = make_stat_substring(stat_string, "<br>mean coverage", mean)
stat_string = make_stat_substring(stat_string, "recovery", rec)
stat_string = make_stat_substring(stat_string, "gc content", f"{gc_content}%")
for key in stat_dict:
if key == "mean coverage":
stat_string = make_stat_substring(stat_string, f"<br>{key}", stat_dict[key])
else:
stat_string = make_stat_substring(stat_string, key, stat_dict[key])

return stat_string
return stat_string, stat_dict


def bam_to_coverage_df(bam_file, ref, min_cov, quality_thres):
Expand Down Expand Up @@ -99,7 +105,10 @@ def bam_to_coverage_df(bam_file, ref, min_cov, quality_thres):
"G",
"T"
])
return coverage_df, make_title_string(bam, coverage_df, ref, min_cov)

title, stat_dict = make_title_string(bam, coverage_df, ref, min_cov)

return coverage_df, title, stat_dict


def vcf_to_df(vcf_file, ref):
Expand Down Expand Up @@ -216,7 +225,7 @@ def genbank_to_dict(gb_file, coverage_df, ref, min_cov):
feature_dict[feature.type][f"{start} {stop}"]["start"] = start
feature_dict[feature.type][f"{start} {stop}"]["stop"] = stop
feature_dict[feature.type][f"{start} {stop}"]["mean coverage"] = mean_cov
feature_dict[feature.type][f"{start} {stop}"]["% recovery"] = rec
feature_dict[feature.type][f"{start} {stop}"][f"% recovery >= {min_cov}x"] = rec
# define strand info
if feature.strand == 1:
feature_dict[feature.type][f"{start} {stop}"]["strand"] = "+"
Expand Down Expand Up @@ -278,7 +287,7 @@ def bed_to_dict(bed_file, coverage_df, ref, min_cov):
# compute mean coverage
mean_cov, rec = get_coverage_stats(coverage_df, start, stop, min_cov)
bed_dict["bed annotations"][f"{start} {stop}"]["mean coverage"] = mean_cov
bed_dict["bed annotations"][f"{start} {stop}"]["recovery"] = rec
bed_dict["bed annotations"][f"{start} {stop}"][f"% recovery >= {min_cov}x"] = rec

return define_track_position(bed_dict)

Expand Down

0 comments on commit cb4d1f4

Please sign in to comment.