-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquantify_deletions.py
128 lines (92 loc) · 3.35 KB
/
quantify_deletions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
### only documentation ###
import matplotlib.pyplot as plt
import numpy as np
import re
# --- end of imports --- #
def load_cov( coverage_file, cutoff ):
"""! @brief load all information from coverage file """
cov = {}
with open( coverage_file, "r" ) as f:
line = f.readline()
header = line.split('\t')[0]
tmp = []
while line:
parts = line.strip().split('\t')
if parts[0] != header:
cov.update( { header: tmp } )
header = parts[0]
tmp = []
tmp.append( min( [ float( parts[-1] ), cutoff ] ) )
line = f.readline()
cov.update( { header: tmp } )
return cov
def get_coverage_and_windows( coverage, window, step ):
"""! @brief calculate coverage per window and window center """
y_values = []
for key in sorted( coverage.keys() )[:5]:
cov = coverage[ key ]
start = 0
end = 0 + step
while end < len( cov ):
y_values.append( sum( cov[ start:end ] ) / float( window ) )
start += step
end += step
return y_values
def load_BUSCO_positions( busco_position_file ):
"""! @brief load single copy and complete BUSCO positions """
busco_pos = []
with open( busco_position_file, "r" ) as f:
line = f.readline()
while line:
parts = line.strip().split('\t')
if parts[1] == "Complete":
busco_pos.append( { 'chr': parts[2], 'start': int( parts[3] ), 'end': int( parts[4] ) } )
line = f.readline()
return busco_pos
def calculate_cov_distr( busco_pos, coverage ):
"""! @brief calculate average coverage and define lower and upper cutoff based on distribution """
values = []
for busco in busco_pos:
values.append( np.mean( coverage[ busco['chr'] ][ busco['start']: busco['end'] ] ) ) #calculate average per BUSCO
avg_coverage = np.mean( values )
low_cut = avg_coverage - ( 2 * np.std( values ) )
up_cut = avg_coverage + ( 2 * np.std( values ) )
return avg_coverage, low_cut, up_cut
def generate_figure( avg_cov, low_cut, up_cut , windows, outputfile ):
"""! @brief generate histogram and print result value """
fig, ax = plt.subplots()
low_cut = 50
ax.hist( windows, bins=1000, color="lime" )
ax.plot( [ low_cut, low_cut ], [ 0, 500 ], color="magenta" )
ax.plot( [ avg_cov, avg_cov ], [ 0, 500 ], color="black" )
ax.set_xlabel( "sequencing coverage depth" )
ax.set_ylabel( "number of genomic blocks" )
ax.set_xlim( 0, 350 )
fig.savefig( outputfile, dpi=300 )
plt.close( "all" )
counter = 0
for each in windows:
if each < low_cut:
counter += 1
print "number of regions deleted in one haplotype: " + str( counter )
illumina_cov_file = "Nd1_Illumina_reads_vs_Col0.sorted.cov"
ont_cov_file = "Nd1_ONT_reads_vs_Col0.sorted.cov"
busco_position_file = "full_table_BUSCO_on_Col0.tsv"
output_dir = "./deletion_quantification/"
out_file = output_dir + "deletion_quantification.png"
window = 1000
step = 1000
cutoff = 175.0 #maximal coverage
illumina_cov = load_cov( illumina_cov_file, cutoff )
ont_cov = load_cov( ont_cov_file, cutoff )
busco_pos = load_BUSCO_positions( busco_position_file )
at7_cov = {}
for key in illumina_cov.keys():
vals = illumina_cov[ key ]
new_vals = []
for idx, each in enumerate( vals ):
new_vals.append( each + ont_cov[ key ][ idx ] )
at7_cov.update( { key: new_vals } )
avg_cov, low_cut, up_cut = calculate_cov_distr( busco_pos, at7_cov )
at7_windows = get_coverage_and_windows( at7_cov, window, step )
generate_figure( avg_cov, low_cut, up_cut , at7_windows, out_file )