forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_chr_centromere.py
executable file
·134 lines (123 loc) · 3.24 KB
/
get_chr_centromere.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
import sys
import copy
def get_chr_cen(in_ctg_cen, in_agp, bin_size, out_chr_cen):
print("Loading agp")
chr_len = {}
ctg_on_chr = {}
with open(in_agp, 'r') as fin:
for line in fin:
data = line.strip().split()
if data[4]!= 'W' or data[0] == data[5]:
continue
chrn = data[0]
tig = data[5]
sp = int(data[1])
ep = int(data[2])
direct = data[-1]
if chrn not in chr_len:
chr_len[chrn] = 0
chr_len[chrn] = ep
ctg_on_chr[tig] = [chrn, sp, ep, direct]
centro_dist = {}
for chrn in chr_len:
centro_dist[chrn] = [ 0 for i in range(0, int(chr_len[chrn]/bin_size+2))]
print("Loading centromere information with contig level")
centro_db = {}
with open(in_ctg_cen, 'r') as fin:
for line in fin:
data = line.strip().split()
if data[0] == 'seqid':
continue
tig = data[0]
tsp = int(data[1])
tep = int(data[2])
if tig not in ctg_on_chr:
continue
chrn, csp, cep, direct = ctg_on_chr[tig]
if direct == '+':
nsp = csp+tsp-1
nep = csp+tep-1
else:
nsp = cep-tep+1
nep = cep-tsp+1
if chrn not in centro_db:
centro_db[chrn] = []
centro_db[chrn].append([nsp, nep])
for chrn in centro_db:
centro_db[chrn] = sorted(centro_db[chrn])
for sp, ep in centro_db[chrn]:
sidx = int(round(sp/bin_size+0.51))
eidx = int(round(sp/bin_size+0.51))
for i in range(sidx, eidx+1):
centro_dist[chrn][i] += 1
print("Calculating centromere positions")
res_db = {}
for chrn in sorted(centro_dist):
print("\tCalculating %s"%chrn)
tmp_list = sorted(centro_dist[chrn], reverse=True)
d = len(tmp_list)/100.0
last_idx = -1
centro_pre = []
for i in range(0, len(centro_dist[chrn])):
cur_val = centro_dist[chrn][i]
if cur_val > 0:
if last_idx == -1:
centro_pre.append([i, i])
else:
if i <= last_idx+d:
centro_pre[-1][1] = i
else:
centro_pre.append([i, i])
last_idx = i
max_pair = []
max_len = 0
for si, ei in centro_pre:
if ei-si+1 > max_len:
max_len = ei-si+1
max_pair = [si, ei]
if max_pair == []:
ms = "NA"
me = "NA"
ml = "NA"
else:
ms = max_pair[0]*bin_size
me = max_pair[1]*bin_size
ml = me-ms+1
ms = "%.2f"%(ms/1e6)
me = "%.2f"%(me/1e6)
ml = "%.2f"%(ml/1e6)
base_chr = chrn[:-1]
hap = chrn[-1]
if base_chr not in res_db:
res_db[base_chr] = {}
res_db[base_chr][hap] = [ms, me, ml]
hap_list = sorted(res_db[base_chr])
print("Writing result")
with open(out_chr_cen, 'w') as fout:
fout.write(",%s\n"%(',,'.join(hap_list)))
for chrn in sorted(res_db):
info = []
for hap in sorted(res_db[chrn]):
ms, me, ml = res_db[chrn][hap]
info.append("%s-%s,%s"%(ms, me, ml))
fout.write("%s,%s\n"%(chrn, ','.join(info)))
print("Finished")
if __name__ == "__main__":
if len(sys.argv) < 5:
print("Usage: python %s <in_ctg_centro> <in_agp> <bin_size> <out_chr_centro>"%sys.argv[0])
else:
in_ctg_cen, in_agp, bin_size, out_chr_cen = sys.argv[1:]
if not bin_size[-1].isdigit():
ratio = bin_size[-1].lower()
base = float(bin_size[:-1])
if ratio == 'g':
ratio = 1e9
elif ratio == 'm':
ratio = 1e6
else:
ratio = 1e3
bin_size = int(base*ratio)
else:
bin_size = int(bin_size)
get_chr_cen(in_ctg_cen, in_agp, bin_size, out_chr_cen)