forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_bed_regions.py
executable file
·81 lines (74 loc) · 1.78 KB
/
merge_bed_regions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
import sys
def merge_reions(region_db, md):
new_regions = {}
for chrn in region_db:
new_regions[chrn] = []
tmp_list = []
tmp_gns = [[]]
for region in sorted(region_db[chrn]):
s = region[0]
e = region[1]
gn = region[2]
if len(tmp_list) == 0:
tmp_list.append(s)
laste = e
else:
if s > laste+md:
tmp_list.append(laste)
tmp_list.append(s)
tmp_gns.append([])
laste = e
else:
if e > laste:
laste = e
tmp_gns[-1].extend(gn)
tmp_list.append(laste)
for i in range(0, len(tmp_list), 2):
s = tmp_list[i]
e = tmp_list[i+1]
gns = ''.join(tmp_gns[int(i/2)])
gns = list(set(gns.split(',')))
new_gns = []
for gn in gns:
if gn != '':
new_gns.append(gn)
gns = ','.join(new_gns)
new_regions[chrn].append([s, e, gns])
return new_regions
def read_bed(in_bed):
bed_db = {}
with open(in_bed, 'r') as f_in:
for line in f_in:
if line.strip() == '':
continue
data = line.strip().split()
chrn = data[0]
sp = int(data[1])
ep = int(data[2])
if len(data) > 3:
gn = data[-1]
else:
gn = ''
if sp > ep:
temp = sp
sp = ep
ep = temp
if chrn not in bed_db:
bed_db[chrn] = []
bed_db[chrn].append([sp, ep, gn])
return bed_db
def merge_regions_in_bed(in_bed, out_bed, md):
ori_regions = read_bed(in_bed)
new_regions = merge_reions(ori_regions, md)
with open(out_bed, 'w') as f_out:
for chrn in sorted(new_regions):
for region in new_regions[chrn]:
f_out.write("%s\t%d\t%d\t%s\n"%(chrn, region[0], region[1], region[2]))
if __name__ == "__main__":
if len(sys.argv) < 4:
print("Usage: python "+sys.argv[0]+" <in_bed> <out_bed> <max_distance>")
else:
in_bed, out_bed, md = sys.argv[1:]
md = int(md)
merge_regions_in_bed(in_bed, out_bed, md)