forked from sc-zhang/bioscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_gff_ovlp_regions.py
executable file
·52 lines (47 loc) · 1.31 KB
/
find_gff_ovlp_regions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
import sys
def find_ovlp(in_gff3, out_bed):
gff3_db = {}
with open(in_gff3, 'r') as fin:
for line in fin:
if line[0] == '#' or line.strip() == '':
continue
data = line.strip().split()
if data[2] != 'gene':
continue
chrn = data[0]
sp = int(data[3])
ep = int(data[4])
dir = data[6]
gn = data[8].split(";")[1].split("=")[1]
if chrn not in gff3_db:
gff3_db[chrn] = {}
if dir not in gff3_db[chrn]:
gff3_db[chrn][dir] = []
gff3_db[chrn][dir].append([sp, ep, gn])
with open(out_bed, 'w') as fout:
for chrn in sorted(gff3_db):
for dir in sorted(gff3_db[chrn]):
tmp_list = []
pos_list = sorted(gff3_db[chrn][dir])
for i in range(0, len(pos_list)):
if len(tmp_list) == 0:
tmp_list.append(pos_list[i])
else:
s, e, gn = pos_list[i]
if s <= last_e:
tmp_list.append(pos_list[i])
else:
if len(tmp_list) > 1:
for s, e, gn in tmp_list:
fout.write("%s\t%d\t%d\t%s\t%s\n"%(chrn, s, e, dir, gn))
fout.write("###\n")
tmp_list = []
tmp_list.append(pos_list[i])
last_s, last_e, gn = pos_list[i]
if __name__ == "__main__":
if len(sys.argv) < 3:
print("Usage: python "+sys.argv[0]+" <in_gff3> <out_bed>")
else:
in_gff3, out_bed = sys.argv[1:]
find_ovlp(in_gff3, out_bed)