-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBlast2Del.py
76 lines (65 loc) · 2.93 KB
/
Blast2Del.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
def process_blast(fil, evalue_thr=1e-3, aln_len_thr=20, identity_thr=90):
"""
Parser of blast results satisfying the threshold conditions.
Takes a blast file in format 6 and returns a dictionary:
{identifier:[evalue, query start, query end, subject start, subject end]}
"""
rs = {}
discarded = {}
with open(fil, 'r') as fi:
for line in fi:
line = line.strip().split()
evalue = float(line[-2])
if evalue<=evalue_thr and float(line[2])>=identity_thr and int(line[3])>=aln_len_thr:
ide = ':'.join(line[0].split(':')[-2:])
qstart, qend = sorted([int(i) for i in line[6:8]])
sstart, send = sorted([int(i) for i in line[-4:-2]])
minirs = [evalue, qstart, qend, sstart, send]
if ide in rs:
rs[ide].append(minirs)
else:
rs[ide] = [minirs]
return rs
def filter_deletions(inFile, n=2,
evalue_thr=1e-3, aln_len_thr=20, identity_thr=90,
mapping_place_filter=1):
"""
Extract deletions from a <inFile> blast format 6 file looking for pasted
reads mapping to <n> different regions. For example, with <n> default 2:
------------deleted_region-----------------
r1++++ r2++++
Different filter conditions can be applied using the following thresholds:
<evalue_thr>, <aln_len_thr> (alignment length), <identity_thr>
<mapping_place_filter>: ensure the two gDNA are subsequent in the read.
this can be a numerical value representing the number of bps allowed
between the two positions.
"""
blast_all = process_blast(inFile, evalue_thr=evalue_thr, aln_len_thr=aln_len_thr, identity_thr=identity_thr)
comp_rs = {}
rs = {}
delpoints = {}
for k, v in blast_all.items():
if len(v)==2:
evalueA, qstartA, qendA, sstartA, sendA = v[0]
evalueB, qstartB, qendB, sstartB, sendB = v[1]
if abs(qendA-qstartB)==mapping_place_filter or abs(qendB-qstartA)==mapping_place_filter:
print(k)
st, en = sorted([sstartA, sendA, sstartB, sendB])[1:3]
del_ide = str(bin_dictionary[st])+'..'+str(bin_dictionary[en])
comp_rs[k] = [st, en, del_ide]
# Extract coordinates of the sides of deletions
if st in delpoints:
delpoints[st]+=1
else:
delpoints[st]=1
if en in delpoints:
delpoints[en]+=1
else:
delpoints[en]=1
if del_ide in rs:
rs[del_ide].append(k)
else:
rs[del_ide] = [k]
return comp_rs, rs, delpoints
all_dels, filt_dels, delpoints = filter_deletions('<file generated with LoxDel_paired or LoxDel_singe>')