-
Notifications
You must be signed in to change notification settings - Fork 7
/
teflon_count.py
238 lines (213 loc) · 8.6 KB
/
teflon_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import argparse, sys, os
import multiprocessing as mp
import subprocess as sp
import shlex
import shutil
teflonBase = os.path.dirname(os.path.abspath(sys.argv[0]))
sys.path.insert(1, teflonBase)
from teflon_scripts import genotyper_writeBed as wb
from teflon_scripts import genotyper_countReads as cr
from teflon_scripts import reduceSearchSpace as rss
def drawProgressBar(percent, barLen = 50):
    """Redraw an in-place text progress bar on stdout.

    percent -- fraction complete in [0, 1] (values outside are clamped
               for the bar itself; the printed percentage is not).
    barLen  -- total character width of the bar.
    """
    sys.stdout.write("\r")
    # Build the bar directly instead of character-by-character; clamp so
    # out-of-range percents produce the same all-'=' / all-' ' bar as before.
    filled = max(0, min(barLen, int(barLen * percent)))
    bar = "=" * filled + " " * (barLen - filled)
    sys.stdout.write("[ %s ] %.2f%%" % (bar, percent * 100))
    sys.stdout.flush()
def check_dependency(exePATH):
    """Verify that an external executable can be launched; exit(1) if not.

    exePATH -- command line (path or name) of the executable to probe.

    The probe simply spawns the command with no arguments; stdout/stderr
    are piped so any usage text does not pollute our output.
    """
    try:
        sp.Popen(shlex.split(exePATH), stdout=sp.PIPE, stderr=sp.PIPE)
    except OSError:
        # Bug fix: the original always printed "Cannot find samtools",
        # even when the missing executable was bwa. Report the actual one.
        print("Cannot find %s" % exePATH)
        sys.exit(1)
def mkdir_if_not_exist(*dirs):
    """Create every directory path that does not already exist.

    Intermediate directories are created as needed (os.makedirs).
    Existing paths are silently skipped; each creation is announced.
    """
    # Renamed loop variable: the original used `dir`, shadowing the builtin.
    for path in dirs:
        if not os.path.exists(path):
            os.makedirs(path)
            print("creating directory: %s" % path)
def assign_task(siteID, task_q, nProcs):
    """Partition siteID into nProcs contiguous chunks and enqueue them.

    Each queue entry is (chunk, job_number) with job numbers starting at 1.
    The first (nProcs - len % nProcs) workers receive floor-sized chunks;
    the remainder receive one extra element, so all sites are covered.
    """
    total = len(siteID)
    base = total // nProcs          # floor chunk size
    n_small = nProcs - (total % nProcs)  # how many workers get the floor size
    start = 0
    for job in range(1, nProcs + 1):
        size = base if job <= n_small else base + 1
        task_q.put((siteID[start:start + size], job))
        start += size
def create_procs(nProcs, task_q, result_q, params):
    """Spawn nProcs daemonized worker processes running `worker`.

    Daemon processes die with the parent, so a KeyboardInterrupt in
    main() does not leave orphans behind.
    """
    launched = 0
    while launched < nProcs:
        proc = mp.Process(target=worker, args=(task_q, result_q, params))
        proc.daemon = True
        proc.start()
        launched += 1
def worker(task_q, result_q, params):
    # Multiprocessing worker: repeatedly pulls a (siteID batch, job number)
    # off task_q, counts supporting/non-supporting reads for each candidate
    # site via the teflon_scripts helpers, and puts the annotated rows on
    # result_q. Runs forever as a daemon; only job 1 draws the progress bar.
    # NOTE(review): the try block has no except clause — any exception other
    # than a clean get() still runs task_done() in finally, then propagates.
    while True:
        try:
            siteID, nth_job = task_q.get()
            #unpack parameters
            genoDir, tmpDir, exePATH, union, samples, qual, hierarchy, label, l2, annotation = params
            tmp=[]   # accumulates [ID, annotated union row] pairs for this batch
            ct=1     # 1-based progress counter within the batch
            for ID in siteID:
                flag=0  # set to 1 if either breakpoint estimate gets refined below
                #print union[ID]
                # 1. write sam files for all samples
                samFILE = wb.wb_portal(ID, union, samples, tmpDir, exePATH, qual, "firstPass")
                # 2. count support and non support reads for each event
                counts = cr.countReads_portal(ID, union, samples, tmpDir, samFILE, hierarchy, label, l2, annotation) # counts = [[#p(sample1),#a(sample1),#NA(sample1)],[#p(sampleN),#a(sampleN),#NA(sampleN)]]
                #print "counts:",counts,":counts"
                # 3. were new clipped positions identified?
                # counts[1]/counts[2] are "-" when no refined breakpoint was found;
                # otherwise they replace the position estimates in the shared row.
                if counts[1]!="-":
                    union[ID][1]=counts[1]
                    union[ID][7]="+"   # presumably marks the 5' side as clip-supported — TODO confirm
                    flag=1
                if counts[2]!="-":
                    union[ID][2]=counts[2]
                    union[ID][8]="+"   # presumably marks the 3' side as clip-supported — TODO confirm
                    flag=1
                if flag==1: #position estimates were refined, recount
                    # 1. write sam files for all samples
                    samFILE = wb.wb_portal(ID, union, samples, tmpDir, exePATH, qual, "refine")
                    # 2. count support and non support reads for each event
                    counts = cr.countReads_portal(ID, union, samples, tmpDir, samFILE, hierarchy, label, l2, annotation) # counts = [[#p(sample1),#a(sample1),#NA(sample1)],[#p(sampleN),#a(sampleN),#NA(sampleN)]]
                    # append per-sample count triples; the genexpr's `ct` lives in
                    # its own scope and does not clobber the outer progress counter
                    union[ID].extend(ct for ct in counts[0])
                    tmp.append([ID,union[ID]])
                    if nth_job==1:
                        drawProgressBar(ct/float(len(siteID)))
                else:
                    union[ID].extend(ct for ct in counts[0])
                    union[ID].append(counts[3]) # appends count of reads with mate in TE
                    union[ID].append(counts[4]) # appends count of reads that are only soft-clipped at the junction
                    tmp.append([ID,union[ID]])
                    if nth_job==1:
                        drawProgressBar(ct/float(len(siteID)))
                ct+=1
            result_q.put(tmp)
        finally:
            task_q.task_done()
def main():
    # Orchestrates the TEFLoN read-counting stage: parse arguments, load the
    # TE hierarchy/annotation produced by the prep step, reduce the alignment
    # to reads near candidate sites, then count supporting reads in parallel
    # and write one counts file per sample.
    parser = argparse.ArgumentParser()
    parser.add_argument('-wd',dest='wd',help='full path to working directory',default=-1)
    parser.add_argument('-d',dest='DIR',help='full path to prep_TF directory')
    parser.add_argument('-s',dest='samples',help='samples file')
    parser.add_argument('-i',dest='ID',help='unique id of this sample')
    parser.add_argument('-eb',dest='exeBWA',help='full path to bwa executable', default="bwa")
    parser.add_argument('-es',dest='exeSAM',help='full path to samtools executable', default="samtools")
    parser.add_argument('-l2',dest='cLevel',help='level of hierarchy to cluster')
    parser.add_argument('-q',dest='qual',help='map quality threshold',type=int)
    parser.add_argument("-t", dest="nProc", type=int, default=4, help="Specify number of processes")
    args = parser.parse_args()
    # identify current working directory
    # (-1 is an int sentinel: any string passed via -wd falls to the else branch)
    if args.wd == -1:
        cwd=os.getcwd()
    else:
        cwd=os.path.abspath(args.wd)
    # import options
    # prefix is the prep_TF directory name with its ".prep_TF" suffix removed
    prep_TF=os.path.abspath(args.DIR)
    prefix=os.path.abspath(args.DIR).split("/")[-1].replace(".prep_TF","")
    exeSAM=args.exeSAM
    exeBWA=args.exeBWA
    qual=args.qual
    nProc=args.nProc
    #check dependencies for function
    check_dependency(exeSAM)
    check_dependency(exeBWA)
    # import hierarchy
    # first line of the .hier file is a header (column labels); the rest map
    # a TE instance name -> list of hierarchy values
    l2=args.cLevel
    hierFILE=os.path.join(prep_TF,prefix+".hier")
    hierarchy,label={},[]
    ct=0
    with open(hierFILE, 'r') as fIN:
        for line in fIN:
            if ct==0:
                label=line.split()[1:]
            else:
                hierarchy[line.split()[0]] = line.split()[1:]
            ct+=1
    # read annotation
    # BED-like file: name -> [chrom, start, end]
    annotation={}
    with open(os.path.join(prep_TF,prefix+".te.pseudo.bed"), 'r') as fIN:
        for line in fIN:
            arr=line.split()
            annotation[arr[3]]=[arr[0],int(arr[1]),int(arr[2])]
    # read chromosome lengths
    # NOTE(review): chromosomes/lengths are built but not used later in main
    chromosomes,lengths=[],[]
    genomeSizeFILE=os.path.join(prep_TF,prefix+".genomeSize.txt")
    with open(genomeSizeFILE, 'r') as fIN:
        for line in fIN:
            arr=line.split()
            chromosomes.append(arr[0])
            lengths.append(int(arr[2]))
    # read samples and stats
    # keep only the row whose second column matches -i; pull read length,
    # insert size mean and sd from the matching samtools-stats text file
    samples=[]
    with open(os.path.abspath(args.samples), 'r') as fIN:
        for line in fIN:
            bamFILE = line.split()[0].replace(".bam",".subsmpl.bam")
            #print "#",line.split()[1],args.ID
            if line.split()[1] == args.ID:
                statsFILE = bamFILE.replace(".bam", ".stats.txt")
                pre=line.split()[1]
                with open(statsFILE, 'r') as fIN:
                    for l in fIN:
                        if 'average length' in l:
                            readLen=int(float(l.split()[-1]))
                        if 'insert size average' in l:
                            insz=int(float(l.split()[-1]))
                        if 'insert size standard deviation' in l:
                            sd=int(float(l.split()[-1]))
                # NOTE(review): readLen/insz/sd are unbound if the stats file
                # lacks any of the three expected lines
                samples.append([bamFILE, pre, [readLen, insz, sd]])
    # create the genotype directory
    genoDir = os.path.join(cwd,"genotypes")
    tmpDir = os.path.join(cwd,pre+".tmp")
    mkdir_if_not_exist(genoDir, tmpDir)
    # read positions to search
    union=[]
    with open(os.path.join(cwd,"countPos","union_sorted.collapsed.txt"), "r") as fIN:
        for line in fIN:
            union.append(line.split())
    # reduce search space
    # replaces the sample's bam path with the reduced alignment produced here
    samples[0][0]=rss.reduceSearchSpace_portal(union,samples,tmpDir,exeSAM,qual,nProc)
    # index new reduced alignment
    cmd="%s index %s" %(exeBWA,samples[0][0])
    print "cmd:", cmd
    os.system(cmd)
    # partition data for multiprocessing
    siteID = range(len(union))
    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    params=[genoDir, tmpDir, exeSAM, union, samples, qual, hierarchy, label, l2, annotation]
    # do the work boyeeee!
    print "counting reads..."
    create_procs(nProc, task_q, result_q, params)
    assign_task(siteID, task_q, nProc)
    try:
        task_q.join()
    except KeyboardInterrupt:
        print "KeyboardInterrupt"
        sys.exit(0)
    else:
        # drain one result batch per worker process, then sort rows by site ID
        outCounts=[]
        while nProc:
            x = result_q.get()
            outCounts.extend(y for y in x)
            nProc-=1
        cts=sorted(outCounts)
        # write the results
        # NOTE(review): every sample's counts file receives the same full rows
        # (cts[j][1]) — presumably each row already carries per-sample columns;
        # verify against the genotyper that consumes these files
        for i in range(len(samples)):
            outFILE=os.path.join(cwd,"countPos",samples[i][1]+".counts.txt")
            with open(outFILE, "w") as fOUT:
                for j in range(len(cts)):
                    fOUT.write("\t".join([str(y) for y in cts[j][1]])+"\n")
        # remove directory with beds/sams
        shutil.rmtree(tmpDir)
        print "\nTEFLON COUNT FINISHED!"
# Standard script entry guard: run main() only when executed directly.
if __name__ == "__main__":
    main()