-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathassembly.py
289 lines (252 loc) · 10.8 KB
/
assembly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 1 09:18:37 2019
@author: Guilherme Neumann
"""
import os
import logging
import time
import multiprocessing
import importlib
from Bio import SeqIO
import kmergenie as kmer
import threading as thr
from abc import ABC, abstractmethod
class Assembling_Controller():
"""
This Class represents the Assembly Component of the Framework. It receives
reads and outputs assemblies.
Attributes
----------
file_format : str
the format of the reads, generally fa or fq (default fa)
k : int
the k-mer number used in k-based assemblers. When not provided, it is
generated by Kmergenie
technology : str
the sequencing technology (default Illumina)
t : int
Number of threads. When not informed, it is set to the number of cpus-2
Methods
-------
run(software)
Run the selected assemblers
"""
file_format="fa"
k=None
technology="Illumina"
t=multiprocessing.cpu_count()-2
def __init__(self, reads, out, exp,reads_len):
"""
Parameters
----------
reads : list
A list of string sample-reads names. For example, the list ['reads100',
'reads150'] indicates two assemblies using the reads reads100_1.fa(fq),
reads100_2.fa(fq) for reads100 assembly and reads150_1.fa(fq),
reads150_2.fa(fq) for reads150.
out : str
The output directory to store the results and where the reads
are stored
exp : str
The Experiment Name
reads_len : int/list
The average reads length. When list type, each length is associated to
one sample (from reads list)
"""
self.exp=exp
self.len=reads_len
self.phred=40
self.output=out
self.reads=reads
logging.basicConfig(format='%(asctime)s %(message)s',filename= out+ exp + '.log',level=logging.DEBUG)
def __fasta2fastq(self,file,ql):
"""
It converts a fasta file to a fastq file.
Parameters
----------
file : str
file name, including format, e.g. 'reads_1.fa'
ql : int
phred number
"""
try:
fq=file.split(".")
fq=fq[0] + ".fastq"
with open(file, "r") as fasta, open(fq, "w") as fastq:
for record in SeqIO.parse(fasta, "fasta"):
record.letter_annotations["phred_quality"] = [ql] * len(record)
SeqIO.write(record, fastq, "fastq")
fasta.close()
fastq.close()
except IOError:
logging.error(IOError)
exit()
def run_selected_assemblers(self,software):
"""
Run all the selected assemblers.
It is important to notice that here only the names are provided, the same
ones used to create the modules stored in assemblers/. For example, if you
set SPAdes to run, it implies assemblers/ contains spades.py, and spades.py
contains the class Spades().
How assemblers work and where they are installed are coded in the respective
assembler class (a hotspot).
Parameters
----------
software : list
A list containing all the assemblers that may run the assemblies. If
['all'] is passed, all software run.
"""
try:
if not (os.path.exists(self.output+ "assemblies")):
os.system("mkdir " +self.output+ "assemblies")
if "all" in software:
software=[]
tools=os.listdir("assemblers/")
for each in tools:
if ".py" in each and "__init__" not in each and not ".pyc" in each:
software.append(each[:-3])
for i,sample in enumerate(self.reads):
logging.info(" Sample:"+sample)
print("\n======================Sample:"+sample+"=================================================")
logging.info("======================Sample "+sample+"=================================================")
exists1 = os.path.isfile(self.output+"reads/"+sample+"_1.fastq")
exists2 = os.path.isfile(self.output+"reads/"+sample+"_2.fastq")
thread1=thr.Thread(target=self.__fasta2fastq,args=(self.output+"reads/"+sample+"_1."+self.file_format,int(self.phred)))
thread2=thr.Thread(target=self.__fasta2fastq,args=(self.output+"reads/"+sample+"_2."+self.file_format,int(self.phred)))
if not exists1 or not exists2 and self.file_format!='fq':
try:
logging.info(" Converting fasta read files to fastq")
print("\n ============ Converting fasta read files to fastq")
thread1.daemon=True
thread1.start()
thread2.daemon=True
thread2.start()
except thr.ThreadError:
logging.error(thr.ThreadError)
try:
if type(self.len)==list:
read_len=self.len[i]
else:
read_len=self.len
if self.k==None: #BOTAR KMERGENIE COMO FUNÇÃO E CHAMAR APENAS QUANDO NECESSÁRIO
logging.info(" Kmergenie:")
print("\n============ Kmergenie:")
if not (os.path.exists(self.output+ "assemblies/kmer")):
os.system("mkdir "+self.output+"assemblies/kmer")
k = kmer.Kmergenie(self.exp,self.output,sample,self.file_format)
best_k=k.get_bestk()
else:
best_k=self.k
exists = os.path.isfile(self.output+"assemblies/run_done.log")
done=[]
if exists:
last_run=open(self.output+"assemblies/run_done.log","r")
for line in last_run:
done.append(line.strip().split(","))
run_log=open(self.output+"assemblies/run_done.log","a")
for assembler in software:
if not (os.path.exists(self.output+ "assemblies/"+assembler)):
os.system("mkdir "+self.output+"assemblies/"+assembler)
logging.info(assembler+":")
print("\n =============== "+assembler)
module = importlib.import_module('assemblers.'+assembler.lower())
my_class = getattr(module, assembler.capitalize() )
genome=my_class(self.technology,self.exp,self.output,sample,read_len,self.file_format,best_k,self.t)
#Some assemblers such as Masurca and Mira only work with fastq files, in this case we need the conversion to be finished
if genome.require_fastq and self.file_format!='fq':
genome.file_format="fastq"
while(thread1.is_alive() and thread2.is_alive()):
time.sleep(.10)
thread=thr.Thread(target=genome.run_assembly,args=())
if [assembler,sample] not in done:
if genome.python_threads:
thread.daemon=True
thread.start()
else:
genome.run_assembly()
run_log.write(assembler+","+sample+"\n")
while(thread.is_alive()):
time.sleep(.10)
run_log.close()
except IOError:
logging.error(IOError)
exit()
except TypeError:
logging.error(TypeError)
exit()
class Assembler(ABC):
"""
Abstract assembler class
Attributes
----------
assembler_name : str
The name of the assembler tool
require_fastq : bool
if the assembler only work with fastq files, please set as True
(default False)
python_threads : bool
in case the assembler do not use multithread, you can at least
activate it to run with python threads. Those threads are used
by Assembly Module (default False).
Methods
-------
run_assembly()
Run the assembly
"""
__assembler_name=''
require_fastq=False
python_threads=False
#please, keep all the arguments in __init__ even you do not need it
def __init__(self, technology, exp, out, sample,read_len,file_format,k,t):
"""
Parameters
----------
technology : str
the sequencing technology, e.g. Illumina
exp : str
The Experiment Name
out : str
The output directory to store the results and where the reads
are stored
sample : str
Sample Name (assembly name)
read_len : int
The average reads length
file_format : str
the format of the reads, generally fa or fq
k : int
the k-mer number used in k-based assemblers
t : int
Number of threads
"""
self.tech=technology
self.exp=exp
self.out=out
self.sample=sample
self.read_len=read_len
self.file_format=file_format
self.k=k
self.t=t
super(Assembler,self).__init__()
try:
#Using the shared logging system
logging.basicConfig(format='%(asctime)s %(message)s',filename= out+ exp + '.log',level=logging.DEBUG)
if file_format=="fa" or file_format=="fasta" or file_format=="fna":
self.tipo="fasta"
else:
self.tipo="fastq"
except IOError:
logging.error(IOError)
exit()
@abstractmethod
def run_assembly(self):
"""
Run the assembly.
"""
if not(os.path.exists(self.out+"assemblies/"+self.__assembler_name)):
os.system("mkdir "+self.out+"assemblies/"+self.__assembler_name)
os.system("mkdir "+self.out+"assemblies/"+self.assembler_name+"/"+self.sample)
pass
#command="call the command here"
#os.system(command)