-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeminiassembly.py
332 lines (325 loc) · 18 KB
/
geminiassembly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
'''
┌─┐ ┌─┐ ┌┬┐ ┬ ┌┐┌ ┬
│ ┬ ├┤ │││ │ │││ │
└─┘ └─┘ ┴ ┴ ┴ ┘└┘ ┴ assembly
---------------------------------------------------
-*- coding: utf-8 -*- |
title : geminiassembly.py |
description : gemini assembly functions |
author : dooguypapua |
lastmodification : 20220805 |
version : 0.1 |
python_version : 3.8.5 |
---------------------------------------------------
'''
import os
import sys
import shutil
import geminiset
from tabulate import tabulate
from typing import Tuple
from tqdm import tqdm
from yaspin import yaspin
from yaspin.spinners import Spinners
from geminini import path_converter, get_input_files, printcolor, get_gemini_path, get_sys_info, fct_checker
from geminini import title, exit_gemini, read_file, launch_threads
from geminiparse import unwrap_fasta, make_blast_dict, make_fasta_dict, check_circular
@fct_checker
def phage_assembly(pathIN: str, pathOUT: str) -> Tuple[str, str]:
'''
------------------------------------------------------------
| PHAGE ASSEMBLY |
------------------------------------------------------------
| Trim FASTQ & de novo assembly using spades (-careful) |
------------------------------------------------------------
|PARAMETERS |
| pathIN : path of input files or folder (required) |
| pathOUT : path of output files (required) |
------------------------------------------------------------
|TOOLS: trimmomatic, spades |
------------------------------------------------------------
'''
lstFiles, maxpathSize = get_input_files(pathIN, "phage_assembly", [".fastq.gz", ".fastq", ".fq.gz", ".fq"])
if len(lstFiles) == 0:
printcolor("[ERROR: phage_assembly]\nAny input files found\n", 1, "212;64;89", "None", True)
exit_gemini()
dicoGeminiPath, dicoGeminiModule = get_gemini_path()
if 'trimmomatic' in dicoGeminiModule:
os.system("module load "+dicoGeminiModule['trimmomatic'])
if 'spades' in dicoGeminiModule:
os.system("module load "+dicoGeminiModule['spades'])
slurmBool, cpu, memMax, memMin = get_sys_info()
if pathOUT == "":
printcolor("[ERROR: phage_assembly]\nMissing '-o'pathOUT\n", 1, "212;64;89", "None", True)
exit_gemini()
pathOUT = path_converter(pathOUT)
os.makedirs(pathOUT, exist_ok=True)
trimmomatic_param = "LEADING: 3 TRAILING: 3 SLIDINGWINDOW: 4:15 MINLEN: 36"
spades_param = "--careful --cov-cutoff auto -k 21, 33, 55, 77 -m 10"
pbar = tqdm(total=int(len(lstFiles)/2), dynamic_ncols=True, ncols=50+maxpathSize, leave=False, desc="", file=sys.stdout, bar_format=" {percentage: 3.0f}%|{bar}| {n_fmt}/{total_fmt} [{desc}]")
for pathFASTQ in lstFiles:
file = os.path.basename(pathFASTQ)
if "_R1" in file:
sampleName = os.path.basename(pathFASTQ).split("_R1")[0]
pathFASTA = pathOUT+"/"+sampleName+".fasta"
if not os.path.isfile(pathFASTA):
# Construct requierd path
pathFASTQr1 = pathFASTQ
pathFASTQr2 = pathFASTQ.replace("_R1", "_R2")
if not os.path.isfile(pathFASTQr2):
printcolor("[ERROR: phage_assembly]\nUnable to find R2 for \""+pathFASTQr1+"\"\n", 1, "212;64;89", "None", True)
exit_gemini()
pathTRIMr1 = geminiset.pathTMP+"/"+os.path.basename(pathFASTQr1).replace("_R1", "_trimmed_R1")
pathTRIMr2 = geminiset.pathTMP+"/"+os.path.basename(pathFASTQr2).replace("_R2", "_trimmed_R2")
pathUNTRIMr1 = geminiset.pathTMP+"/"+os.path.basename(pathFASTQr1).replace("_R1", "_untrimmed_R1")
pathUNTRIMr2 = geminiset.pathTMP+"/"+os.path.basename(pathFASTQr2).replace("_R2", "_untrimmed_R2")
pathUNTRIM = geminiset.pathTMP+"/"+os.path.basename(pathFASTQr1).replace("_R1", "_untrimmed")
pathSPADES = geminiset.pathTMP+"/spades_"+sampleName
pathSPADESlog = geminiset.pathTMP+"/spades_"+sampleName+".log"
# Launch command
cmdTrim = "java -jar "+dicoGeminiPath['TOOLS']['trimmomatic']+" PE -threads "+str(cpu)+" -quiet "+pathFASTQr1+" "+pathFASTQr2+" "+pathTRIMr1+" "+pathUNTRIMr1+" "+pathTRIMr2+" "+pathUNTRIMr2+" "+trimmomatic_param
pbar.set_description_str(sampleName+": trimmomatic"+" ".rjust(maxpathSize-len(sampleName)))
os.system(cmdTrim)
cmdCat = "cat "+pathUNTRIMr1+" "+pathUNTRIMr2+" > "+pathUNTRIM
os.system(cmdCat)
cmdSpades = dicoGeminiPath['TOOLS']['spades']+" -1 "+pathTRIMr1+" -2 "+pathTRIMr2+" -s "+pathUNTRIM+" -o "+pathSPADES+" --threads "+str(cpu)+" "+spades_param+" > "+pathSPADESlog
pbar.set_description_str(sampleName+": spades"+" ".rjust(maxpathSize-len(sampleName)+5))
os.system(cmdSpades)
# Copy output contigs FASTA file
shutil.copyfile(pathSPADES+"/contigs.fasta", pathFASTA)
# Unwrap FASTA
unwrap_fasta(pathFASTA)
pbar.update(1)
title("Phage ass", pbar)
pbar.close()
@fct_checker
def filter_phage_assembly(pathIN: str, pathOUT: str, minLen: int = 100, minCov: int = 2, ext: str = ".fasta") -> Tuple[str, str, int, int, str]:
'''
------------------------------------------------------------
| FILTER PHAGE ASSEMBLY |
------------------------------------------------------------
| Filter phage assembly contigs |
------------------------------------------------------------
|PARAMETERS |
| pathIN : path of input files or folder (required) |
| pathOUT: path of output files (required) |
| minLen : minimum contig length (default=100) |
| minCov : minimum contig coverage (default=2) |
| ext : extension of input files (default=.fasta) |
------------------------------------------------------------
|TOOLS: blastn |
------------------------------------------------------------
'''
pathOUT = path_converter(pathOUT)
lstFiles, maxpathSize = get_input_files(pathIN, "filter_phage_assembly", [ext])
if len(lstFiles) == 0:
printcolor("[ERROR: filter_phage_assembly]\nAny input files found, check extension\n", 1, "212;64;89", "None", True)
exit_gemini()
os.makedirs(pathOUT, exist_ok=True)
tabulateTable = []
dicoGeminiPath, dicoGeminiModule = get_gemini_path()
if 'blastn' in dicoGeminiModule:
os.system("module load "+dicoGeminiModule['blastn'])
printcolor("♊ Filter assembly"+"\n")
pbar = tqdm(total=len(lstFiles), dynamic_ncols=True, ncols=50+maxpathSize, leave=False, desc="", file=sys.stdout, bar_format=" {percentage: 3.0f}%|{bar}| {n_fmt}/{total_fmt} [{desc}]")
for pathAssembly in lstFiles:
file = os.path.basename(pathAssembly)
orgName = file.replace(ext, "").replace("."+ext, "")
pbar.set_description_str(orgName+" ".rjust(maxpathSize-len(orgName)))
# Blast contigs against contamination sequence
cmdBlastn = dicoGeminiPath['TOOLS']['blastn']+" -task blastn -query "+pathAssembly+" -db "+dicoGeminiPath['DATABASES']['contaPhageDB']+" -out "+geminiset.pathTMP+"/blast.xml -outfmt 5 -perc_identity 80 -qcov_hsp_perc 30 -max_hsps 10 -max_target_seqs 5"
os.system(cmdBlastn)
dicoBlast = make_blast_dict(pathIN=geminiset.pathTMP, dbName="blast", pathJSON="None", idThr=30, minLRthr=0, maxLRthr=0, ext=".xml")
lstContamContig = list(dicoBlast.keys())
# Read contigs FASTA
dicoAssembly = make_fasta_dict(pathAssembly)
dicoFiltered = {}
totalSize = 0
for contig in dicoAssembly:
contigCov = float(contig.split("_cov_")[1])
if contigCov >= minCov and len(dicoAssembly[contig]) >= minLen and contig not in lstContamContig:
oneNuclContig = False
for nucl in ["A", "T", "G", "C"]:
if dicoAssembly[contig].count(nucl) == len(dicoAssembly[contig]):
oneNuclContig = True
break
if not oneNuclContig:
overlap = check_circular(dicoAssembly[contig])
if overlap == "":
dicoFiltered["CONTIG_"+str(len(dicoFiltered)+1)+"|cov: "+str(contigCov)] = dicoAssembly[contig]
else:
dicoFiltered["CONTIG_"+str(len(dicoFiltered)+1)+"|cov: "+str(contigCov)+"|circular: "+overlap] = dicoAssembly[contig]
totalSize += len(dicoAssembly[contig])
# Write new filtered FASTA
pathOUTassembly = pathOUT+"/"+file
OUT = open(pathOUTassembly, 'w')
for key in dicoFiltered:
OUT.write(">"+key+"\n"+dicoFiltered[key]+"\n")
OUT.close()
# Results table
tabulateTable.append([orgName, str(len(dicoAssembly)), str(len(dicoFiltered)), overlap != "", str(totalSize)])
pbar.update(1)
title("Filter assembly", pbar)
pbar.close()
printcolor("♊ Results\n")
printcolor(tabulate(tabulateTable, headers=["orgName", "Initial contigs", "Final contigs", "Circular", "Final size"], tablefmt="pretty")+"\n")
@fct_checker
def replicon_distribution(pathIN: str, pathREF: str, pathOUT: str, idThr: int = 80, extIN: str = ".fasta", extREF: str = ".fasta") -> Tuple[str, str, str, int, str, str]:
'''
------------------------------------------------------------
| REPLICON DISTRIBUTION |
------------------------------------------------------------
| Search replicons in assembly contigs |
------------------------------------------------------------
|PARAMETERS |
| pathIN : path of input files or folder (required) |
| pathREF: path of reference files (required) |
| pathOUT: path of output tsv (required) |
| idThr : %identity threshold (default=80) |
| extIN : extension of input files (default=.fasta) |
| extREF : extension of reference files (default=.fasta) |
------------------------------------------------------------
'''
pathOUT = path_converter(pathOUT)
lstQuery, maxpathSize1 = get_input_files(pathIN, "replicon_distribution", [extIN])
if len(lstQuery) == 0:
printcolor("[ERROR: replicon_distribution]\nAny query files found, check extension\n", 1, "212;64;89", "None", True)
exit_gemini()
lstRef, maxpathSize2 = get_input_files(pathREF, "replicon_distribution", [extREF])
if len(lstQuery) == 0:
printcolor("[ERROR: replicon_distribution]\nAny reference files found, check extension\n", 1, "212;64;89", "None", True)
exit_gemini()
dicoGeminiPath, dicoGeminiModule = get_gemini_path()
# Variables
maxpathSize = max(maxpathSize1, maxpathSize2)
setRefRepliconPath = set()
setRefRepliconName = set()
dicoRepliconCov = {}
blastoutfmt = "6 delim=; qseqid sseqid sstart send bitscore slen"
# Split reference replicons in distinct FASTA files (required header ">repliconType [orgName]")
printcolor("♊ Split replicons"+"\n")
pbar = tqdm(total=len(lstRef), dynamic_ncols=True, ncols=50+maxpathSize, leave=False, desc="", file=sys.stdout, bar_format=" {percentage: 3.0f}%|{bar}| {n_fmt}/{total_fmt} [{desc}]")
for pathFile in lstRef:
fileName = os.path.basename(pathFile).replace(extREF, "").replace("."+extREF, "")
pbar.set_description_str(fileName+" ".rjust(maxpathSize-len(fileName)))
dicoFASTA = make_fasta_dict(pathFile)
for key in dicoFASTA:
repliconType = key.replace("|", " ").split(" ")[0]
orgName = key.split("[")[1].replace("]", "").replace(" ", "_")
pathREPLICON = geminiset.pathTMP+"/"+repliconType+".fasta"
setRefRepliconPath.add(pathREPLICON)
setRefRepliconName.add(repliconType)
REPLICON = open(pathREPLICON, 'a')
REPLICON.write(">"+orgName+"\n"+dicoFASTA[key]+"\n")
REPLICON.close()
pbar.update(1)
title("Split replicons", pbar)
pbar.close()
# Blast each query on each replicon fasta
spinner = yaspin(Spinners.aesthetic, text="♊ Blast replicons", side="right")
spinner.start()
title("Blast", None)
dicoThread = {}
for pathFile in lstQuery:
orgName = os.path.basename(pathFile).replace(extIN, "").replace("."+extIN, "")
# Foreach replicon FASTA
for pathREPLICON in setRefRepliconPath:
repliconName = os.path.basename(pathREPLICON).replace(".fasta", "")
# Launch blastn
pathREPLICONOUT = geminiset.pathTMP+"/"+orgName+"_____"+repliconName+".out"
cmdBlastn = dicoGeminiPath["TOOLS"]["blastn"]+" -query "+pathFile+" -subject "+pathREPLICON+" -out "+pathREPLICONOUT+" -outfmt \""+blastoutfmt+"\" -perc_identity "+str(idThr)
dicoThread[orgName+"_"+repliconName] = {"cmd": cmdBlastn, "returnstatut": None, "returnlines": []}
# Launch threads
if len(dicoThread) > 0:
launch_threads(dicoThread, "blast", geminiset.cpu, geminiset.pathTMP, spinner)
spinner.stop()
printcolor("♊ Blast replicons"+"\n")
# Parse blast results
printcolor("♊ Search replicons"+"\n")
pbar = tqdm(total=len(lstQuery), dynamic_ncols=True, ncols=50+maxpathSize, leave=False, desc="", file=sys.stdout, bar_format=" {percentage: 3.0f}%|{bar}| {n_fmt}/{total_fmt} [{desc}]")
for pathFile in lstQuery:
orgName = os.path.basename(pathFile).replace(extIN, "").replace("."+extIN, "")
pbar.set_description_str(orgName+" ".rjust(maxpathSize-len(orgName)))
dicoQuery = {}
dicoSubject = {}
dicoRepliconCov[orgName] = {}
for pathREPLICON in setRefRepliconPath:
repliconName = os.path.basename(pathREPLICON).replace(".fasta", "")
pathREPLICONOUT = geminiset.pathTMP+"/"+orgName+"_____"+repliconName+".out"
if os.path.isfile(pathREPLICONOUT):
lstLine = read_file(pathREPLICONOUT, yaspinBool=False)
for line in lstLine:
splitLine = line.split(";")
query = splitLine[0]
bitscore = float(splitLine[4])
# Keep best for each contig query
if query not in dicoQuery:
dicoQuery[query] = {}
if repliconName not in dicoQuery[query]:
dicoQuery[query][repliconName] = {'bestScore': bitscore, 'line': []}
dicoQuery[query][repliconName]['bestScore'] = max(dicoQuery[query][repliconName]['bestScore'], bitscore)
dicoQuery[query][repliconName]['line'].append(line)
# Compute coverage for each bestScore replicon
for query in dicoQuery:
bestScore = 0
bestReplicon = ""
for repliconName in dicoQuery[query]:
if dicoQuery[query][repliconName]['bestScore'] > bestScore:
bestScore = dicoQuery[query][repliconName]['bestScore']
bestReplicon = repliconName
if bestReplicon not in dicoSubject:
dicoSubject[bestReplicon] = {}
# Parse correspunding blast line
for line in dicoQuery[query][bestReplicon]['line']:
splitLine = line.split(";")
query = splitLine[0]
subject = splitLine[1]
sstart = int(splitLine[2])
send = int(splitLine[3])
bitscore = float(splitLine[4])
slen = int(splitLine[5])
# Add covered position
for i in range(min(sstart, send), max(sstart, send)+1, 1):
try:
dicoSubject[bestReplicon][subject]['cov_pos'][i] = 1
except KeyError:
dicoSubject[bestReplicon][subject] = {'cov_pos': {i: 1}, 'len': slen}
# Look for coverage per replicon
for repliconName in dicoSubject:
bestCov = 0
for subject in dicoSubject[repliconName]:
percentCov = len(dicoSubject[repliconName][subject]['cov_pos'])*100/dicoSubject[repliconName][subject]['len']
bestCov = max(bestCov, percentCov)
dicoRepliconCov[orgName][repliconName] = bestCov
pbar.update(1)
title("Search replicons", pbar)
pbar.close()
# Write output
printcolor("♊ Write output"+"\n")
OUT = open(pathOUT, 'w')
OUT.write("organism")
# Sorted replicon name with chromosome as first
lstSortedRepliconChrom = []
lstSortedRepliconOthers = []
for repliconName in setRefRepliconName:
if "chr" in repliconName or "chrom" in repliconName or "chromosome" in repliconName:
lstSortedRepliconChrom.append(repliconName)
else:
lstSortedRepliconOthers.append(repliconName)
lstSortedRepliconChrom.sort()
lstSortedRepliconOthers.sort()
lstSortedReplicon = lstSortedRepliconChrom+lstSortedRepliconOthers
# Write header
for repliconName in lstSortedReplicon:
OUT.write("\t"+repliconName)
OUT.write("\n")
# Write % coverage
for orgName in dicoRepliconCov:
OUT.write(orgName)
for repliconName in lstSortedReplicon:
if repliconName in dicoRepliconCov[orgName]:
OUT.write("\t"+str(round(dicoRepliconCov[orgName][repliconName], 1)).replace(".", ","))
else:
OUT.write("\t0")
OUT.write("\n")
OUT.close()