forked from aaranyue/quarTeT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquartet_assemblymapper.py
318 lines (295 loc) · 16.5 KB
/
quartet_assemblymapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env python3
import argparse
import subprocess
import re
import os
import sys
import quartet_util
### MAIN PROGRAM ###
def AssemblyMapper(args):
refgenomefile, qryfile, mincontiglength, minalignmentlength, minalignmentidentity, prefix, threads, aligner, nofilter, plot, overwrite, nucmeroption, deltafilteroption, minimapoption = args
# split scaffolds to contigs and remove short contigs
print('[Info] Filtering contigs input...')
subprocess.run(f'mkdir tmp', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
inputdict = quartet_util.readFastaAsDict(qryfile)
totaldict = inputdict.copy()
contigsdict = {}
if nofilter != True:
for tigid, seq in inputdict.items():
if 'N'*100 in seq:
i = 1
for tig in re.split(r'N+', seq):
totaldict[f'{tigid}.{i}'] = tig
if len(tig) >= mincontiglength:
contigsdict[f'{tigid}.{i}'] = tig
i += 1
elif len(seq) >= mincontiglength:
contigsdict[tigid] = seq
contigfile = f'tmp/{prefix}.contig.fasta'
with open(contigfile, 'w') as c:
for tigid, seq in contigsdict.items():
c.write(f'>{tigid}\n{seq}\n')
else:
contigfile = qryfile
contigsdict = inputdict
# check telomere in contigs
print('[Info] Checking telomere in contigs...')
telofile = f'tmp/{prefix}.tig.telo.info'
if not os.path.exists(telofile) or overwrite == True:
subprocess.run(f'python3 {sys.path[0]}/quartet_teloexplorer.py -i {contigfile} -p {prefix}.tig', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
subprocess.run(f'mv -t tmp/ -f {prefix}.tig.telo.info {prefix}.tig.telo.png {prefix}.tig.telo.svg', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
monopolize = []
forceleft = []
forceright = []
refdict = quartet_util.readFastaAsDict(refgenomefile)
minchrlen = min([len(y) for x, y in refdict.items()])
if os.path.exists(telofile):
with open(telofile, 'r') as telo:
for line in telo:
if line.startswith('#'):
continue
tigid, tiglen, status = line.split()[0:3]
if status == 'both' and int(tiglen) >= minchrlen:
monopolize.append(tigid)
elif status == 'left':
forceleft.append(tigid)
elif status == 'right':
forceright.append(tigid)
else:
print('[Warning] Cannot identify telomeres in contigs.')
# reduce memory
with open(f'tmp/{prefix}.totaldict.fasta', 'w') as tmptotaldict:
for sid, seq in totaldict.items():
tmptotaldict.write(f'>{sid}\n{seq}\n')
del totaldict
with open(f'tmp/{prefix}.contigsdict.fasta', 'w') as tmpcontigsdict:
for sid, seq in contigsdict.items():
tmpcontigsdict.write(f'>{sid}\n{seq}\n')
del contigsdict
refdictkey = refdict.keys()
del refdict
del inputdict
# get all alignments
allAlignment = {}
if aligner == 'mummer':
coordsfile = quartet_util.mummer(refgenomefile, contigfile, prefix, 'contig_map_ref', nucmeroption, deltafilteroption, True, overwrite)
with open(coordsfile, 'r') as f:
for line in f:
if len(line.split()) != 9:
continue
refstart, refend, qrystart, qryend, reflen, qrylen, identity, refid, qryid = line.split()
if f'{refid}#{qryid}' not in allAlignment:
alignment = {'refid': refid, 'qryid': qryid, 'weight': 0, 'sumposition': 0, 'sumpositive': 0, 'sumnegative': 0, 'score': 0}
else:
alignment = allAlignment[f'{refid}#{qryid}']
alignment['weight'] += 1
alignment['sumposition'] += int(refstart)
alignment['score'] += float(identity) * float(qrylen)
if qrystart < qryend:
alignment['sumpositive'] += int(qrylen)
else:
alignment['sumnegative'] += int(qrylen)
allAlignment[f'{refid}#{qryid}'] = alignment
elif aligner == 'minimap2':
paffile = quartet_util.minimap(refgenomefile, contigfile, prefix, 'contig_map_ref', minimapoption, True, overwrite)
with open(paffile, 'r') as f:
for line in f:
qryid, qrylen, qrystart, qryend, strand, refid, reflen, refstart, refend, match, alignlen = line.split()[:11]
if float(alignlen) < minalignmentlength:
continue
if float(match) / float(alignlen) < minalignmentidentity:
continue
if f'{refid}#{qryid}' not in allAlignment:
alignment = {'refid': refid, 'qryid': qryid, 'weight': 0, 'sumposition': 0, 'sumpositive': 0, 'sumnegative': 0, 'score': 0}
else:
alignment = allAlignment[f'{refid}#{qryid}']
alignment['weight'] += int(alignlen)
alignment['sumposition'] += (int(refstart) + 1) * int(alignlen)
alignment['score'] += float(match)
if strand == '+':
alignment['sumpositive'] += int(alignlen)
else:
alignment['sumnegative'] += int(alignlen)
allAlignment[f'{refid}#{qryid}'] = alignment
if allAlignment == {}:
print(f'[Error] All alignments are filtered. Recommended to adjust filter arguments.')
sys.exit(0)
# analysis each alignment and create map
print('[Info] Analysising aligments...')
map = {}
for key, alignment in allAlignment.items():
# decide monopolizer contig first
if alignment['qryid'] in monopolize:
strand = '+' if alignment['sumpositive'] >= alignment['sumnegative'] else '-'
if alignment['qryid'] not in map:
map[alignment['qryid']] = {'refid': alignment['refid'], 'score': alignment['score'], 'position': 0, 'strand': strand}
else:
# solve double mapping by score
if alignment['score'] > map[alignment['qryid']]['score']:
map[alignment['qryid']] = {'refid': alignment['refid'], 'score': alignment['score'], 'position': 0, 'strand': strand}
monopolizedchr = [y['refid'] for x, y in map.items()]
for key, alignment in allAlignment.items():
if alignment['qryid'] not in monopolize and alignment['refid'] not in monopolizedchr:
strand = '+' if alignment['sumpositive'] >= alignment['sumnegative'] else '-'
if (strand == '+' and alignment['qryid'] in forceleft) or (strand == '-' and alignment['qryid'] in forceright):
position = 0
elif (strand == '+' and alignment['qryid'] in forceright) or (strand == '-' and alignment['qryid'] in forceleft):
position = 999999999
else:
position = alignment['sumposition'] / alignment['weight']
if alignment['qryid'] not in map:
map[alignment['qryid']] = {'refid': alignment['refid'], 'score': alignment['score'], 'position': position, 'strand': strand}
else:
# solve double mapping by score
if alignment['score'] > map[alignment['qryid']]['score']:
map[alignment['qryid']] = {'refid': alignment['refid'], 'score': alignment['score'], 'position': position, 'strand': strand}
# write all contigs' destination
contiginfo = []
mappedbases, discardedbases = 0, 0
totaldict = quartet_util.readFastaAsDict(f'tmp/{prefix}.totaldict.fasta')
contigsdict = quartet_util.readFastaAsDict(f'tmp/{prefix}.contigsdict.fasta')
for tigid, seq in totaldict.items():
tiglen = len(seq)
if tigid in map:
target = map[tigid]['refid']
mappedbases += tiglen
else:
if tiglen < mincontiglength and nofilter != True:
target = 'TooShort'
discardedbases += tiglen
elif tigid in contigsdict:
target = 'NoAlignment'
discardedbases += tiglen
else:
target = 'Split'
contiginfo.append([tigid, tiglen, target])
contiginfo.sort(key=lambda x: x[0])
contigmapinfofile = f'{prefix}.contig.mapinfo'
with open(contigmapinfofile, 'w') as w:
w.write(f'# Total mapped: {mappedbases} bp\n')
w.write(f'# Total discarded: {discardedbases} bp\n')
w.write(f'# ContigID\tContigLength\tTargetID\n')
for [tigid, tiglen, target] in contiginfo:
w.write(f'{tigid}\t{tiglen}\t{target}\n')
print(f'[Output] Mapping result for each contigs write to: {contigmapinfofile}')
# check if all Chr have contigs mapped.
mappedchrlist = []
for qryid, mapinfo in map.items():
if mapinfo['refid'] not in mappedchrlist:
mappedchrlist.append(mapinfo['refid'])
for id in refdictkey:
if id not in mappedchrlist:
print(f'[Warning] {id} failed to be mapped by any contigs.')
# write draft genome fasta and agp and stat
print('[Info] Generating draft genome and statistics...')
chrfastadict = {}
agplist = []
gaplocus = {}
counts = {}
qryorder = sorted(map.keys(), key=lambda x: map[x]['position'],)
for qryid in qryorder:
seq = contigsdict[qryid] if map[qryid]['strand'] == '+' else quartet_util.reversedseq(contigsdict[qryid])
newid = f"{map[qryid]['refid']}"
if newid not in chrfastadict:
chrfastadict[newid] = seq
gaplocus[newid] = []
counts[newid] = 1
agplist.append([newid, 1, len(seq), counts[newid], 'W', qryid, 1, len(seq), map[qryid]['strand']])
else:
gapstart = len(chrfastadict[newid]) + 1
chrfastadict[newid] += 'N'*100 + seq
gaplocus[newid].append([gapstart, gapstart + 99])
counts[newid] += 1
agplist.append([newid, gapstart, gapstart + 99, counts[newid], 'U', 100, 'scaffold', 'yes', 'align_genus'])
counts[newid] += 1
agplist.append([newid, gapstart + 100, gapstart + len(seq) + 99, counts[newid], 'W', qryid, 1, len(seq), map[qryid]['strand']])
draftgenomefastafile = f'{prefix}.draftgenome.fasta'
with open(draftgenomefastafile, 'w') as fa:
for chrs, seq in sorted(chrfastadict.items(), key=lambda x: x[0]):
fa.write(f'>{chrs}\n{seq}\n')
if os.path.getsize(draftgenomefastafile) == 0:
print('[Error] No Chromosome can be assembly.')
sys.exit(0)
print(f'[Output] draft genome fasta file write to: {draftgenomefastafile}')
draftgenomeagpfile = f'{prefix}.draftgenome.agp'
with open(draftgenomeagpfile, 'w') as agp:
agplist.sort(key=lambda x: x[0])
for inf in agplist:
agp.write("\t".join([str(x) for x in inf]) + '\n')
print(f'[Output] draft genome agp file write to: {draftgenomeagpfile}')
draftgenomestatfile = f'{prefix}.draftgenome.stat'
with open(draftgenomestatfile, 'w') as info:
totallen = 0
gc = 0
for chrs in chrfastadict:
totallen += len(chrfastadict[chrs])
gc += chrfastadict[chrs].count('G') + chrfastadict[chrs].count('C')
gccontent = gc / totallen
info.write(f'# Total Size: {totallen}\n')
info.write(f'# GC content: {gccontent}\n')
info.write(f'# AssemblyID\tLength\tGapcount\tGaplocus\n')
for chrs in sorted(gaplocus.keys()):
gapcount = len(gaplocus[chrs])
tmpl = []
for [start, end] in gaplocus[chrs]:
tmpl.append(f'{start}-{end}')
locus = '\t'.join(tmpl)
info.write(f'{chrs}\t{len(chrfastadict[chrs])}\t{gapcount}\t{locus}\n')
print(f'[Output] draft genome stat write to: {draftgenomestatfile}')
quartet_util.drawgenome(draftgenomeagpfile, f'{prefix}.draftgenome')
# show colinearity in plot
if plot == True:
print('[Info] Aligning draft and ref to plot...')
if aligner == 'mummer':
quartet_util.mummer(refgenomefile, draftgenomefastafile, prefix, 'draftgenome_mummer_ref', nucmeroption, deltafilteroption, plot, overwrite)
if aligner == 'minimap2':
quartet_util.minimap(refgenomefile, draftgenomefastafile, prefix, 'draftgenome_map_ref', minimapoption, plot, overwrite)
### RUN ###
if __name__ == '__main__':
# Argparse
parser = argparse.ArgumentParser()
parser.add_argument('-r', dest='reference_genome',required=True, help='(*Required) Reference genome file, FASTA format.')
parser.add_argument('-q', dest='contigs', required=True, help='(*Required) Phased contigs file, FASTA format.')
parser.add_argument('-c', dest='min_contig_length', type=int, default=50000, help='Contigs shorter than INT (bp) will be removed, default: 50000')
parser.add_argument('-l', dest='min_alignment_length', type=int, default=10000, help='The min alignment length to be select (bp), default: 10000')
parser.add_argument('-i', dest='min_alignment_identity', type=float, default=90, help='The min alignment identity to be select (%%), default: 90')
parser.add_argument('-p', dest='prefix', default='quarTeT', help='The prefix used on generated files, default: quarTeT')
parser.add_argument('-t', dest='threads', default='1', help='Use number of threads, default: 1')
parser.add_argument('-a', dest='aligner', choices=['minimap2', 'mummer'], default='minimap2', help='Specify alignment program (support minimap2 and mummer), default: minimap2')
parser.add_argument('--nofilter', dest='nofilter', action='store_true', default=False, help='Use original sequence input, no filtering.')
parser.add_argument('--plot', dest='plot', action='store_true', default=False, help='Plot a colinearity graph for draft genome to reference alignments. (will cost more time)')
parser.add_argument('--overwrite', dest='overwrite', action='store_true', default=False, help='Overwrite existing alignment file instead of reuse.')
parser.add_argument('--minimapoption', dest='minimapoption', default='-x asm5', help='Pass additional parameters to minimap2 program, default: -x asm5')
parser.add_argument('--nucmeroption', dest='nucmeroption', default='', help='Pass additional parameters to nucmer program.')
parser.add_argument('--deltafilteroption', dest='deltafilteroption', default='', help='Pass additional parameters to delta-filter program.')
# parse input paramater
refgenomefile = quartet_util.decompress(parser.parse_args().reference_genome)
qryfile = quartet_util.decompress(parser.parse_args().contigs)
mincontiglength = int(parser.parse_args().min_contig_length)
minalignmentlength = int(parser.parse_args().min_alignment_length)
minalignmentidentity = float(parser.parse_args().min_alignment_identity) / 100
if minalignmentidentity < 0 or minalignmentidentity > 1:
print('[Error] min_alignment_identity should be within 0~100.')
sys.exit(0)
prefix = parser.parse_args().prefix
threads = parser.parse_args().threads
aligner = parser.parse_args().aligner
nofilter = parser.parse_args().nofilter
plot = parser.parse_args().plot
overwrite = parser.parse_args().overwrite
quartet_util.check_prerequisite(['Rscript', 'delta-filter', 'mummerplot', 'show-coords', 'gnuplot'])
if aligner == 'mummer':
quartet_util.check_prerequisite(['nucmer'])
nucmeroption = parser.parse_args().nucmeroption + f' -t {threads}'
deltafilteroption = parser.parse_args().deltafilteroption + f' -l {minalignmentlength} -i {minalignmentidentity}'
minimapoption = ''
if aligner == 'minimap2':
quartet_util.check_prerequisite(['minimap2'])
nucmeroption = ''
deltafilteroption = ''
minimapoption = parser.parse_args().minimapoption + f' -t {threads}'
# run
args = [refgenomefile, qryfile, mincontiglength, minalignmentlength, minalignmentidentity,
prefix, threads, aligner, nofilter, plot, overwrite, nucmeroption, deltafilteroption, minimapoption]
print(f'[Info] Paramater: refgenomefile={refgenomefile}, qryfile={qryfile}, mincontiglength={mincontiglength}, minalignmentlength={minalignmentlength}, minalignmentidentity={minalignmentidentity}, prefix={prefix}, threads={threads}, aligner={aligner}, nofilter={nofilter}, plot={plot}, overwrite={overwrite}, nucmeroption={nucmeroption}, deltafilteroption={deltafilteroption}, minimapoption={minimapoption}')
quartet_util.run(AssemblyMapper, args)