-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathSnakefile_metaassembly
326 lines (312 loc) · 16.6 KB
/
Snakefile_metaassembly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# This workflow uses Novogene's EGYPTREFV2 assembly and our WTDBG2-based
# assembly to construct a meta-assembly
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
ASSEMBLIES = ["EGYPTREFV2","EGYPTREFWTDBG2V3PILON"]
# At first, we cut out from the EGYPTREFV2 assembly the sequence region that is
# not part of the wtdbg2-based assembly
# This gets the alignment coordinates sorted by chromosome and then start
# position
rule get_aligned_reference_coordinates:
input: "quast_results/latest/contigs_reports/all_alignments_Homo_sapiens-{assembly}-dna-primary_assembly.tsv"
output: "meta_assembly/{assembly}_aligned_coordinates.txt"
shell: "cat {input} | grep True | " + \
"awk '{{print $5 \"\t\" $1 \"\t\" $2 \"\t\" $6 \"\t\" $3 \"\t\" $4}}' | " + \
"sort -k 1,1h -k 2,2n > {output}"
rule uncovered_reference_greater_800kb:
input: "meta_assembly/{assembly}_aligned_coordinates.txt"
output: "meta_assembly/{assembly}_uncovered_reference_in_between.txt",
"meta_assembly/{assembly}_gaps.txt"
run:
prev_chrom = "GL000008.2"
prev_start = 0
prev_end = 0
prev_line = ""
with open(input[0],"r") as f_in, open(output[0],"w") as f_out, \
open(output[1],"w") as f_gap:
f_gap.write("chrom\tstart\tend\tgapsize\n")
for line in f_in:
s = line.split("\t")
chrom = s[0]
start = s[1]
end = s[2]
contig = s[3]
assert(int(start)<int(end))
if chrom == prev_chrom:
# Note that if multiple contigs align to the same reference
# region, this may be negative
if int(start)-int(prev_end)>800000:
f_out.write(prev_line)
f_out.write(line)
f_gap.write("\t".join([chrom,prev_end,start,str(int(start)-int(prev_end))])+"\n")
prev_chrom = chrom
prev_start = start
prev_end = end
prev_contig = contig
prev_line = line
# Select EGYPTREF assembly alignments covering or within the min 1MB regions
# not covered; thereby do not select alignments which are entirely contained
# within centromere regions
# I obtained centromere positions from here:
# http://genome.ucsc.edu/cgi-bin/hgTables?hgsid=727922323_Dy46eSarKzMaYCPcTPenS8wKKcIQ&clade=mammal&org=Human&db=hg38&hgta_group=allTables&hgta_track=hg38&hgta_table=centromeres&hgta_regionType=genome&position=chr1%3A11%2C102%2C837-11%2C267%2C747&hgta_outputType=primaryTable&hgta_outFileName=
# by selecting 'group: all tables' and 'table: centromeres'
rule check_which_egyptrefv2_covers_wtdbg:
input: "meta_assembly/EGYPTREFWTDBG2V3PILON_gaps.txt",
"meta_assembly/EGYPTREFV2_aligned_coordinates.txt",
"data/misc/centromere_positions.txt"
output: "meta_assembly/EGYPTREFV2_covering_EGYPTREFWTDB2V3PILON_gaps.txt"
run:
# Read in centromere regions
centromeres = []
with open(input[2],"r") as f_in:
for line in f_in:
if line[0] == "#":
continue
s = line.split("\t")
chrom,start,end = s[1:4]
# Remember to remove "chr" from chromosome
centromeres.append([chrom[3:],int(start),int(end)])
# Read in aligned EGYPTREF coordinates
aligned = []
with open(input[1],"r") as f_in:
for line in f_in:
s = line.split("\t")
chrom,start,end = s[:3]
aligned.append([chrom,int(start),int(end),line])
# Go over gap regions
header = ["gap_chrom","grch38_gap_start","grch38_gap_end","gap_size"] + \
["v2_grch38_al_chrom","v2_grch38_al_start","v2_grch38_al_end"] + \
["v2_al_contig","v2_al_start","v2_al_end","v2_al_len"]
with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
f_out.write("\t".join(header)+"\n")
for line in f_in:
# Skip header
if line[:5] == "chrom":
continue
s = line.split("\t")
chrom = s[0]
start = int(s[1])
end = int(s[2])
# Check if this gap is entirely within a centromere region
# then ignore it
for centromere in centromeres:
centro_chrom = centromere[0]
centro_start = centromere[1]
centro_end = centromere[2]
if chrom == centro_chrom and centro_start<=start \
and end<=centro_end:
continue
for alignment in aligned:
align_chrom = alignment[0]
align_start = alignment[1]
align_end = alignment[2]
align_line = alignment[3]
# If an alignment crosses gap boundaries or is within a gap:
# The cases are:
# 1) entirely covering the gap
# 2) crossing right gap border
# 3) crossing left gap border
# 4) being entirely contained in the gap
if chrom == align_chrom and \
((align_start<=start and align_end>=end) or \
(align_start<=end and align_end>=end) or \
(align_start<=start and align_end>=start) or \
(align_start>=start and align_end<=end)):
align_len = str(align_end-align_start)
f_out.write(line.strip("\n")+"\t"+ \
align_line.strip("\n")+"\t"+align_len+"\n")
rule make_fasta_with_egyptrefv2_added_seq:
input: "meta_assembly/EGYPTREFV2_covering_EGYPTREFWTDB2V3PILON_gaps.txt",
"seq_EGYPTREFV2/Homo_sapiens.EGYPTREFV2.dna.primary_assembly.fa"
output: "meta_assembly/EGYPTREFV2_addto_EGYPTREFWTDB2V3PILON.fa"
run:
# Read in the Sequences to be extracted from EGYPTREFV2
egyptrefv2_seq_coords = {}
with open(input[0],"r") as f_in:
for line in f_in:
# Skip header
if line[:9] == "gap_chrom":
continue
contig,start,end = line.split("\t")[7:10]
if contig in egyptrefv2_seq_coords:
egyptrefv2_seq_coords[contig].append([int(start),int(end)])
else:
egyptrefv2_seq_coords[contig] = [[int(start),int(end)]]
added_seqs = {}
with open(input[1],"r") as f_in, open(output[0],"w") as f_out:
for record in SeqIO.parse(f_in,"fasta"):
# Check if this contig is used to add onto
if record.id in egyptrefv2_seq_coords:
get_seqs = egyptrefv2_seq_coords[record.id]
for seq_pos in get_seqs:
start = seq_pos[0]
end = seq_pos[1]
seq_name = record.id+"_"+str(start)+"_"+str(end)
# Some seqs would be added multiple times because they
# cover more than one gap, thus check for this and don't
# add if this is the case
if seq_name in added_seqs:
continue
added_seqs[seq_name] = True
# If the sequence is less than
new_record = SeqRecord(record.seq[min(start,end):max(start,end)],seq_name, '', '')
SeqIO.write(new_record, f_out, "fasta")
# Here, we extract a list of partially covered genes in the base wtdbg2 assembly
# For this: get the genes covered in EGYPTREFV2 if they are complete there
# and add a gene to the list only if it is in the base assembly split in minimum
# three parts and thus likely missing something in between (still we
# don't know this and may make a mistake)
rule get_partial_genes_wtdbg2_in_egyptrefv2:
input: "quast_results/latest/genome_stats/Homo_sapiens-EGYPTREFWTDBG2V3PILON-dna-primary_assembly_genomic_features_gene.txt",
"quast_results/latest/genome_stats/Homo_sapiens-EGYPTREFV2-dna-primary_assembly_genomic_features_gene.txt"
output: "meta_assembly/partial_genes_addon.txt"
run:
# Read in the completely covered EGYPTREFV2 genes
complete_genes = {}
with open(input[1],"r") as f_in:
for line in f_in:
# Skip header
if line[:2] in ["ID","=="]:
continue
s = line.strip("\n").split("\t")
if not s[4] == "complete":
continue
complete_genes[s[0]] = s
# Go over the base assembly genes
with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
header = ["gene","v2_contig","v2_start","v2_end","wtdbg2_contig",\
"v2_start","v2_end"]
f_out.write("\t".join(header)+"\n")
for line in f_in:
# Skip header
if line[:2] in ["ID","=="]:
continue
s = line.strip("\n").split("\t")
gene = s[0]
# Complete genes are fine and kept
if s[4] == "complete":
continue
# Genes that are not complete in EGYPTREFV2 are also skipped
if not gene in complete_genes:
continue
# Check if this partial gene is likely "missing something in
# the center", if not, skip it; this is the case if at least
# two regions are aligned
if not "," in s[5]:
continue
aligned_regions = s[5].split(",")
start_contig,start_region = aligned_regions[0].split(":")
end_contig,end_region = aligned_regions[-1].split(":")
# If start and end contig are not the same, we also don't
# want to replace this gene
if not start_contig == end_contig:
print("Start and end not on same contig!")
continue
start = start_region.split("-")[0]
end = end_region.split("-")[1]
v2_contig,region = complete_genes[gene][-1].split(":")
v2_start,v2_end = region.split("-")
info = [gene,start_contig,start,end,v2_contig,v2_start,v2_end]
f_out.write("\t".join(info)+"\n")
# Here, we cut the sequences of the corresponding genes from EGYPTREFV2 and
# insert them into the WTDBG2-based assembly
rule replace_partial_genes:
input: "seq_EGYPTREFV2/Homo_sapiens.EGYPTREFV2.dna.primary_assembly.fa",
"seq_EGYPTREFWTDBG2V3PILON/Homo_sapiens.EGYPTREFWTDBG2V3PILON.dna.primary_assembly.fa",
"meta_assembly/partial_genes_addon.txt"
output: "meta_assembly/Homo_sapiens.EGYPTREFMETAGENES.dna.primary_assembly.fa",
"meta_assembly/replaced_gene_seqs.txt"
run:
# Get the v2 scaffold sequences
egyptrefv2 = {}
with open(input[0],"r") as f_v2:
for record in SeqIO.parse(f_v2,"fasta"):
egyptrefv2[record.id] = record.seq
# Get the sequences of genes to be replaced
egyptrefwtdbg2_coord = {}
egyptrefv2_geneseq = {}
# Get the V2 assembly sequences; save them to a hash table with the
# target contig, start and end being the key
with open(input[2],"r") as f_genes:
for line in f_genes:
# Skip header
if line[:5] == "gene\t":
continue
s = line.strip("\n").split("\t")
wtdbg2_contig,wtdbg2_start,wtdbg2_end = s[1:4]
v2_scaffold,v2_start,v2_end = s[4:]
v2_seq = egyptrefv2[v2_scaffold]
left_pos_wtdbg2 = min(int(wtdbg2_start),int(wtdbg2_end))
right_pos_wtdbg2 = max(int(wtdbg2_start),int(wtdbg2_end))
v2_key = "\t".join([wtdbg2_contig,str(left_pos_wtdbg2),str(right_pos_wtdbg2)])
left_pos_v2 = min(int(v2_start),int(v2_end))
right_pos_v2 = max(int(v2_start),int(v2_end))
# Watch out: if the start and end position ordering
# doesn't match between v2 and wtdbg2, then the v2
# seq needs to be reverse complemented
if (int(wtdbg2_start)<int(wtdbg2_end) and \
int(v2_start)>int(v2_end)) or \
(int(wtdbg2_start)>int(wtdbg2_end) and \
int(v2_start)<int(v2_end)):
egyptrefv2_geneseq[v2_key] = str(v2_seq[left_pos_v2:right_pos_v2].reverse_complement())
else:
egyptrefv2_geneseq[v2_key] = v2_seq[left_pos_v2:right_pos_v2]
# Save the WTDBG2 assembly coordinates; save them to a table which is
# sorted according to increasing coordinates
if not wtdbg2_contig in egyptrefwtdbg2_coord:
egyptrefwtdbg2_coord[wtdbg2_contig] = [[left_pos_wtdbg2,right_pos_wtdbg2]]
else:
egyptrefwtdbg2_coord[wtdbg2_contig].append([left_pos_wtdbg2,right_pos_wtdbg2])
print(egyptrefwtdbg2_coord.keys())
# Get the base contig sequences
egyptrefwtdbg2 = {}
with open(input[1],"r") as f_wtdbg2:
for record in SeqIO.parse(f_wtdbg2,"fasta"):
egyptrefwtdbg2[record.id] = record.seq
# Go over base contig sequences and sort changes py start coordinates
with open(output[0],"w") as f_out, open(output[1],"w") as f_replaced:
for contig in egyptrefwtdbg2:
# If no changes needed, write contig out as it is
if not contig in egyptrefwtdbg2_coord:
new_record = SeqRecord(egyptrefwtdbg2[contig],contig, '', '')
SeqIO.write(new_record, f_out, "fasta")
#print("Wrote "+contig+" Len: "+str(len(egyptrefwtdbg2[contig])))
continue
#print(egyptrefwtdbg2_coord[contig])
egyptrefwtdbg2_coord[contig].sort(key=lambda x: x[0])
last_coord = 0
new_seq = ""
for coord in egyptrefwtdbg2_coord[contig]:
# Make sure genes to be replaced are non-overlapping
first_coord = int(coord[0])
# Skip overlapping genes
if last_coord>first_coord:
print("Overlappling genes!")
continue
# Add base seq between v2 gene seqs to be added
new_seq += egyptrefwtdbg2[contig][last_coord:first_coord]
# Add gene seq to be added
# Check if the start and end sequences match
f_replaced.write("Add: "+str(egyptrefv2_geneseq["\t".join([contig,str(coord[0]),str(coord[1])])])[:20]+"\n")
f_replaced.write("Rem: "+str(egyptrefwtdbg2[contig][coord[0]:coord[1]])[:20]+"\n")
f_replaced.write("RAd: "+str(egyptrefv2_geneseq["\t".join([contig,str(coord[0]),str(coord[1])])])[-20:]+"\n")
f_replaced.write("RRe: "+str(egyptrefwtdbg2[contig][coord[0]:coord[1]])[-20:]+"\n")
new_seq += egyptrefv2_geneseq["\t".join([contig,str(coord[0]),str(coord[1])])]
last_coord = int(coord[1])
# Add sequence from last replaced gene to end
new_seq += egyptrefwtdbg2[contig][last_coord:]
print("Wrote "+contig+" Len: "+str(len(egyptrefwtdbg2[contig]))+" New Len: "+str(len(new_seq)))
# Mak a record and write this contig to file
new_record = SeqRecord(new_seq,contig, '', '')
SeqIO.write(new_record, f_out, "fasta")
rule combine_gene_replaced_with_added_seq:
input: "meta_assembly/Homo_sapiens.EGYPTREFMETAGENES.dna.primary_assembly.fa",
"meta_assembly/EGYPTREFV2_addto_EGYPTREFWTDB2V3PILON.fa"
output: "meta_assembly/Homo_sapiens.EGYPTREFMETA.dna.primary_assembly.fa"
shell: "cat {input} > {output}"
rule combine_with_added_seq:
input: "seq_EGYPTREFWTDBG2V3PILON/Homo_sapiens.EGYPTREFWTDBG2V3PILON.dna.primary_assembly.fa",
"meta_assembly/EGYPTREFV2_addto_EGYPTREFWTDB2V3PILON.fa"
output: "meta_assembly/Homo_sapiens.EGYPTREFMETAV2ADDED.dna.primary_assembly.fa"
shell: "cat {input} > {output}"