-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaligner.py
224 lines (192 loc) · 9.36 KB
/
aligner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from runners import run, run_log
from utils import log, check_files, get_read_group, index_bam
# Pipeline
def align_reads(args) :
"""Pipeline to align reads to reference with automatic reads group"""
"""Uses BWA MEM / SAMBAMBA / """
# 0. parse arguments & check files
#print(args.R1[0])
R1_reads = check_files(args.R1[0]) # Get R1 reads per lane
R2_reads = check_files(args.R2[0]) # Get R2 reads per lane
ref = check_files(args.Reference)[0]
out = args.Sample[0]
dc_args = {"threads":args.threads[0], "ram":args.ram[0], "mapq":args.mapq[0], "sample":args.Sample[0], "output_logs":args.log} # Dict to make commands
keep = args.keep_temp
print("# runGATK.py align")
print("Reference genome:\t{}\n".format(ref))
print("R1 reads:\t{}".format(" | ".join(x for x in R1_reads)))
print("R2 reads:\t{}\n".format(" | ".join(x for x in R2_reads)))
print("Output stored in:\t{}\n".format(out))
print("Keeping intermediate files:\t{}".format(keep))
print("Other arguments:\t" + str(dc_args))
print("===============================================================================\n")
# 0. Make output
if os.path.isdir(out) :
log("WARNING: Output directory already exists: {}.".format(out))
else :
os.makedirs(out) # Create directory following path
out = os.path.abspath(out)
# 1. Index the reference
indexed_ref = ref + ".bwt"
cmd = "bwa index {ref}".format(**{"ref":ref})
if not os.path.isfile(indexed_ref) :
print(cmd + "\n")
run(cmd)
else :
log("SKIP: found bwa index: {}".format(indexed_ref))
# 2. Preparing outputs and required variables
out_merge = os.path.join(out, dc_args["sample"] + ".merged.bam")
out_merge_exists = os.path.isfile(out_merge)
out_markdup = os.path.join(out, dc_args["sample"] + ".markdup.bam")
out_markdup_exists = os.path.isfile(out_markdup)
out_sort = os.path.join(out, dc_args["sample"] + ".sorted.CALL.bam")
out_sort_exists = os.path.isfile(out_sort)
intermediate_files = []
# 3. Run alignment jobs if output files are not found
if out_merge_exists or out_markdup_exists or out_sort_exists :
log("SKIP: found output .bam file.")
bam_to_merge = []
for r1, r2 in zip(R1_reads, R2_reads) :
out_sort_first = os.path.join(out, dc_args["sample"] + "_" + os.path.basename(r1) + "_" + os.path.basename(r1) + ".sorted.bam")
bam_to_merge.append(out_sort_first)
else :
# Loop through given libraries
i = 0
bam_to_merge = []
for r1, r2 in zip(R1_reads, R2_reads) :
# 3.1 Get read group for alignment
header, id = get_read_group(r1)
readgroup = "@RG\\tID:{id}\\tSM:{sample}\\tLB:{id}_{sample}\\tPL:{sequencer}"
dc_rg = {"id":id, "sample":dc_args["sample"], "sequencer":"ILLUMINA"}
readgroup = readgroup.format(**dc_rg)
# 3.2 Preparing outputs
out_bwa_sam = os.path.join(out, dc_args["sample"] + "_" + os.path.basename(r1) + "_" + os.path.basename(r1) + ".sam")
out_bwa_sam_exists = os.path.isfile(out_bwa_sam)
out_view = os.path.join(out, dc_args["sample"] + "_" + os.path.basename(r1) + "_" + os.path.basename(r1) + ".view.bam")
out_view_exists = os.path.isfile(out_view)
out_sort_first = os.path.join(out, dc_args["sample"] + "_" + os.path.basename(r1) + "_" + os.path.basename(r1) + ".sorted.bam")
out_sort_first_exists = os.path.isfile(out_sort_first)
# 3.4 make a BWA mem command
if out_bwa_sam_exists :
log("SKIP: found .sam file.")
else :
bwa_align_reads(ref, r1, r2, out_bwa_sam, readgroup, dc_args["threads"], dc_args["output_logs"])
intermediate_files.append(out_bwa_sam)
# 3.5. make a sambamba view command: filtering on mapq
if out_view_exists :
log("SKIP: found filtered .bam file.")
else :
filter_mapq_bam(out_bwa_sam, out_view, dc_args["threads"], dc_args["mapq"], dc_args["output_logs"])
intermediate_files.append(out_view)
intermediate_files.append(out_view+".bai")
# 3.6 sort the created alignment file
if out_sort_first_exists :
log("SKIP: found sorted .bam file.")
else :
sort_bam(out_view, out_sort_first, dc_args["threads"], dc_args["output_logs"])
intermediate_files.append(out_sort_first)
intermediate_files.append(out_sort_first+".bai")
# 3.7 Add a merged file to merge into a final file
if os.path.isfile(out_sort_first) : # cannot call out_sort_first_exists because it might have been created in the meantime
bam_to_merge.append(out_sort_first)
# 4. Merging all sorted-filtered-per library alignment files
if out_markdup_exists or out_sort_exists :
log("SKIP: found merged .bam file.")
else :
merge_bam_list(bam_to_merge, out_merge, dc_args["threads"], dc_args["output_logs"])
intermediate_files.append(out_merge)
intermediate_files.append(out_merge+".bai")
# 5. Marking duplicates in the merged-filtered alignment file
if out_sort_exists :
log("SKIP: found sorted .bam file.")
else :
markdup_bam(out_merge, out_markdup, dc_args["threads"], dc_args["output_logs"])
intermediate_files.append(out_markdup)
intermediate_files.append(out_markdup+".bai")
# 6. Sorting the markedup-merged-filtered final file
sort_bam(out_markdup, out_sort, dc_args["threads"], dc_args["output_logs"])
# 7. Indexing final file (may be useless, not sure because sambamba sort might already index)
index_bam(out_sort, dc_args["threads"], dc_args["output_logs"])
# IF REMOVE TEMP : remove sam file, filtered bam file, merged file, marked file
# Keep only filtered, merged, marked and sorted final file
if not keep :
log("Removing:\n")
for file in intermediate_files :
if os.path.isfile(file) :
print("- " + str(file))
os.remove(file)
# Functions
def bwa_align_reads(ref, r1, r2, output, readgroup, threads, output_logs) :
"""Align a library with BWA-MEM"""
if os.path.isfile(output) :
log("SKIP: found alignement: {}.".format(output))
else :
dc_bwa = {"threads":threads, "readgroup":readgroup, "output":output, "ref":ref, "r1":r1, "r2":r2}
cmd = "bwa mem -K 100000000 -t {threads} -R \"{readgroup}\" -o {output} {ref} {r1} {r2}"
cmd = cmd.format(**dc_bwa)
log("Aligning reads.")
print(cmd + "\n")
log_file = output + ".log" if output_logs else None
run_log(cmd, log_file)
def filter_mapq_bam(bamin, bamout, threads, mapq, output_logs) :
"""Filter a bam file"""
if os.path.isfile(bamout) :
log("SKIP: found bam file: {}.".format(bamout))
else :
dc_view = {"threads":threads, "mapq":mapq, "output":bamout, "input_sam":bamin}
cmd = "sambamba view -h -S -f bam -l 7 -t {threads} -F \"mapping_quality >= {mapq}\" -o {output} {input_sam}"
cmd = cmd.format(**dc_view)
log("Converting to BAM file.")
print(cmd + "\n")
#run(cmd)
log_file = bamout + ".log" if output_logs else None
run_log(cmd, log_file)
def merge_bam_list(bamlist, bamout, threads, output_logs) :
"""Merge a list of .bam files"""
if os.path.isfile(bamout) :
log("SKIP: found merged .bam file.")
else :
if len(bamlist) > 1 : # If more than one file is found
# 4.1 create and run sambamba merge command
input_bams = " ".join(x for x in bamlist)
dc_merge = {"threads":threads, "output":bamout, "input_bams":input_bams}
cmd = "sambamba merge -l 7 -t {threads} {output} {input_bams}"
cmd = cmd.format(**dc_merge)
log("Merging alignments.")
print(cmd + "\n")
log_file = bamout + ".log" if output_logs else None
run_log(cmd, log_file)
#run(cmd)
else :
log("SKIP: Merging is not necessary.")
# In case only one sample change rename file
os.rename(bamlist[0], bamout)
def sort_bam(bamin, bamout, threads, output_logs) :
"""Sort a .bam files"""
if os.path.isfile(bamout) :
log("SKIP: found sorted .bam file.")
else :
dc_sort = {"threads":threads, "output":bamout, "input":bamin}
cmd = "sambamba sort -l 7 -t {threads} -o {output} {input}"
cmd = cmd.format(**dc_sort)
log("Sorting bam file.")
print(cmd + "\n")
log_file = bamout + ".log" if output_logs else None
run_log(cmd, log_file)
#run(cmd)
def markdup_bam(bamin, bamout, threads, output_logs) :
"""Mark duplicates in a .bam file"""
if os.path.isfile(bamout) :
log("SKIP: found marked duplicates .bam file.")
else :
dc_markdup = {"threads":threads, "output":bamout, "input":bamin}
cmd = "sambamba markdup --overflow-list-size 600000 --hash-table-size 500000 -l 7 -t {threads} {input} {output}"
cmd = cmd.format(**dc_markdup)
log("Marking duplicates.")
print(cmd + "\n")
log_file = bamout + ".log" if output_logs else None
run_log(cmd, log_file)
#run(cmd)