-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmitome.sh
297 lines (218 loc) · 8.56 KB
/
mitome.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#!/usr/bin/env bash
#A script to assemble a mitome
#............................................................................
# How to use
# activate your conda environment with the tools needed
# bash mitome.sh -b baits R1 R2 nano
#............................................................................
# Defaults
set -e #exit if a command exits with non zero status
script=$(basename $0) #script name less file path
baits=""
genome_size=700000
threads=16
#............................................................................
# Functions
function msg {
echo -e "$*"
}
# $* is args passed in, -e means interpret backslashes
function err {
echo "Error: $*" 1>&2
}
# redirects stdout to stderr
function banner {
printf '%*s\n' "${COLUMNS:-$(tput cols)}" '' | tr ' ' -
}
function msg_banner {
printf '%*s\n' "${COLUMNS:-$(tput cols)}" '' | tr ' ' -
msg "$*"
printf '%*s\n' "${COLUMNS:-$(tput cols)}" '' | tr ' ' -
}
function usage {
banner
msg "$script\n Assemble a mitome with long and short reads."
msg "Author:\n Anna Syme, anna.syme@gmail.com"
msg "Usage:\n $script [options] R1 R2 nano"
msg "Parameters:"
msg " Illumina R1 reads, trimmed, filtered R1.fq.gz"
msg " Illumina R2 reads, trimmed, filtered R2.fq.gz"
msg " Nanopore reads, raw nano.fq.gz"
msg "Options:"
msg " -h Show this help"
msg " -t NUM Number of threads (default=16)"
msg " -g NUM genome size in bp (default=700000)"
msg " -b FILE bait sequences file name"
msg "Example:"
msg " $script -t 8 R1.fq.gz R2.fq.gz minion.fq.gz"
banner
exit 1
#exits script
}
#...........................................................................
# Parse the command line options
#loops through, sets variable or runs function
#have to add in a colon after the flag, if it's looking for an arg
#e.g. t: means flag t is set, then look for the arg (eg 32) to set $threads to
# : at the start disables default error handling for this bit
#instead it will label an odd flag as a ?
#the extra : at the end means it will label missing args as :
while getopts ':ht:g:b::' opt ; do
case $opt in
h)
usage
;;
t)
threads=$OPTARG
;;
g)
genome_size=$OPTARG
;;
a)
adapters=$OPTARG
;;
b)
baits=$OPTARG
;;
\?)
echo "Invalid option '=$OPTARG'"
exit 1
;;
:)
echo "Option '-$OPTARG' requires an argument"
exit 1
;;
esac
done
shift $((OPTIND-1))
#remove all options that has been parsed by getopts
#so that $1 refers to next arg passed to script
#e.g. a positional arg
if [ $# -ne 3 ]; then
msg "\n **Please provide three input parameters** \n"
usage
fi
#if number of pos params is not 3, print usage msg
#...........................................................................
#start
msg "\n"
msg_banner "now running $script"
#...........................................................................
#check inputs
#give variable names to the inputs
R1=$1
R2=$2
nano_raw=$3
msg "This script will use:"
msg " Illumina reads R1: $R1"
msg " Illumina reads R2: $R2"
msg " Nanpore reads: $nano_raw"
msg " Bait sequences: $baits"
msg " Adapter sequences: $adapters"
msg " Genome size: $genome_size"
msg " Threads: $threads"
#...........................................................................
conda env export --name bio > bio.yml
#saves conda env with tools and versions
#...........................................................................
msg_banner "Round 1"
#...........................................................................
msg_banner "now extracting mitome nanopore reads from all reads"
nano_mt1=nano_mt1.fq.gz
minimap2 -a -x map-ont -t $threads $baits $nano_raw |
samtools fastq -0 $nano_mt1 -n -F 4 -
#map the reads to a set of baits (e.g. CDS.fasta from sister taxa)
#this pipes the output to samtools fastq,
#which extracts the fastq reads from the alignment
#the flag -F 4 means exclude unmapped reads (i.e., non mt reads)
#...........................................................................
msg_banner "now assembling nanopore mt reads"
flye --nano-raw $nano_mt1 --genome-size $genome_size \
--out-dir flye001 --threads $threads
#using uncorrected reads as read correction can lead to errors
assembly1=assembly1.fasta
cp flye001/assembly.fasta $assembly1
#...........................................................................
msg_banner "Round 2"
#...........................................................................
msg_banner "now extracting mitome nanopore reads from all reads"
nano_mt2=nano_mt2.fq.gz
minimap2 -m 5000 -a -x map-ont -t $threads $assembly1 $nano_raw |
samtools fastq -0 $nano_mt2 -n -F 4 -
#map the reads to a set of baits- this time, use assembly1
#add in a minimum match requirement with -m
#...........................................................................
msg_banner "now assembling nanopore mt reads"
flye --nano-raw $nano_mt2 --genome-size $genome_size \
--out-dir flye002 --threads $threads
assembly2=assembly2.fasta
cp flye002/assembly.fasta $assembly2
#...........................................................................
msg_banner "Round 3"
#...........................................................................
msg_banner "now extracting mitome nanopore reads from all reads"
nano_mt3=nano_mt3.fq.gz
minimap2 -m 5000 -a -x map-ont -t $threads $assembly2 $nano_raw |
samtools fastq -0 $nano_mt3 -n -F 4 -
#map the reads to a set of baits- this time, use assembly2
#add in a minimum match requirement with -m
#...........................................................................
msg_banner "now assembling nanopore mt reads"
flye --nano-raw $nano_mt3 --genome-size $genome_size \
--out-dir flye003 --threads $threads
assembly3=assembly3.fasta
cp flye003/assembly.fasta $assembly3
#...........................................................................
msg_banner "now polishing flye assembly with long reads"
#round 1. make overlaps file, then run racon
minimap2 -x map-ont -t $threads $assembly3 $nano_mt3 \
| gzip > overlaps1.paf.gz
assembly3_racon = assembly3_racon.fasta
racon --threads $threads $nano_mt3 overlaps1.paf.gz \
$assembly3 > $assembly3_racon
#here, further rounds of polishing made little difference
#option to add in medaka polishing here
#...........................................................................
msg_banner "now extracting illumina mitome reads from all reads"
R1_mt=R1_mt.fq.gz
R2_mt=R2_mt.fq.gz
minimap2 -a -x sr $assembly3_racon $R1 $R2 \
| samtools fastq -1 $R1_mt -2 $R2_mt -F 0x4 -f 0x2 -
#extract cp reads only by mapping to long-read assembly
#needs end dash for stdin
#more info https://broadinstitute.github.io/picard/explain-flags.html
#samtools flags: -F4 exclude unmapped reads, -f2 include properly paired reads
#...........................................................................
msg_banner "now creating subset of illumina mitome reads"
R1_mt_subset = R1_mt_subset.fq.gz
R2_mt_subset = R2_mt_subset.fq.gz
rasusa -i $R1_mt -i $R2_mt --coverage 300 \
--genome-size $genome_size -o $R1_mt_subset -o $R2_mt_subset
#downsample to required coverage, x300
#...........................................................................
msg_banner "now polishing flye-racon assembly with illumina"
#round 1pilon polish
bwa index $assembly3_racon
bwa mem -t $threads $assembly3_racon $R1_mt_subset $R2_mt_subset \
| samtools sort > flye_aln1.bam
samtools index flye_aln1.bam
samtools faidx $assembly3_racon
assembly3_racon_pilon = assembly3_racon_pilon.fasta
pilon --genome $assembly_flye_racon2 --frags flye_aln1.bam \
--output assembly3_racon_pilon \
--fix bases --mindepth 0.5 --changes --threads $threads --verbose
#...........................................................................
msg_banner "now running unicycler assembler"
unicycler -1 $R1_mt_subset -2 $R2_mt_subset -l $nano_mt3 -o unicycler \
--threads $threads --no_rotate --keep 2
#--keep 2 will keep final files but also SAM
#--no_rotate means don't rotate replicons to certain start pos
assembly_unicycler=assembly_unicycler.fasta
cp unicycler/assembly.fasta $assembly_unicycler
#...........................................................................
msg_banner "now getting assembly stats"
seqkit stats $assembly1 $assembly2 $assembly3 \
$assembly3_racon $assembly3_racon_pilon $assembly_unicycler \
-Ta > assembly_stats.tsv
#...........................................................................
msg_banner "Script finished!"