-
Notifications
You must be signed in to change notification settings - Fork 0
/
Taxa4Meta.sh
388 lines (314 loc) · 20.5 KB
/
Taxa4Meta.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
#!/bin/bash
usage(){
echo "
Taxa4Meta pipeline
Written by Qinglong Wu, Ph.D. from Baylor College of Medicine (Qinglong.Wu@bcm.edu)
Description: Perform human gut microbiome meta-analysis, i.e. 16S rRNA gene amplicon data generated from different sequencing strategies.
The core concept bases on the accurate taxamonic binning instead of OTU interpretation during 16S amplicon meta-analysis.
Usage: Taxa4Meta.sh --path /path-to-project-dir/ --threads [integer]
--fastq_truncee [integer] --minReadLen [integer] --maxReadLen [integer]
--fastq_stripleft [integer] --fastq_stripright [integer]
--BLCA-GenusConf [integer] --BLCA-SpeciesConf [integer]
--ReverseComplement [string] --PairedEnd [string] --ReferenceMode [string]
(Note: all positional parameters must be provided in the same order as indicated above)
Parameters:
--path Absolute path to the project directory. Arrange data: /ProjectDir/RawReadProcess/SampleDir/RawSequenceFiles
--threads Number of threads to be specified. (must be integer)
--fastq_truncee Truncate sequences from left until the total expected error is not higher than the specified value;
usually 1 for Illumina short reads; could be 2 for 454 longer reads. (must be integer)
--minReadLen During quality filtering (after paired-end merging), discard sequenced reads if they are shorter than
the specified length. (must be integer)
--maxReadLen During quality filtering (after paired-end merging), truncate sequence reads to the specified length,
and keep sequence reads shorter than the length specified. (must be integer)
--fastq_stripleft The length of any other technical sequence plus primer sequence to re removed from left. (must be integer)
--fastq_stripright The length of any other technical sequence plus primer sequence to be removed from right. (must be integer)
--BLCA-GenusConf During BLCA taxonomic assignment, confidence cut-off for sequence annotation at genus level. (must be integer)
--BLCA-SpeciesConf During BLCA taxonomic assignment, confidence cut-off for sequence annotation at species level. (must be integer)
--ReverseComplement Perform reverse complement of filtered reads before clustering (reduce run time); must be TRUE or FALSE.
--PairedEnd Switch to the process mode for paired-end reads, otherwise remain the process
for single-end reads; must be TRUE or FALSE.
--ReferenceMode Perform chimeras removal using reference-based mode, otherwise perform chimeras removal
using both de novo mode and reference-based mode; must be TRUE or FALSE.
Suggested key parameters:
V1V3: forward orientation (minReadLen: 200, maxReadLen: 450; GenusConf: 90, SpeciesConf: 60)
reverse orientation (minReadLen: 300, maxReadLen: 450; GenusConf: 90, SpeciesConf: 60)
V3V5: forward orientation (minReadLen: 250, maxReadLen: 450; GenusConf: 85, SpeciesConf: 50)
reverse orientation (minReadLen: 300, maxReadLen: 450; GenusConf: 85, SpeciesConf: 50)
V4: forward orientation (minReadLen: 200, maxReadLen: 250; GenusConf: 70, SpeciesConf: 50)
reverse orientation (minReadLen: 200, maxReadLen: 250; GenusConf: 70, SpeciesConf: 50)
V6V9: forward orientation (minReadLen: 300, maxReadLen: 450; GenusConf: 90, SpeciesConf: 50)
reverse orientation (minReadLen: 250, maxReadLen: 450; GenusConf: 90, SpeciesConf: 50)
"
}
if [ -z "$1" ] || [ "$1" == "-h" ] || [ "$1" == "-help" ] || [ "$1" == "--help" ]
then
usage
exit
fi
#assign variables with the input of positional arguments
ProgramDir=$(dirname $(readlink -f $0)) #get the path for the script and its dependencies
WorkDir=$2
THREADS=$4
maxEE=$6
MinLen=$8
MaxLen=${10}
TrimLeft=${12}
TrimRight=${14}
GenusConf=${16}
SpeciesConf=${18}
ReverseComplement=${20}
PairedEnd=${22}
ReferenceMode=${24}
cd $WorkDir
###################################################################################################################################################
#loop every sample directory
if [ $PairedEnd == "TRUE" ]
then #process with paired-end reads from Illumina
cd $WorkDir
#loop every sample
for SampleDir in $WorkDir/RawReadProcess/*
do
cd $SampleDir
#for SRR files download from NCBI database: SRRxxxxx_1.fastq & SRRxxxxx_2.fastq in SRRxxxxx directory
#"those variables might be changed if the input sequence is different"
ForwardSeq=`ls *_R1_*`
ReverseSeq=`ls *_R2_*`
SampleName=`basename $SampleDir`
#merge the paired-end reads
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastq_mergepairs $ForwardSeq \
--reverse $ReverseSeq \
--fastqout ''$SampleName'_merged.fq' \
--fastqout_notmerged_fwd ''$SampleName'_unmerged_forward.fq'
#quality filtering and trimming, and strip forward and reverse primers by position
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastq_filter ''$SampleName'_merged.fq' \
--fastaout ''$SampleName'_merged_maxEE.fa' \
--fastq_stripleft $TrimLeft \
--fastq_stripright $TrimRight \
--fastq_truncee $maxEE \
--fastq_trunclen_keep $MaxLen \
--fastq_minlen $MinLen
#apply for unmerged forward sequence, try to keep many reads
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastq_filter ''$SampleName'_unmerged_forward.fq' \
--fastaout ''$SampleName'_unmerged_forward_maxEE.fa' \
--fastq_stripleft $TrimLeft \
--fastq_truncee $maxEE \
--fastq_minlen $MinLen \
--fastq_maxlen $MaxLen
#concatenate clean reads of merged and forward unmerged reads
cat ''$SampleName'_merged_maxEE.fa' ''$SampleName'_unmerged_forward_maxEE.fa' \
> ''$SampleName'_merged_filtered_noprimers.fa'
#perform reverse complement of the sequences if necessary
if [ $ReverseComplement == "TRUE" ]
then
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastx_revcomp ''$SampleName'_merged_filtered_noprimers.fa' \
--fastaout ''$SampleName'_merged_clean.fa'
else
#generate the same output for the next command
cp ''$SampleName'_merged_filtered_noprimers.fa' ../''$SampleName'_merged_filtered_noprimers.fa'
mv ../''$SampleName'_merged_filtered_noprimers.fa' ''$SampleName'_merged_clean.fa'
fi
#QIIME format: SampleID_SeqID such as PatientID-Day-Collection_SeqID or PatientID.Day.Collection_SeqID
#UPARSE/VSEARCH format: SampleID.SeqID such as PatientID_Day_Collection.SeqID
prefix=`echo $SampleName | sed 's/-/_/g'`
#Dereplicate sequences at sample level to reduce run time for OTU clustering
$ProgramDir/scripts/vsearch --threads $THREADS \
--derep_fulllength ''$SampleName'_merged_clean.fa' \
--output ''$SampleName'_merged_clean_derep.fa' \
--relabel ''$prefix'.' \
--strand plus --fasta_width 0 --sizeout
done
else #process with single-end reads from 454 or Illumina (could be merged paired-end reads)
cd $WorkDir
for SampleDir in $WorkDir/RawReadProcess/*
do
cd $SampleDir
#"those variables might be changed if the input sequence is different"
SingleSeq=`ls *` #VSEARCH accepts .fq.gz or .fastq.gz files
SampleName=`basename $SampleDir`
#quality filtering and trimming, and strip primers positionaly (pay attention to the order of parameters)
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastq_filter $SingleSeq \
--fastaout ''$SampleName'_maxEE.fa' \
--fastq_stripleft $TrimLeft \
--fastq_stripright $TrimRight \
--fastq_truncee $maxEE \
--fastq_trunclen_keep $MaxLen \
--fastq_minlen $MinLen
#perform reverse complement of the sequences: common in 454 sequencing if necessary
if [ $ReverseComplement == "TRUE" ]
then
$ProgramDir/scripts/vsearch --threads $THREADS \
--fastx_revcomp ''$SampleName'_maxEE.fa' \
--fastaout ''$SampleName'_maxEE_clean.fa'
else
#generate the same output for the next command
cp ''$SampleName'_maxEE.fa' ../''$SampleName'_maxEE.fa'
mv ../''$SampleName'_maxEE.fa' ''$SampleName'_maxEE_clean.fa'
fi
#QIIME format: SampleID_SeqID such as PatientID-Day-Collection_SeqID or PatientID.Day.Collection_SeqID
#UPARSE/VSEARCH format: SampleID.SeqID such as PatientID_Day_Collection.SeqID
prefix=`echo $SampleName | sed 's/\./_/g' | sed 's/-/_/g'`
#Dereplicate sequences at sample level to reduce run time for OTU clustering
$ProgramDir/scripts/vsearch --threads $THREADS \
--derep_fulllength ''$SampleName'_maxEE_clean.fa' \
--output ''$SampleName'_maxEE_clean_derep.fa' \
--relabel ''$prefix'.' \
--strand plus --fasta_width 0 --sizeout
done
fi
###################################################################################################################################################
#prepare for OTU clustering
mkdir $WorkDir/Taxa4Meta_output
mkdir $WorkDir/Taxa4Meta_output/temp && cd $WorkDir/Taxa4Meta_output/temp
if [ $PairedEnd == "TRUE" ]
then #prcess if paired-end reads
cat $WorkDir/RawReadProcess/*/*_merged_clean_derep.fa > all_maxEE_clean_derep.fa
rm $WorkDir/RawReadProcess/*/*merged* #save storage
else #prcess if single-end reads
cat $WorkDir/RawReadProcess/*/*_maxEE_clean_derep.fa > all_maxEE_clean_derep.fa
rm $WorkDir/RawReadProcess/*/*maxEE* #save storage
fi
#First dereplicate across samples and remove singletons
$ProgramDir/scripts/vsearch --threads $THREADS \
--sizein --sizeout --fasta_width 0 \
--derep_fulllength all_maxEE_clean_derep.fa \
--uc all.derep.uc \
--output all.derep.fasta
#Remove chimeras
if [ $ReferenceMode == "TRUE" ]
then
#only reference-based chimera detection (SILVA 132)
$ProgramDir/scripts/vsearch --threads $THREADS \
--sizein --sizeout --fasta_width 0 \
--uchime_ref all.derep.fasta \
--db $ProgramDir/databases/silva_132_99_16S.fna \
--nonchimeras all.derep.nonchimeras.fasta
else
#de novo chimera detection, time-consuming for large dataset
$ProgramDir/scripts/vsearch --threads $THREADS \
--sizein --sizeout --fasta_width 0 \
--uchime_denovo all.derep.fasta \
--nonchimeras all.denovo.nonchimeras.fasta
#continue with reference-based chimera detection (SILVA 132)
$ProgramDir/scripts/vsearch --threads $THREADS \
--sizein --sizeout --fasta_width 0 \
--uchime_ref all.denovo.nonchimeras.fasta \
--db $ProgramDir/databases/silva_132_99_16S.fna \
--nonchimeras all.derep.nonchimeras.fasta
rm all.denovo.nonchimeras.fasta
fi
#Extract all non-chimeric seqs
perl $ProgramDir/scripts/map.pl all_maxEE_clean_derep.fa all.derep.uc all.derep.nonchimeras.fasta > all_maxEE_clean_derep_nochimeras.fa
rm all.derep.uc all.derep.fasta all.denovo.nonchimeras.fasta all.derep.nonchimeras.fasta
#VSEARCH will internally sort the sequences by decreasing the length, cluster at 97% or 100% and relabel with OTU_n
#generate OTU table and OTU sequences (longest sequence, "no size (remove --sizeout)" in the identifiers - required by BLCA script)
#must use option "--cluster_fast" for input sequences with variable read length from the same region
#options "--centroids" + "--cluster_fast" will output the longest seed read as representative OTU sequence
$ProgramDir/scripts/vsearch --threads $THREADS --id 0.99 \
--strand plus --fasta_width 0 --sizein \
--cluster_fast all_maxEE_clean_derep_nochimeras.fa \
--relabel OTU_ \
--centroids all.otus.fasta \
--otutabout all.otutab.txt
###################################################################################################################################################
#assign taxonomy by BLCA with NCBI 16S Microbial DB (downloaded on July 25th, 2019)
cd $WorkDir/Taxa4Meta_output/temp
mkdir BLCA_taxonomy
#BLCA current version just use single thread for the script, split input into several new files of relatively even size
$ProgramDir/scripts/seqkit split --by-part 30 all.otus.fasta --out-dir Splits #split sequences into 30 parts for downstream parallel processing
for SplitSeq in $PWD/Splits/*.fasta
do
SplitSeqName=`basename $SplitSeq .fasta`
#make sure you have dependencies installed (clustalo and blastn) and should be in your PATH
python $ProgramDir/scripts/BLCA/2.blca_main.py -i $SplitSeq \
-r $ProgramDir/scripts/BLCA/db/16SMicrobial.ACC.taxonomy \
-q $ProgramDir/scripts/BLCA/db/16SMicrobial \
--cvrset 0.99 --iset 99 \
-o $PWD/BLCA_taxonomy/''$SplitSeqName'_BLCA_out.txt' & #keep in background
done
wait #wait the completion of all the background runs
cat ./BLCA_taxonomy/*_BLCA_out.txt > ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment.txt
rm ./BLCA_taxonomy/*_BLCA_out.txt && rm -rf Splits #remove the sequence splits
#parse the BLCA output file (classified OTU sequences)
sed -n '/Unclassified/!p' ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment.txt > ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment_classified.txt
sed 's/\t/;/g' ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment_classified.txt > otu_taxonomy.txt #fix for delimiter for parsing
#filter each taxa based on confidence score and keep all sorted gene IDs from LCA output for merging purpose
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($3 >= var) print $1,$2; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_domain.txt
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($5 >= var) print $1,$4; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_phylum.txt
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($7 >= var) print $1,$6; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_class.txt
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($9 >= var) print $1,$8; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_order.txt
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($11 >= var) print $1,$10; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_family.txt
awk -v var=$GenusConf -v OFS="\t" -F ';' '{if ($13 >= var) print $1,$12; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_genus.txt
awk -v var=$SpeciesConf -v OFS="\t" -F ';' '{if ($15 >= var) print $1,$14; else print $1,"unclassified"}' otu_taxonomy.txt > taxa_species.txt
#join all ranks together
join -j 1 -a 1 -a 2 -t $'\t' -o auto taxa_domain.txt taxa_phylum.txt | \
join -j 1 -a 1 -a 2 -t $'\t' -o auto - taxa_class.txt | \
join -j 1 -a 1 -a 2 -t $'\t' -o auto - taxa_order.txt | \
join -j 1 -a 1 -a 2 -t $'\t' -o auto - taxa_family.txt | \
join -j 1 -a 1 -a 2 -t $'\t' -o auto - taxa_genus.txt | \
join -j 1 -a 1 -a 2 -t $'\t' -o auto - taxa_species.txt > temp
cat temp | cut -d$'\t' -f 1 > temp1 #extract OTU ID column
cat temp | cut -d$'\t' -f 2-8 | sed 's/\t/;/g' > temp2 #extract taxonomy columns and convert to semi-colon delimiter
paste temp1 temp2 | sed 's/unclassified/Other/g' \
| sed 's/superkingdom://g' | sed 's/phylum://g' | sed 's/class://g' | sed 's/order://g' \
| sed 's/family://g' | sed 's/genus://g' | sed 's/species://g' \
| sed 's/Not Available/Other/g' \
> ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment_classified_parsed.txt #to be the same style with IDTAXA taxonomy file and taxonomic collapsing
rm taxa_*.txt temp* otu_taxonomy.txt
###################################################################################################################################################
#IDTAXA classifier for unclassified OTU sequences
#extract the unclassified OTU sequences from BLCA output file
sed -n '/Unclassified/p' ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment.txt | cut -d$'\t' -f 1 > ./BLCA_taxonomy/unclassified_OTU_IDs.txt
xargs samtools faidx all.otus.fasta < ./BLCA_taxonomy/unclassified_OTU_IDs.txt > all.otus_BLCA-unclassified.fasta
#run IDTAXA with the training set of RDP_v16-mod_March2018.RData (threshold: 70)
#input file - all.otus_BLCA-unclassified.fasta
#output file - all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_original.txt
Rscript --vanilla $ProgramDir/scripts/IDTAXA.R $WorkDir/Taxa4Meta_output/temp $ProgramDir/databases/RDP_v16-mod_March2018.RData
#parse the IDTAXA output file which does fit QIIME format of taxonomy file
cat all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_original.txt \
| sed -n '/subclass/p' | sed -n '/suborder/p' | sed 's/"//g' | sed 's/\\//g' \
| cut -d ';' -f 1,5,8,11,17,23,26 > all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_temp1.txt #extract lines with subclass & suborder
cat all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_original.txt \
| sed -n '/subclass/p' | sed -n '/suborder/!p' | sed 's/"//g' | sed 's/\\//g' \
| cut -d ';' -f 1,5,8,11,17,20,23 > all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_temp2.txt #extract lines with subclass only
cat all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_original.txt \
| sed -n '/subclass/!p' | sed -n '/suborder/!p' \
| sed -n '/unclassified_Root/!p' | sed -n '/Viruses/!p' | sed -n '/Eukaryota/!p' \
| sed 1d | sed 's/"//g' | sed 's/\\//g' \
| cut -d ';' -f 1,5,8,11,14,17,20 > all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_temp3.txt #extract Bacteria and Archaea
cat all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_temp*.txt | sed 's/rootrank;//g' \
> all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_parsed.txt #clean taxonomy file a little bit
sed -i.backup 's/;unclassified_.*//g' all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_parsed.txt #remove meaningless taxa
rm all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_temp*.txt all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_parsed.txt.backup #save storage
###################################################################################################################################################
#make OTU table with taxonomy assignments from BLCA and IDTAXA
cd $WorkDir/Taxa4Meta_output/temp
#concatenate BLCA and IDTAXA taxonomy file
cat ./BLCA_taxonomy/otu_seqs_BLCA_tax_assignment_classified_parsed.txt all.otus_BLCA-unclassified_taxonomy_IDTAXA_0.7_RDP-V16-modified_parsed.txt \
> all.otus_BLCA+IDTAXA_taxonomy.txt
cat all.otus_BLCA+IDTAXA_taxonomy.txt | cut -d$'\t' -f 1 > OTUs_list_bacteria_archaea.txt
sed -n '1p' all.otutab.txt > temp_1st-line.txt
grep -w -F -f OTUs_list_bacteria_archaea.txt all.otutab.txt > temp_matched.txt
cat temp_1st-line.txt temp_matched.txt > otu_table_bacteria_without_taxa.txt
rm temp* OTUs_list_bacteria_archaea.txt
rm -rf ./BLCA_taxonomy
biom convert -i otu_table_bacteria_without_taxa.txt -o otu_table_bacteria_without_taxa.biom --table-type="OTU table" --to-hdf5 #generate OTU table
biom add-metadata -i otu_table_bacteria_without_taxa.biom -o otu_table_bacteria_with_BLCA_IDTAXA.biom \
--observation-metadata-fp all.otus_BLCA+IDTAXA_taxonomy.txt \
--observation-header OTUID,taxonomy
biom summarize-table -i otu_table_bacteria_with_BLCA_IDTAXA.biom -o otu_table_bacteria_with_BLCA_IDTAXA_stats.txt #get sequencing depth per sample
biom convert -i otu_table_bacteria_with_BLCA_IDTAXA.biom -o otu_table_bacteria_with_BLCA_IDTAXA.txt --to-tsv --header-key taxonomy
#extract OTU sequences based on the list in "otu_table_bacteria_with_BLCA_IDTAXA.txt"
cat otu_table_bacteria_with_BLCA_IDTAXA.txt | cut -d$'\t' -f 1 | sed 1,2d > temp
xargs samtools faidx all.otus.fasta < temp > final_otu_seqs.fasta
rm temp
#simplify the output directory
cd $WorkDir/Taxa4Meta_output/temp
cp otu_table_bacteria_with_BLCA_IDTAXA.biom ../
cp otu_table_bacteria_with_BLCA_IDTAXA.txt ../
cp final_otu_seqs.fasta ../
#rm -rf $WorkDir/Taxa4Meta_output/temp