-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaning_multiple_seq.sh
209 lines (167 loc) · 11 KB
/
cleaning_multiple_seq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/bash
echo "-------------------- Run of data cleaning and taxonomic assignment programs --------------------"
# Define parameters
multiple_analysis=$1 #different sequencing output to analyse together ? "Yes" or "No" expected
base_dir=$2 #path to the main fold containing fastq files
output_dir=$3 #path to output fold
primer_removed=$4 #primer already removed? "Yes" or "No" expected
type_data=$5 #16S, 18S or ITS
type_database=$6 #silva, pr2 or unite
forward_primer=$7 #forward primer sequence
reverse_primer=$8 #reverse primer sequence
# Just for 16S or 18S
amplicon_length_variable=$9 #are amplicon lengths variable ? "Yes" or "No" expected
# Just for 16S or 18S non variable
amplicon_length=${10} #expected amplicon length
# Parameters for figaro
figaro="/Users/ambre/figaro/figaro/"
forward_primer_length=${#forward_primer}
reverse_primer_length=${#reverse_primer}
# Parameters for R script to clean data with DADA2
rscript="/usr/local/bin"
script_dada2_dir="/Users/ambre/Desktop/MASTER/M2/Stage/Script_nettoyage"
# Add informations about the run in the file
echo "Different sequencing outputs to analyze together ? $multiple_analysis " > "$output_dir/output_parameters.txt"
echo "Type of data for analysis : $type_data" >> "$output_dir/output_parameters.txt"
echo "Database used : $type_database" >> "$output_dir/output_parameters.txt"
echo "Primers sequence used : " >> "$output_dir/output_parameters.txt"
echo " Forward primer = $forward_primer" >> "$output_dir/output_parameters.txt"
echo " Reverse primer = $reverse_primer" >> "$output_dir/output_parameters.txt"
echo "Primer were already been removed ? $primer_removed" >> "$output_dir/output_parameters.txt"
echo "Version of tools used during this analysis : " >> "$output_dir/output_parameters.txt"
echo " Cutadapt : $(cutadapt --version)" >> "$output_dir/output_parameters.txt"
echo " R : $($rscript/Rscript --version)" >> "$output_dir/output_parameters.txt"
echo "----------- Start Bash program -----------"
#### CUTADAPT ####
# The aim is to cut all sequences to the same length
for sub_dir in "$base_dir"/*/
do
fastq_dir=$(readlink -f "$sub_dir/fastq")
echo $sub_dir
echo $fastq_dir
## Quality control ##
for file in $fastq_dir/*.fastq.gz;
do
[ -e "$file" ] || continue
# Check if files contains more than 10000 reads, if not delete file
number_reads=$(gzcat "$file" | awk '!(NR % 4) {k++}END{print k}') #number of reads for the file
echo "Number of reads in the file $file : $number_reads " >> "$output_dir/output_parameters.txt"
if [[ "$number_reads" -lt 10000 ]]; then
echo "Number of reads is less than 10,000. Deleting file: $file" >> "$output_dir/output_parameters.txt"
rm "$file"
continue
fi
length=$(gzcat "$file" | awk 'NR%4 == 2 {print length($0)}') #length of sequences for the file
sum_length=$(gzcat "$file" | awk 'NR%4 == 2 {sum+=length($0)} END {print sum}') #sum of sequence lengths for the file
number_seq=$(gzcat "$file" | awk '!(NR % 4) {k++}END{print k}') #number of sequences for the file
mean=$((sum_length/number_seq)) #mean of length for the file
total_sum_length=$((total_sum_length + sum_length))
total_number_seq=$((total_number_seq + number_seq))
done
total_mean=$((total_sum_length / total_number_seq))
echo "Total average length of sequences in all files : $total_mean" >> "$output_dir/output_parameters.txt"
if [[ ($type_data == "16S" || $type_data == "18S") && $amplicon_length_variable == "No" ]]; then
min_length_amplicon_expected=$((amplicon_length / 2 - forward_primer_length - reverse_primer_length))
if [[ total_mean -lt $min_length_amplicon_expected ]]; then
echo "Error : Total average length of sequences in all files is less than $min_length_amplicon_expected bp (minimum length amplicon expected), exit program."
exit 1
fi
elif [[ ($type_data == "ITS") || (($type_data == "16S" || $type_data == "18S") && $amplicon_length_variable == "Yes") ]]; then
if [[ total_mean -lt 100 ]]; then
echo "Error : Total average length of sequences in all files is less than 100 bp, exit program."
exit 1
fi
fi
if [[ ($type_data == "16S" || $type_data == "18S") && $amplicon_length_variable == "No" ]]
then
echo "---------Type of gene studied : $type_data"
cutadapt_dir="$sub_dir/cutadapt"
figaro_dir="$sub_dir/figaro"
# Parameters for cutadapt mean
total_sum_length=0
total_number_seq=0
figaro_update="TRUE"
# Check if directory exists, if not create it
if [ ! -d "$cutadapt_dir" ]; then
mkdir -p "$cutadapt_dir"
fi
if [ ! -d "$figaro_dir" ]; then
mkdir -p "$figaro_dir"
fi
echo "Length at which the sequences was cut to run figaro : $total_mean" >> "$output_dir/output_parameters.txt"
for file in $fastq_dir/*.fastq.gz
do
output_file="$cutadapt_dir/$(basename "$file")"
# Cutadapt is used to cut sequences at the same length, defined previously
cutadapt -l "$total_mean" -m "$total_mean" -o "$output_file" "$file"
echo "File $file processed and new file saved in $output_file."
done
#### FIGARO ####
# The aim is to have the parameters form filterAndTrim() function
python3 $figaro/figaro.py -i $cutadapt_dir -o $figaro_dir -a $amplicon_length -f $forward_primer_length -r $reverse_primer_length
if [ $? -ne 0 ]
then
figaro_update="FALSE"
fi
fi
#### CUTADAPT ####
# The aim is to cut primer from sequences
## Condition : ask if primer already been removed, if yes don't do this step
if [[ $primer_removed == "No" && ($type_data == "16S" || $type_data == "18S") ]]
then
fastq_no_primer="$sub_dir/fastq_no_primer"
# Check if there are matching files
if [ ! -d "$fastq_no_primer" ]; then
mkdir -p "$fastq_no_primer"
fi
for file_R1 in $fastq_dir/*_R1*.fastq.gz
do
#Find file R2 associated with R1 (with file name)
file_R2=$(echo "$file_R1" | sed 's/_R1/_R2/g')
echo "---------File $file_R1 associated with file $file_R2 being processed..."
# Define output files
output_R1="$fastq_no_primer/$(basename "$file_R1")"
output_R2="$fastq_no_primer/$(basename "$file_R2")"
cutadapt -g $forward_primer -G $reverse_primer -o "$output_R1" -p "$output_R2" "$file_R1" "$file_R2"
done
echo "Primers were removed with cutadapt for files in $fastq_dir. " >> "$output_dir/output_parameters.txt"
fi
done
#### R SCRIPT DADA2 ####
if [[ $multiple_analysis == "No" ]]
then
if [[ $type_data == "16S" || $type_data == "18S" ]]
then
if [[ $amplicon_length_variable == "No" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -j $figaro_dir -p $primer_removed --figaro $figaro_update -a $amplicon_length --primer_forward $forward_primer --primer_reverse $reverse_primer" >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -j $figaro_dir -p $primer_removed --figaro $figaro_update -a $amplicon_length --primer_forward $forward_primer --primer_reverse $reverse_primer
elif [[ $amplicon_length_variable == "Yes" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -l $amplicon_length_variable --primer_forward $forward_primer --primer_reverse $reverse_primer" >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -l $amplicon_length_variable --primer_forward $forward_primer --primer_reverse $reverse_primer
fi
elif [[ $type_data == "ITS" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed --primer_forward $forward_primer --primer_reverse $reverse_primer " >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed --primer_forward $forward_primer --primer_reverse $reverse_primer
fi
elif [[ $multiple_analysis == "Yes" ]]
then
if [[ $type_data == "16S" || $type_data == "18S" ]]
then
if [[ $amplicon_length_variable == "No" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -a $amplicon_length --primer_forward $forward_primer --primer_reverse $reverse_primer" >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -a $amplicon_length --primer_forward $forward_primer --primer_reverse $reverse_primer
elif [[ $amplicon_length_variable == "Yes" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -l $amplicon_length_variable --primer_forward $forward_primer --primer_reverse $reverse_primer" >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed -l $amplicon_length_variable --primer_forward $forward_primer --primer_reverse $reverse_primer
fi
elif [[ $type_data == "ITS" ]]
then
echo "Command line used to run R script : $rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed --primer_forward $forward_primer --primer_reverse $reverse_primer" >> "$output_dir/output_parameters.txt"
$rscript/Rscript $script_dada2_dir/cleaning_multiple_seq.R -m $multiple_analysis -f $base_dir -t $type_data -o $output_dir --type_database $type_database -p $primer_removed --primer_forward $forward_primer --primer_reverse $reverse_primer
fi
fi