-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlivid
317 lines (272 loc) · 8.76 KB
/
livid
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#!/bin/bash
#set -x
#Vishnu Raghuram 2021-03-1
#LIVID: Locus specIfic Variant IDentification
#bash script for extracting a locus from any given WGS and identifying variants when compared to a reference locus.
version="0.2"
USAGE=$(echo -e "\n\
LIVID: Locus specIfic Variant IDentification \n\n\
VERSION: livid v$version \n\n\
USAGE: \
livid [options] -i filename.fasta -r reference.gbk -p primers.fasta -x <int> -y <int> -d <int> \n\n\
FLAGS: \n\
-i | --input REQUIRED: Input genome in FASTA format\n\
-r | --reference REQUIRED: Reference locus/gene sequence in GENBANK or FASTA format (Use genbank for annotated frameshifts) \n\
-p | --primers REQUIRED: File containing forward and reverse primer in FASTA format \n\
-d | --maxdiff REQUIRED: Integer for maximum number of primer mismatches allowed \n\
-x | --minamp REQUIRED: minamp parameter for usearch
-y | --maxamp REQUIRED: maxamp parameter for usearch
-f | --force Force overwrite existing results directory\n\
-h | --help Print this help message and exit\n\
-v | --version Print version and exit\n\n\
SOURCE: COMING SOON\n\n\
")
force=0
mumm=0
script_dir=$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )
extraction="pcr"
if [[ ! $1 ]]
then
echo -e "$USAGE"
exit
fi
while :; do
case $1 in
-h|-\?|--help) #Print usage and exit
echo -e "$USAGE"
shift
exit
;;
-i|--input) #Save input with specified path to $fasta_path
if [[ $2 && $2 != -* ]]
then
fasta_path=$2
bname=$(basename $fasta_path)
fna_name="${bname%.*}"
echo -e "Processing $fasta_path ..."
shift
else
1>&2 echo -e " -i requires an argument.\n$USAGE "
exit 1
fi
;;
-r|--reference) #Save input with specified path to $fasta_path
if [[ $2 && $2 != -* ]]
then
reference_path=$2
echo -e "Processing $reference_path ..."
shift
else
1>&2 echo -e " -r requires an argument.\n$USAGE "
exit 1
fi
;;
-p|--primers) #Save input with specified path to $fasta_path
if [[ $2 && $2 != -* ]]
then
primers_path=$2
echo -e "Processing $primers_path ..."
shift
else
1>&2 echo -e " -p requires an argument.\n$USAGE "
exit 1
fi
;;
-d|--maxdiff)
if [[ $2 && $2 != -* ]]
then
maxdiff=$2
shift
else
1>&2 echo -e " -d requires a positive integer argument.\n$USAGE "
exit 1
fi
;;
-x|--minamp)
if [[ $2 && $2 != -* ]]
then
minamp=$2
shift
else
1>&2 echo -e " -x requires a positive integer argument.\n$USAGE "
exit 1
fi
;;
-y|--maxamp)
if [[ $2 && $2 != -* ]]
then
maxamp=$2
shift
else
1>&2 echo -e " -y requires a positive integer argument.\n$USAGE "
exit 1
fi
;;
-f|--force)
force=$((force + 1))
shift
continue
;;
-v|--version)
echo -e "livid v$version"
shift
exit
;;
--)
shift
break
;;
-?*)
1>&2 echo -e " $1 is an invalid option. \n$USAGE "
shift
exit 1
;;
*)
shift
break
esac
shift
done
###########################
#### ERROR REPORT VARS ####
###########################
input_check="fail"
outdir_check="fail"
usearch_check="fail"
snippy_check="fail"
#Error report file
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" > $bname-error-report.tab
###########################
#### INPUT VALIDATIONS ####
###########################
#check if results directory already exists
if [[ -d $fna_name-results ]]
then
if [[ $force > 0 ]]
then
echo "Results directory already exists, -f specified, overwriting..."
rm -rf $fna_name-results
mkdir $fna_name-results
outdir_check="pass"
else
1>&2 echo "Results directory already exists, cannot overwrite. Use option -f to force overwrite"
#Error report file
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" >> $bname-error-report.tab
exit 1
fi
else
mkdir $fna_name-results
outdir_check="pass"
fi
# validate input fasta
validate_fasta () {
if [[ -f $1 ]] #Check if input is a file
then
if [[ $(file $1 | grep -c "compressed") -eq 0 ]] #checks if file is compressed
then
if [[ $(grep -q "^@" $1 ; echo $?) -eq 1 && $(seqkit seq -t dna -n --quiet $1 | wc -l) -ge 1 ]] # if file is NOT fastq and checks if seqkit can parse the file
then
if [[ $(grep -v ">" $1 | grep -q -i "[^ATCGNWSMKRY]"; echo $?) -eq 1 ]] #check if seqence has characters other than ATGCN
then
1>&2 echo -e "$1 is a valid fasta file"
else
1>&2 echo -e "Seqence has non-standard nucleic acid characters\n$USAGE"
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" >> $bname-error-report.tab
exit 1
fi
else
1>&2 echo -e "Input file not in FASTA format\n$USAGE"
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" >> $bname-error-report.tab
exit 1
fi
else
1>&2 echo -e "Compressed input not supported (for now)\n$USAGE"
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" >> $bname-error-report.tab
exit 1
fi
else
1>&2 echo -e "Invalid input\n$USAGE"
echo -e "#input_name\tinput_check\toutdir_check\tusearch_check\tsnippy_check" >> $bname-error-report.tab
exit 1
fi
}
validate_fasta $fasta_path
validate_fasta $reference_path
validate_fasta $primers_path
$input_check="pass"
#################################
##### EXTRACTING AGR OPERON #####
#################################
# Get operating system
if [[ $OSTYPE == *[lL]inux* ]]
then
usearch_bin=usearch11.0.667_i86linux32
elif [[ $OSTYPE == *[dD]arwin* ]]
then
usearch_bin=usearch11.0.667_i86osx32
else
1>&2 echo -e "Error running usearch, unable to determine usearch version."
fs="u"
exit 1
fi
# Check if usearch11.0.667 binary is in PATH
path_to_usearch=$(which $usearch_bin)
if [[ -x $path_to_usearch ]]
then
# Check usearch exit status
if [[ $($usearch_bin &> /dev/null; echo $?) > 0 ]]
then
1>&2 echo -e "Error running usearch, please make sure usearch is installed correctly."
fs="u"
exit 1
else
echo -e "usearch found"
usearch_check="pass"
fi
else
1>&2 echo -e "$usearch_bin not in path, cannot perform frameshift detection\n please download usearch11.0.667 from https://www.drive5.com/usearch/download.html"
1>&2 echo -e "For example:\n\tcurl 'https://www.drive5.com/downloads/$usearch_bin.gz' --output $usearch_bin.gz\n\tgunzip $usearch_bin.gz\n\tchmod 755 $usearch_bin\n\tcp ./$usearch_bin $script_dir/"
fs="u"
exit 1
fi
# In-silico PCR using predefined primers(agr_operon_primers.fa) to extract agr operon
echo -e "USEARCH COMMAND:"
echo -e "$usearch_bin -search_pcr $fasta_path -db $primers_path -strand both -maxdiffs $maxdiff -minamp $minamp -maxamp $maxamp -ampout $fna_name-results/$fna_name-pcr_extracted.fna &>$fna_name-results/$fna_name-pcr-log.txt\n\n"
$usearch_bin -search_pcr $fasta_path -db $primers_path -strand both -maxdiffs $maxdiff -minamp $minamp -maxamp $maxamp -ampout $fna_name-results/$fna_name-pcr_extracted.fna &>$fna_name-results/$fna_name-pcr-log.txt
#Check if agr operon file is empty
if [[ -s $fna_name-results/$fna_name-pcr_extracted.fna ]]
then
echo -e "Locus extraction successful"
else
echo -e "Unable to find specified region, check $fna_name-results/$fna_name-$extraction-log.txt"
fs="u"
exit
fi
################################
##### FRAMESHIFT DETECTION #####
################################
#Running snippy with a group specific reference to call variants in the agr operon
snippy --outdir $fna_name-results/$fna_name-snippy --ctgs $fna_name-results/$fna_name-pcr_extracted.fna --ref $reference_path --minqual 1 --mincov 2 2> $fna_name-results/$fna_name-snippy-log.txt
#Check if snps file is empty
if [[ -s $fna_name-results/$fna_name-snippy/snps.tab ]]
then
echo -e "Snippy successful"
snippy_check="pass"
else
1>&2 echo -e "Snippy unsuccessful, check $fna_name-results/$fna_name-snippy-log.txt"
fs="u"
exit 1
fi
#Filtering out frameshifts in coding regions from snippy data
echo -e "#filename\tposition\ttype\teffect\tgene" > $fna_name-results/$fna_name-frameshifts.tab
awk -v i="$fna_name" 'BEGIN{FS=OFS="\t"};{if($7=="CDS") print i,$2,$3,$11,$13}' $fna_name-results/$fna_name-snippy/snps.tab | sed 's/ /\t/g' | grep -E -v '[conservative|disruptive]_inframe_[insertion|deletion]|splice_region_variant&stop_retained_variant|intergenic_region|initiator_codon_variant|gene_fusion|missense_variant' | grep -E -v 'synonymous_variant' >> $fna_name-results/$fna_name-frameshifts.tab
#Subtract 1 from $fs to account for the header
fs=$(($(cat $fna_name-results/$fna_name-frameshifts.tab | wc -l)-1))
#Check if frameshifts found
if [[ $fs > 0 ]]
then
echo -e "Frameshifts in specified region found, check $fna_name-results/$fna_name-frameshifts.tab"
else
echo -e "No frameshifts found"
exit
fi