-
Notifications
You must be signed in to change notification settings - Fork 0
/
pan_pipe
186 lines (129 loc) · 4.78 KB
/
pan_pipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/bin/bash
set -e
scriptName="pan_pipe"
version="0.0.1"
## download and collect all of the genomes you want in your pangenome
# it is very important to QC this collection.
# often genomes are mislabeled in genbank or of poor quality
HELP=0
THREADS=1
ROARY_ARGS=''
PROKKA_ARGS=''
GIFROP_ARGS=''
# parse arguments here
for arg in "$@"
do
case $arg in
-h|--help)
HELP=1
shift
;;
--roary_args)
ROARY_ARGS=$2
shift # Remove --initialize from processing
;;
--prokka_args)
PROKKA_ARGS=$2
shift # Remove --initialize from processing
;;
--gifrop_args)
GIFROP_ARGS=$2
shift # Remove argument name from processing
# shift # Remove argument value from processing
;;
-t|--threads)
THREADS="$2"
shift # Remove argument name from processing
# shift # Remove argument value from processing
;;
*)
OTHER_ARGUMENTS+=("$1") # arg values seem to be accumulating in here
shift # Remove generic argument from processing
;;
esac
done
# help message here
# Print usage
usage() {
echo -n "
Usage:
${scriptName} [OPTION]...
This script should be executed from a directory that contains bacterial genomes in fasta format
with the suffix ".fna". This script will annotate all genomes with prokka, then calculate a
pangenome with roary, finally it will extract genomic islands from the pangenome with gifrop.
Options can be passed to prokka, roary, and gifrop.
IMPORTANT!!
Inputs should be carefully QC'd. Garbage in, garbage out.
Options:
-h, --help Display this help and exit
-t, --threads Number of threads to use for parallel commands. Will be overridden by values in *_args options
--roary_args a quoted string of arguments to pass to roary, e.g.: '-p 8 -s -e --mafft'
--prokka_args a quoted string of arguments to pass to prokka, e.g: '--rawproduct --proteins prots.gbk'
--gifrop_args a quoted string of arguments to pass to gifrop, e.g: '--no_plots -m 7'
Example:
pan_pipe
or
pan_pipe --gifrop_args '--min_genes 5 --no_plots' --roary_args '-s' --prokka_args '--proteins prots.gbk'
"
}
if [[ $HELP -eq 1 ]]
then
usage
exit
fi
# check deps
# check dependencies
# ERR=0
echo "===== Dependencies check ====="
[ -z `which parallel` ] && echo "parallel .... not found" && ERR=1 || echo "parallel .... good"
[ -z `which abricate` ] && echo "abricate .... not found" && ERR=1 || echo "abricate .... good"
[ -z `which Rscript` ] && echo "Rscript .... not found" && ERR=1 || echo "Rscript .... good"
[ -z `which find` ] && echo "find .... not found" && ERR=1 || echo "find .... good"
[ -z `which prokka` ] && echo "prokka .... not found" && ERR=1 || echo "prokka .... good"
[ -z `which roary` ] && echo "roary .... not found" && ERR=1 || echo "roary .... good"
[ -z `which gifrop` ] && echo "girfop .... not found" && ERR=1 || echo "gifrop .... good"
if [[ $ERR -eq 1 ]]
then
echo "Link or install any of your 'need to install' programs above"
exit
fi
### CHECK FOR '.fna' FILES HERE: ###
numFNA=$(find . -name '*.fna' | wc -l)
if [[ $numFNA -lt 2 ]]
then
echo "less than two fastas detected. Cannot calculate pangenome with less than two fastas"
echo "Please make sure your fasta files end in '.fna'"
exit
fi
##### annotate all fasta formatted genomes with prokka #####
# technically this will give an error if there are enough *fna files to give an arguments too long error
# but if you are really trying to make a pangenome with that many fastas, this error will be the least of your worries
mkdir panpipe_logs
echo 'generating prokka commands... see prokka_cmds.txt'
for x in *fna
do
printf 'prokka --prefix %s %s %s\n' "${x%.fna}" "$PROKKA_ARGS" "$x" >> prokka_cmds.txt
done
echo 'done generating prokka commands'
echo 'executing prokka commands in parallel'
echo 'this can take a bit...'
cat prokka_cmds.txt | parallel >> ./panpipe_logs/prokka_logs.txt 2>&1
echo 'done with prokka'
# make a directory for the pangenome analysis
mkdir pan
# then copy all gffs from prokka directories
find . -maxdepth 2 -path ./pan -prune -o -name '*.gff' -exec cp {} ./pan/ \;
# enter the pan directory
cd pan
# run roary
roar_cmd="roary $ROARY_ARGS *gff"
echo "running command: $roar_cmd"
roary $ROARY_ARGS *gff 2>&1 | tee -a ../panpipe_logs/roary.log
echo 'done running roary'
# then run gifrop
echo 'running gifrop'
gif_cmd="gifrop $GIFROP_ARGS --get_islands"
echo "running command: $gif_cmd"
gifrop $GIFROP_ARGS --get_islands 2>&1 | tee -a ../panpipe_logs/gifrop.log
echo 'Done!'
# boom! now you have a pan genome and genomic islands in the context of this pan genome