This repository has been archived by the owner on Feb 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_assembly.uge-nextflow
357 lines (319 loc) · 11.9 KB
/
run_assembly.uge-nextflow
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#!/usr/bin/env bash
#$ -cwd
# This wrapper script manages job submission of
# the wf-paired-end-illumina-assembly workflow
# to the UGE scheduler.
# Name of this script without its ".uge-nextflow" suffix. A leading
# underscore is removed from $0 before basename runs, so it is only stripped
# when $0 carries no directory part. Quoted so that a script path containing
# whitespace is passed to basename as a single operand.
SCRIPT_NAME="$(basename -- "${0#_}" .uge-nextflow)"
# Print the help text (synopsis, required and optional arguments) to stdout.
# The text is framed by one blank line before and one after.
usage() {
  printf '%s\n' \
    '' \
    "Usage: ${0##*/} InputDirectory [OutputDirectory] [-h|--help]" \
    'Required:' \
    '<InputDirectory> Path containing PE Illumina 1.8+ files as *.fastq.gz' \
    'Sample names are extracted from the first underscore of read files. Hyphens,' \
    'periods, and commas are automatically discarded from the name to avoid errors.' \
    'WARNING: If you have HiSeq data where samples are ran on more than one lane,' \
    'be sure to concatenate them.' \
    'File searching goes 2 levels deep to find .fq and .fastq file pairs,' \
    'optionally gunzip compressed.' \
    'Sample pairing of sister reads is done by consecutively using filenames sorted' \
    'with `ls`, which flexibly allows for naming other than traditional R1 and R2.' \
    'Verify that files will properly pair by checking the output order of `ls`' \
    'within the <InputDirectory>.' \
    'Optional:' \
    '-h | --help Show this help message and exit.' \
    '<OutputDirectory> Location for the output files, which' \
    'includes job logfiles. Output files' \
    'will be overwritten if already present.' \
    'Default: current working directory.' \
    ''
}
# Escape sequences used with `echo -e` to draw the user's attention:
# coloured text for success, warnings and errors, a red background for
# interactive prompts, and the sequence that switches colouring off again.
COLOR_OFF="\033[0m"
RED_BG="\033[41m"
RED_TXT="\033[0;31m"
YELLOW_TXT="\033[0;93m"
GREEN_TXT="\033[0;32m"
#######################################
# Allow user to re-specify the output path.
# Asks whether a different output path should be used. On "y"/"Y" a path is
# read from stdin, resolved with `readlink -f`, and stored in OUT. Any other
# answer cancels the submission.
# Globals:   OUT (written), RED_BG, RED_TXT, COLOR_OFF (read)
# Arguments: none
# Returns:   0 once OUT has been updated.
#            Exits 0 when the user declines.
#            Exits 1 when the entered path is empty, contains whitespace
#            (unsupported elsewhere in this script), or cannot be resolved.
#######################################
prompt_new_outdir() {
  local _new_outdir=''
  local resolved_outdir=''

  echo -e "${RED_BG}"
  read -p "Would you like to use a different output path? (yes|no) " -n 1 -r
  echo -e "${COLOR_OFF}"
  if [[ ! $REPLY =~ ^[yY] ]]; then
    echo -e "${RED_TXT}\nYou can re-run this workflow with a new output path."
    echo -e "Submission was cancelled.\n ${COLOR_OFF}"
    exit 0
  fi

  echo -e "${RED_BG}"
  # -r keeps backslashes in the typed path intact
  read -r -p "Enter new Output path: " _new_outdir
  echo -e "${COLOR_OFF}"

  if [[ -z "${_new_outdir}" ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} no output path entered" >&2
    exit 1
  fi
  if [[ "${_new_outdir}" =~ [[:space:]] ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} whitespace in ${_new_outdir} path unsupported" >&2
    exit 1
  fi

  # Resolve into a temporary first so a failed lookup never leaves OUT empty
  resolved_outdir="$(readlink -f -- "${_new_outdir}")" || resolved_outdir=''
  if [[ -z "${resolved_outdir}" ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} unable to resolve ${_new_outdir}" >&2
    exit 1
  fi
  OUT="${resolved_outdir}"
}
#######################################
# Check if there is a possibility data will be overwritten.
# When at least one entry whose name starts with the given prefix exists
# directly inside the given directory, warn the user and ask whether to
# continue. Declining hands over to prompt_new_outdir and then repeats the
# check against the newly chosen OUT.
# Globals:   OUT (read after prompt_new_outdir updates it),
#            YELLOW_TXT, RED_BG, COLOR_OFF (read)
# Arguments: $1 = file path
#            $2 = filename (no path) to search for
# Returns:   0 when nothing matches or the user agrees to continue.
#######################################
prompt_if_previous_nextflow_run() {
  local search_dir="${1}"
  local log_prefix="${2}"

  # The pipeline's exit status is grep's: success only if find printed a
  # match. A missing directory simply counts as "no previous run".
  if ! find "${search_dir}" -maxdepth 1 -name "${log_prefix}*" 2> /dev/null \
    | grep -q '.'; then
    return 0
  fi

  echo -e "${YELLOW_TXT}\nThis workflow has been ran in output path before."
  echo -e "Processing samples will overwrite older data."
  echo -e "If you have jobs in queue this could be confusing.\n ${COLOR_OFF}"
  echo -e "${RED_BG}"
  read -p "Do you want to continue? (yes|no) " -n 1 -r
  echo -e "${COLOR_OFF}"
  if [[ $REPLY =~ ^[yY] ]]; then
    # Continue submission if user wants to possibly overwrite data
    echo -e "${YELLOW_TXT}\nData in ${search_dir} will be overwritten."
    echo -e "Continuing to submit samples...${COLOR_OFF}"
    # Rename .work directory
    # if [[ -d "${OUT}/.work" ]]; then
    # mv "${OUT}/.work" "${OUT}/.work_$(basename $(readlink -f ${OUT}/.work))"
    # fi
  else
    # If user doesn't want to continue, ask for a different output path
    prompt_new_outdir
    prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
  fi
}
# Check argument requirements.
# The help flag is honoured in any position, as the usage line advertises
# ("InputDirectory [OutputDirectory] [-h|--help]"); otherwise a trailing -h
# would be taken for the output directory.
for cli_arg in "$@"; do
  if [[ "${cli_arg}" == '--help' || "${cli_arg}" == '-h' ]]; then
    usage
    exit 0
  fi
done
# Exactly one or two positional arguments are accepted
if [[ $# -lt 1 || $# -gt 2 ]]; then
  echo -e "${RED_TXT}ERROR:${COLOR_OFF} improper number ($#) of arguments provided" >&2
  usage
  exit 1
fi
# Confirm we are on a host that can submit jobs.
# Accepted short hostnames: biolinux, login02, rosalind01, or any name that
# reduces to "node" once its digits are removed. Operands are quoted so an
# empty or unusual HOSTNAME yields the error below instead of a test-syntax
# failure.
host_short="${HOSTNAME%%.*}"
case "${host_short}" in
  biolinux | login02 | rosalind01) ;;
  *)
    if [[ "${host_short//[0-9]/}" != 'node' ]]; then
      echo -e "${RED_TXT}ERROR:${COLOR_OFF} must be on aspen, biolinux, or rosalind" >&2
      exit 1
    fi
    ;;
esac
# I/O handling.
# Paths containing whitespace anywhere (including at either end) are
# rejected because later steps do not support them.
for given_path in "$1" "$2"; do
  if [[ "${given_path}" =~ [[:space:]] ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} whitespace in ${given_path} path unsupported" >&2
    exit 1
  fi
done
# "--" stops a path that starts with a dash from being read as an option
IN="$(readlink -f -- "$1")"
if [[ -z "$2" ]]; then
  OUT="${PWD}"
else
  OUT="$(readlink -f -- "$2")"
  # An empty OUT would make every later "${OUT}/..." path start at "/"
  if [[ -z "${OUT}" ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} unable to resolve output path $2" >&2
    exit 1
  fi
fi
# Get LAB_HOME or custom tmp/cache variables from user's ~/.bashrc,
# while still enabling fancy people to override these from their
# current working environment too. A missing ~/.bashrc is skipped quietly
# so that values supplied purely through the environment still work.
if [[ -r "${HOME}/.bashrc" ]]; then
  # shellcheck source=/dev/null
  source "${HOME}/.bashrc"
fi
if [[ -z "${LAB_HOME}" ]]; then
  # Name the variable literally; expanding it here would print nothing
  echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$LAB_HOME not set" >&2
  exit 1
fi
# Both Singularity directories must be configured before submitting:
# each variable has to be set, and has to name an existing directory
# the current user can write to. Neither directory is created here.
for sing_var in SINGULARITY_CACHEDIR SINGULARITY_TMPDIR; do
  # Indirect expansion: value of the variable whose name is in sing_var
  sing_dir="${!sing_var}"
  if [[ -z ${sing_dir} ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$${sing_var} not set" >&2
    exit 1
  fi
  if [[ ! -d ${sing_dir} || ! -w ${sing_dir} ]]; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$${sing_var} ${sing_dir} not writeable for ${USER}" >&2
    exit 1
  fi
done
# Input given as a Windows-style mount point (e.g.,
# "Z:\Streptobacillus\Raw_FQs"): take the part after the drive letter,
# turn its backslashes into forward slashes, and probe the user's home,
# two group areas and the lab home, in that order, for FastQ files.
# The first location holding any is offered as the input directory;
# answering anything but yes ends the script without trying the rest.
if [[ ${IN} =~ ^.+[A-Z]\:.+ ]]; then
  path_slashes_replaced="$(cut -d ':' -f 2 <<< "${IN}" | tr '\\' '/')"
  user_home="${HOME}/${path_slashes_replaced}"
  # No separator after /scicomp here (original note: "Dropped slash between
  # scicomp and var") -- presumably the converted path begins with one.
  user_group_main="/scicomp${path_slashes_replaced}"
  user_group_branch="/scicomp/groups/OID/NCEZID/DHCPP/BSPB/${path_slashes_replaced}"
  lab_home="${LAB_HOME}/${path_slashes_replaced}"
  paths=("${user_home}" "${user_group_main}" "${user_group_branch}" "${lab_home}")
  for path in "${paths[@]}"; do
    # Count readable read files up to two levels down; errors from
    # locations that do not exist are discarded.
    cnt_read_files="$(
      find -L "${path}" -maxdepth 2 -type f -readable \
        -regextype posix-extended \
        -regex ".+_(R)?(1|2)(.+)?\.(fq|fastq)($|\.gz$)" \
        2> /dev/null \
        | wc -l
    )"
    (( cnt_read_files >= 1 )) || continue
    echo -e "${YELLOW_TXT}\n${cnt_read_files} FastQ files found in: ${path}${COLOR_OFF}"
    echo -e "${RED_BG}"
    read -p "Use ${path} as Input Directory? (yes|no) " -n 1 -r
    echo -e "${COLOR_OFF}"
    [[ $REPLY =~ ^[yY] ]] || exit 0
    IN="${path}"
    break
  done
fi
#######################################
# For now, just output into shared LAB_HOME area.
# When OUT looks like a Windows hard-mount path (a capital drive letter
# followed by a colon somewhere after the first character), propose a
# timestamped directory under LAB_HOME and, on "y"/"Y", store that proposal
# in OUT.
# Globals:   OUT (read, written), LAB_HOME, YELLOW_TXT, RED_TXT, RED_BG,
#            COLOR_OFF (read)
# Arguments: none
# Returns:   0 when OUT needs no change or has been replaced.
#            Exits 0 when the user rejects the proposal.
#######################################
redirect_windows_outdir() {
  local proposed_outdir

  [[ ${OUT} =~ ^.+[A-Z]\:.+ ]] || return 0

  echo -e "${YELLOW_TXT}\nWindows hard mount path detected as Output Directory ${COLOR_OFF}"
  proposed_outdir="${LAB_HOME}"/"$(date '+%Y-%b-%d_%a_%H:%M:%S')"
  echo -e "${RED_BG}"
  read -p "Use ${proposed_outdir} as Output Directory? (yes|no) " -n 1 -r
  echo -e "${COLOR_OFF}"
  if [[ $REPLY =~ ^[yY] ]]; then
    # Use the directory that was just offered to the user
    OUT="${proposed_outdir}"
  else
    echo -e "${RED_TXT}\nOkay, bad autoselection? Re-run with a different specified path ${COLOR_OFF}\n"
    exit 0
  fi
}
redirect_windows_outdir
# Input given as a Windows network path, as sent by the Core Facility via
# email (e.g., "\\...cdc.gov\groups\OID\...BCFB\by-instrument\NovaSeq\"):
# replace everything up to and including ".cdc.gov" with the scicomp
# prefix, then turn every backslash into a forward slash to form the
# Linux path.
core_facility_regex='^.+\.cdc\.gov\\.+\\.+'
if [[ ${IN} =~ ${core_facility_regex} ]]; then
  inpath_stripped_hostname="${IN/*.cdc\.gov/\scicomp}"
  inpath_dewindowsed="${inpath_stripped_hostname//\\//}"
  IN="${inpath_dewindowsed}"
fi
#######################################
# Check for specific nextflow logfile for this workflow, which
# means this nextflow workflow has been ran before in the specified outdir.
# Arguments: $1 = output directory to inspect
# Returns:   0 when $1/pipeline_info is a directory and an entry named
#            nextflow_log* exists at most two levels below $1, else non-zero.
#######################################
previous_nextflow_log_exists() {
  [[ -d "${1}/pipeline_info" ]] || return 1
  # grep -q decides the status: success only if find printed something
  find "${1}" -maxdepth 2 -name "nextflow_log*" 2> /dev/null | grep -q '.'
}
if previous_nextflow_log_exists "${OUT}"; then
  prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
fi
# Refuse to mix outputs with the legacy bash workflow: if its .log
# directory and both of its bookkeeping files are present in OUT,
# tell the user to pick another output path and stop.
legacy_log_dir="${OUT}/.log"
if [[ -d "${legacy_log_dir}" \
  && -f "${legacy_log_dir}/trim.asm.annot.job_ids.txt" \
  && -f "${legacy_log_dir}/trim.asm.annot.log.txt" ]]; then
  echo -e "${RED_TXT}Legacy bash workflow detected in ${OUT}\nCombining data from that and this nextflow pipeline is unsupported.\nUse a different output path. ${COLOR_OFF}"
  exit 1
fi
# Add found files to an array.
# Readable *.fq / *.fastq files (optionally .gz) whose name carries an
# _R1/_R2 or _1/_2 style tag are collected up to two levels below IN and
# sorted. Each line of find output becomes exactly one array element, so
# names are neither word-split nor glob-expanded, and IN is quoted so an
# empty value cannot make find fall back to the current directory.
found_files=()
while IFS= read -r fastq_file; do
  found_files+=("${fastq_file}")
done < <(
  find -L "${IN}" \
    -maxdepth 2 \
    -type f \
    -readable \
    -regextype posix-extended \
    -regex ".+_(R)?(1|2)(.+)?\.(fq|fastq)($|\.gz$)" \
    | sort --dictionary-order
)
# Require at least one file and an even input file count for paired-end
# analysis
if (( ${#found_files[@]} < 1 )); then
  echo -e "${RED_TXT}ERROR:${COLOR_OFF} no read sets found" >&2
  exit 1
fi
if (( ${#found_files[@]} % 2 != 0 )); then
  echo -e "${RED_TXT}ERROR:${COLOR_OFF} uneven number (${#found_files[@]}) of FastQ files" >&2
  exit 1
fi
#######################################
# Pair up the FastQ files based on basename (prefix) to submit jobs.
# Consecutive entries of found_files are treated as one pair. The sample
# name is the file's basename up to the first underscore with hyphens,
# periods, commas and backslashes removed. Every sample name is compared
# against all names collected so far, so a repeated name is caught
# wherever it sits in the list.
# Globals:   found_files (read), submitted (reset, then filled with one
#            sample name per pair), RED_TXT, COLOR_OFF (read)
# Arguments: none
# Returns:   0 on success.
#            Exits 1 when a sample name occurs in more than one pair or
#            when the two files of a pair yield different sample names.
#######################################
pair_read_files() {
  local i read1 read2 b1 b2 seen

  submitted=()
  for ((i = 0; i < ${#found_files[@]}; i += 2)); do
    read1="${found_files[i]}"
    read2="${found_files[i + 1]}"
    b1="$(basename -- "${read1}" | cut -d _ -f 1 | sed 's/[-\.,]//g')"
    b2="$(basename -- "${read2}" | cut -d _ -f 1 | sed 's/[-\.,]//g')"
    for seen in "${submitted[@]}"; do
      if [[ "${seen}" == "${b1}" ]]; then
        echo -e "${RED_TXT}ERROR:${COLOR_OFF} ${b1} sample name occurs in >1 paired set" >&2
        exit 1
      fi
    done
    if [[ "${b1}" != "${b2}" ]]; then
      echo -e "${RED_TXT}ERROR:${COLOR_OFF} improperly paired ${b1} ${b2}" >&2
      exit 1
    fi
    submitted+=("${b1}")
  done
}
pair_read_files
#######################################
# Prompt user if 10 or more samples were identified, to prevent
# accidentally assembling too many. The sample names are listed three per
# row before the question is asked.
# Globals:   submitted (read), YELLOW_TXT, RED_TXT, RED_BG, COLOR_OFF (read)
# Arguments: none
# Returns:   0 when fewer than 10 samples exist or the user confirms.
#            Exits 0 when the user declines.
#######################################
confirm_large_submission() {
  local sample

  (( ${#submitted[@]} >= 10 )) || return 0

  echo -e "${YELLOW_TXT}\nWARNING: ${#submitted[@]} samples have been identified:\n${COLOR_OFF}"
  # Quoted expansion: each sample name is printed as stored
  for sample in "${submitted[@]}"; do
    echo -e "${YELLOW_TXT}${sample}${COLOR_OFF}"
  done | xargs -L3 | column -t
  # Prompt user to verify submission of large sample set
  echo -e "${RED_BG}"
  read -p "Do you want to continue submitting ${#submitted[@]} samples? (yes|no) " -n 1 -r
  echo -e "${COLOR_OFF}"
  if [[ $REPLY =~ ^[yY] ]]; then
    echo -e "\nContinuing to submit ${#submitted[@]} samples...\n"
  else
    echo -e "${RED_TXT}\nSubmission cancelled ${COLOR_OFF}"
    exit 0
  fi
}
confirm_large_submission
# Set up log directory in OUT directory
if ! mkdir -p -- "${OUT}/pipeline_info"; then
  echo -e "${RED_TXT}ERROR:${COLOR_OFF} unable to create ${OUT}/pipeline_info" >&2
  exit 1
fi

# Work out which cluster this host belongs to.
# Named hosts are matched exactly; otherwise the hostname with "node"
# removed must be purely numeric: <=230 = biolinux/Aspen, >=231 = rosalind.
# The number is forced to base 10 so zero-padded names are not read as octal.
host_short="${HOSTNAME%%.*}"
NODE_NUM="${host_short/node/}"
cluster=''
case "${host_short}" in
  biolinux | login02)
    cluster='aspen'
    ;;
  rosalind01)
    cluster='rosalind'
    ;;
  *)
    if [[ "${NODE_NUM}" =~ ^[0-9]+$ ]]; then
      if (( 10#${NODE_NUM} <= 230 )); then
        cluster='aspen'
      else
        cluster='rosalind'
      fi
    fi
    ;;
esac

# If FastQ files are able to be submitted, submit and display basenames
if [[ ${#submitted[@]} -ge 1 && -n "${cluster}" ]]; then
  # Options shared by both clusters, in the order they were always passed
  qsub_args=(
    -m ba
    -q all.q
    -v "IN=${IN}"
    -v "OUT=${OUT}"
    -o "${OUT}/pipeline_info"
    -e "${OUT}/pipeline_info"
    -M "${USER}@cdc.gov"
    -v "LAB_HOME=${LAB_HOME}"
    -N "ASM_${#submitted[@]}"
  )
  # Only the Rosalind submission carries a runtime limit request
  if [[ "${cluster}" == 'rosalind' ]]; then
    qsub_args+=(-l max_runtime=72:00:00)
  fi
  qsub_args+=(
    -v "SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR}"
    -v "SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR}"
    -v "NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR}"
  )
  # Submit Nextflow pipeline to the detected HPC; stop if the scheduler
  # rejects the job so no success summary is printed afterwards
  if ! qsub "${qsub_args[@]}" \
    "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow"; then
    echo -e "${RED_TXT}ERROR:${COLOR_OFF} qsub submission failed" >&2
    exit 1
  fi
else
  echo -e "${RED_TXT}Biolinux/Aspen/Rosalind HPC is not detected.\nSubmission cancelled. ${COLOR_OFF}"
  exit 1
fi
# Summarise the submission: the input and output directories, followed by
# the sample names arranged three per row in aligned columns, and finally
# the sequence that switches colouring off.
printf '%b\n' \
  "${GREEN_TXT}\nInput directory:${COLOR_OFF} ${IN}" \
  "${GREEN_TXT}Output directory:${COLOR_OFF} ${OUT}" \
  "${GREEN_TXT}\nGenerating annotated assemblies for:"
for ((sample_idx = 0; sample_idx < ${#submitted[@]}; sample_idx++)); do
  echo "${submitted[sample_idx]}"
done | xargs -L3 | column -t
printf '%b\n' "${COLOR_OFF}"