This repository has been archived by the owner on Feb 6, 2024. It is now read-only.

Commit

Merge pull request #7 from gregorysprenger/dev
v2.0.1
gregorysprenger authored Nov 16, 2023
2 parents 823fa8b + e68f3f9 commit 4c8c47b
Showing 12 changed files with 44 additions and 30 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,20 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.0.1 - November 16, 2023

### `Added`

### `Fixed`

- Made the scripts in the bin directory executable when added to GitHub so that Nextflow can run them (see the sketch below)
- Changed the log directory to the pipeline_info directory in the run_assembly.uge-nextflow scripts
- Removed --gtdb_db from the run_assembly.uge-nextflow scripts until the GTDB-Tk module version is updated
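
The executable-bit fix corresponds to committing a file-mode change (100644 → 100755) rather than editing file contents. A minimal sketch of one way to record that change, assuming a POSIX shell and a recent git client; the two file names are examples drawn from this commit's file list:

# Set the executable bit locally, then record the new mode in the
# index so the 100644 -> 100755 change is committed and Nextflow can
# invoke the bin/ helpers directly.
chmod +x bin/*.py bin/*.sh
git update-index --chmod=+x bin/bash_functions.sh bin/filter.contigs.py
git commit -m "Make bin/ helper scripts executable"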

### `Updated`

### `Deprecated`

## v2.0.0 - November 15, 2023

### `Added`
35 changes: 17 additions & 18 deletions _run_assembly.uge-nextflow
@@ -13,7 +13,7 @@ fi

module load nextflow
nextflow \
-log ${OUT}/log/nextflow_log.${SCRIPT_NAME}.txt \
-log ${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt \
run \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf \
-profile ${HPC} \
@@ -23,14 +23,13 @@ nextflow \
-N ${USER}@cdc.gov \
-w ${OUT}/.work \
--blast_db ${LAB_HOME}/.databases/ncbi \
--gtdb_db ${LAB_HOME}/.databases/GTDB/release207_v2 \
--kraken1_db /scicomp/reference-pure/kraken/OLD/1.0.0/kraken_db \
--kraken2_db ${LAB_HOME}/.databases/kraken2 \
-resume

# Check for errors and add to errors.tsv
# Get nextflow run name
run_name=$(grep "Launching" ${OUT}/log/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1)
run_name=$(grep "Launching" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1)

# Read each line from nextflow log, find info, and add to errors.tsv
while read -r line; do
@@ -40,8 +39,8 @@ while read -r line; do
# If process is already running, clean up error
if [[ "${line}" =~ ^Unable[[:space:]]to[[:space:]]acquire[[:space:]]lock.* ]]; then
error="You are trying to resume the execution of an already running pipeline."
ASM_OUT=$(realpath ${OUT}/log/ASM_*.o*)
echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv
ASM_OUT=$(realpath ${OUT}/pipeline_info/ASM_*.o*)
echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
else
# Workflow ran some processes
sample_name=$(grep "nf-" ${line}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1)
@@ -75,30 +74,30 @@
fi

# If process for sample retried and succeeded, ignore
if [[ -f "$(ls ${OUT}/log/process_logs/${sample_name}.${process}*out)" ]] \
if [[ -f "$(ls ${OUT}/pipeline_info/process_logs/${sample_name}.${process}*out)" ]] \
&& [[ $(cat ${line}/.exitcode) = @(0|143|137|104|134|139|71|255) ]]; then
continue
else
echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv
echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
fi
fi
done < <(nextflow log ${run_name} -filter 'status == "FAILED"')
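
The retry check inside this loop relies on @(0|143|...), which bash treats as an extended-glob alternation on the right-hand side of = inside [[ ]]. A small self-contained sketch of that test; the exit-code list mirrors the script above, while the function name and example path are illustrative:

# Succeeds (exit 0) when the recorded exit code is one the pipeline
# already treats as handled: a clean exit or a kill signal that
# triggered a retry.
is_handled_exit() {
  local workdir="$1"
  local code
  code=$(cat "${workdir}/.exitcode" 2>/dev/null || echo "NA")
  # @(a|b|c) matches any one of the listed values; modern bash accepts
  # this inside [[ ]] without 'shopt -s extglob'.
  [[ "${code}" = @(0|143|137|104|134|139|71|255) ]]
}

# Example:
if is_handled_exit "/scratch/work/ab/0123456789abcdef"; then
  echo "retried and handled; skip the errors.tsv entry"
fi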

# If errors.tsv found..
if [[ -f "${OUT}/log/errors.tsv" ]]; then
if [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
# Add column headers
sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/log/errors.tsv
sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/pipeline_info/errors.tsv

# Remove duplicate lines and lines that have an empty first column
awk '!a[$0]++' ${OUT}/log/errors.tsv \
awk '!a[$0]++' ${OUT}/pipeline_info/errors.tsv \
| awk -F '\t' '$1{print $0}' \
> ${OUT}/log/errors_new.tsv
> ${OUT}/pipeline_info/errors_new.tsv

# Delete original errors.tsv and rename errors_new.tsv
rm ${OUT}/log/errors.tsv
rm ${OUT}/pipeline_info/errors.tsv

mv ${OUT}/log/errors_new.tsv \
${OUT}/log/errors.tsv
mv ${OUT}/pipeline_info/errors_new.tsv \
${OUT}/pipeline_info/errors.tsv
fi
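
The errors.tsv handling above follows a compact pattern: list failed tasks with nextflow log, append one tab-separated row per failure, then prepend a header and drop duplicate rows. A simplified sketch of that pattern using the same pipeline_info layout; the reduced three-column format and the NA fallback are illustrative, not the script's full six-column output:

#!/usr/bin/env bash
set -euo pipefail

OUT="$1"        # pipeline output directory
run_name="$2"   # Nextflow run name reported at launch
errors="${OUT}/pipeline_info/errors.tsv"

# `nextflow log <run> -filter ...` prints one task work directory per line;
# each work directory holds the task's .exitcode file.
while read -r workdir; do
  exit_code=$(cat "${workdir}/.exitcode" 2>/dev/null || echo "NA")
  echo -e "${workdir}\t${exit_code}\t${run_name}" >> "${errors}"
done < <(nextflow log "${run_name}" -filter 'status == "FAILED"')

# Prepend a header, then drop duplicate rows while keeping their order.
if [[ -f "${errors}" ]]; then
  sed -i '1i Work Directory\tExit Code\tRun Name' "${errors}"
  awk '!seen[$0]++' "${errors}" > "${errors}.tmp" && mv "${errors}.tmp" "${errors}"
fi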

# Count lines in Summary.Illumina.GenomeCoverage.tab
@@ -108,12 +107,12 @@

# E-mail completion status
if [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]] \
&& [[ -f "${OUT}/log/errors.tsv" ]]; then
&& [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
echo -e "Assembly and QA finished on $(date)\n${OUT}" | mail \
-s "${num_assemblies} assembled $(basename "${OUT}") [HPC]" \
-S smtp="smtpgw.cdc.gov" \
-a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \
-a "${OUT}/log/errors.tsv" \
-a "${OUT}/pipeline_info/errors.tsv" \
"${USER}@cdc.gov"

elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then
@@ -123,11 +122,11 @@ elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then
-a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \
"${USER}@cdc.gov"

elif [[ -f "${OUT}/log/errors.tsv" ]]; then
elif [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
echo -e "Assembly and QA could not be completed on $(date)\n${OUT}" | mail \
-s "No assemblies found $(basename "${OUT}") [HPC]" \
-S smtp="smtpgw.cdc.gov" \
-a "${OUT}/log/errors.tsv" \
-a "${OUT}/pipeline_info/errors.tsv" \
"${USER}@cdc.gov"
fi

Empty file modified bin/bash_functions.sh
100644 → 100755
Empty file.
Empty file modified bin/extract.record.from.genbank.py
100644 → 100755
Empty file.
Empty file modified bin/filter.blast.py
100644 → 100755
Empty file.
Empty file modified bin/filter.contigs.py
100644 → 100755
Empty file.
Empty file modified bin/split.multifasta.py
100644 → 100755
Empty file.
Empty file modified bin/summarize_kraken.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion conf/profiles/aspen_hpc.config
@@ -15,7 +15,7 @@ process {
penv = params.sge_penv
queue = params.sge_queue
clusterOptions = params.sge_options
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' }
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }

// Default process_high label is 8h, 4h is sufficient
withLabel:process_high {
2 changes: 1 addition & 1 deletion conf/profiles/rosalind_hpc.config
@@ -16,7 +16,7 @@ process {
penv = params.sge_penv
queue = { task.time <= 4.h ? 'short.q' : task.time > 5.h ? 'all.q' : 'short.q' }
clusterOptions = params.sge_options
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' }
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }

// Increase memory for Kraken 1
withName:READ_CLASSIFY_KRAKEN_ONE {
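
Both profile updates above add exit status 250 to the same retry list. A minimal standalone sketch of that errorStrategy pattern in a Nextflow config process scope; the maxRetries value is illustrative and not taken from this repository:

process {
    // Retry tasks killed by transient scheduler/resource conditions
    // (now including exit status 250); ignore other failures so a
    // single bad sample does not abort the whole run.
    errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }
    maxRetries    = 2
}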
1 change: 1 addition & 0 deletions modules/local/estimate_genome_size_kmc/main.nf
@@ -2,6 +2,7 @@ process ESTIMATE_GENOME_SIZE_KMC {

tag { "${meta.id}" }
container "gregorysprenger/kmc@sha256:27603041f8c8818aa71a1d0386df17eddca59dbd6441b7e84b78b8a09dc137df"
label "process_medium"

input:
tuple val(meta), path(reads)
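
The new process_medium label only changes resource requests if the label is mapped to values somewhere in the pipeline's configuration. A sketch of the usual nf-core-style mapping, written with the same withLabel syntax used in the profiles above; the cpus, memory, and time values are conventional defaults assumed here, not values read from this repository's base config:

process {
    withLabel:process_medium {
        cpus   = 6
        memory = 36.GB
        time   = 8.h
    }
}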
20 changes: 10 additions & 10 deletions run_assembly.uge-nextflow
@@ -88,7 +88,7 @@ prompt_if_previous_nextflow_run() {
else
# If user doesn't want to continue, ask for a different output path
prompt_new_outdir
prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log"
prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
fi
fi
}
@@ -226,9 +226,9 @@ fi

# Check for specific nextflow logfile for this workflow, which
# means this nextflow workflow has been run before in the specified outdir.
if [ -d "${OUT}/log" ] && \
if [ -d "${OUT}/pipeline_info" ] && \
[ -f $(find "${OUT}" -maxdepth 2 -name "nextflow_log*" | grep -q '.') ]; then
prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log"
prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
fi

# Also check for unsupported behavior combining legacy bash and newer nextflow
@@ -297,7 +297,7 @@ if [[ ${#submitted[@]} -ge 10 ]]; then
fi

# Set up log directory in OUT directory
mkdir -p ${OUT}/log
mkdir -p ${OUT}/pipeline_info

# Get node number - <=230 = biolinux, >=231 = rosalind
NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1')
@@ -312,15 +312,15 @@ if [[ ${#submitted[@]} -ge 1 ]] && \
-q all.q \
-v IN=${IN} \
-v OUT=${OUT} \
-o ${OUT}/log \
-e ${OUT}/log \
-o ${OUT}/pipeline_info \
-e ${OUT}/pipeline_info \
-M ${USER}@cdc.gov \
-v LAB_HOME=${LAB_HOME} \
-N ASM_${#submitted[@]} \
-v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \
-v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
-v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow
${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow

elif [[ ${#submitted[@]} -ge 1 ]] && \
[[ ${HOSTNAME%%.*} == 'rosalind01' ]] || \
@@ -331,16 +331,16 @@ elif [[ ${#submitted[@]} -ge 1 ]] && \
-q all.q \
-v IN=${IN} \
-v OUT=${OUT} \
-o ${OUT}/log \
-e ${OUT}/log \
-o ${OUT}/pipeline_info \
-e ${OUT}/pipeline_info \
-M ${USER}@cdc.gov \
-v LAB_HOME=${LAB_HOME} \
-N ASM_${#submitted[@]} \
-l max_runtime=72:00:00 \
-v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \
-v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
-v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow
${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow

else
echo -e "${RED_TXT}Biolinux/Aspen/Rosalind HPC is not detected.\nSubmission cancelled. ${COLOR_OFF}"
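
The two submission branches above differ mainly in host detection and the max_runtime request; the shared shape is a qsub call that exports the pipeline's environment and writes both stdout and stderr to the new pipeline_info directory. A condensed sketch of that pattern (queue, job name, notification address, and exported variables mirror the script; exact values are illustrative):

# Ensure the -o/-e target exists before submitting the wrapper job.
mkdir -p "${OUT}/pipeline_info"

qsub \
  -q all.q \
  -N "ASM_${#submitted[@]}" \
  -o "${OUT}/pipeline_info" \
  -e "${OUT}/pipeline_info" \
  -M "${USER}@cdc.gov" \
  -v IN="${IN}",OUT="${OUT}",LAB_HOME="${LAB_HOME}",NXF_SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \
  "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow"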
