From c047f946b0c345bc9fc2a4f2a9d2ccc07937ebea Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:09:38 -0500 Subject: [PATCH 1/7] git update index chmod to +x --- bin/bash_functions.sh | 0 bin/extract.record.from.genbank.py | 0 bin/filter.blast.py | 0 bin/filter.contigs.py | 0 bin/split.multifasta.py | 0 bin/summarize_kraken.sh | 0 6 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/bash_functions.sh mode change 100644 => 100755 bin/extract.record.from.genbank.py mode change 100644 => 100755 bin/filter.blast.py mode change 100644 => 100755 bin/filter.contigs.py mode change 100644 => 100755 bin/split.multifasta.py mode change 100644 => 100755 bin/summarize_kraken.sh diff --git a/bin/bash_functions.sh b/bin/bash_functions.sh old mode 100644 new mode 100755 diff --git a/bin/extract.record.from.genbank.py b/bin/extract.record.from.genbank.py old mode 100644 new mode 100755 diff --git a/bin/filter.blast.py b/bin/filter.blast.py old mode 100644 new mode 100755 diff --git a/bin/filter.contigs.py b/bin/filter.contigs.py old mode 100644 new mode 100755 diff --git a/bin/split.multifasta.py b/bin/split.multifasta.py old mode 100644 new mode 100755 diff --git a/bin/summarize_kraken.sh b/bin/summarize_kraken.sh old mode 100644 new mode 100755 From bdf6483b7677906accea5ab5ec7e05c84da595e5 Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:12:33 -0500 Subject: [PATCH 2/7] add exit code 250 for spades --- conf/profiles/aspen_hpc.config | 2 +- conf/profiles/rosalind_hpc.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/profiles/aspen_hpc.config b/conf/profiles/aspen_hpc.config index b254df8..1df6004 100644 --- a/conf/profiles/aspen_hpc.config +++ b/conf/profiles/aspen_hpc.config @@ -15,7 +15,7 @@ process { penv = params.sge_penv queue = params.sge_queue clusterOptions = 
params.sge_options - errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' } + errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' } // Default process_high label is 8h, 4h is sufficient withLabel:process_high { diff --git a/conf/profiles/rosalind_hpc.config b/conf/profiles/rosalind_hpc.config index 7f0ad2e..be8c291 100644 --- a/conf/profiles/rosalind_hpc.config +++ b/conf/profiles/rosalind_hpc.config @@ -16,7 +16,7 @@ process { penv = params.sge_penv queue = { task.time <= 4.h ? 'short.q' : task.time > 5.h ? 'all.q' : 'short.q' } clusterOptions = params.sge_options - errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' } + errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' } // Increase memory for Kraken 1 withName:READ_CLASSIFY_KRAKEN_ONE { From 6266e3d48aeef36448e07ea01307ba4ddbd1aed8 Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:13:27 -0500 Subject: [PATCH 3/7] added label to give kmc more memory --- modules/local/estimate_genome_size_kmc/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/estimate_genome_size_kmc/main.nf b/modules/local/estimate_genome_size_kmc/main.nf index 893181e..613943f 100644 --- a/modules/local/estimate_genome_size_kmc/main.nf +++ b/modules/local/estimate_genome_size_kmc/main.nf @@ -2,6 +2,7 @@ process ESTIMATE_GENOME_SIZE_KMC { tag { "${meta.id}" } container "gregorysprenger/kmc@sha256:27603041f8c8818aa71a1d0386df17eddca59dbd6441b7e84b78b8a09dc137df" + label "process_medium" input: tuple val(meta), path(reads) From ea731397fd1886eedcd0e888f249d0fe748696ba Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:19:34 -0500 Subject: [PATCH 4/7] replace log with pipeline_info --- _run_assembly.uge-nextflow | 34 
+++++++++++++++++----------------- run_assembly.uge-nextflow | 20 ++++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/_run_assembly.uge-nextflow b/_run_assembly.uge-nextflow index 9a60c85..7e37c71 100644 --- a/_run_assembly.uge-nextflow +++ b/_run_assembly.uge-nextflow @@ -13,7 +13,7 @@ fi module load nextflow nextflow \ - -log ${OUT}/log/nextflow_log.${SCRIPT_NAME}.txt \ + -log ${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt \ run \ ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf \ -profile ${HPC} \ @@ -30,7 +30,7 @@ nextflow \ # Check for errors and add to errors.tsv # Get nextflow run name -run_name=$(grep "Launching" ${OUT}/log/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1) +run_name=$(grep "Launching" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1) # Read each line from nextflow log, find info, and add to errors.tsv while read -r line; do @@ -40,8 +40,8 @@ while read -r line; do # If process is already running, clean up error if [[ "${line}" =~ ^Unable[[:space:]]to[[:space:]]acquire[[:space:]]lock.* ]]; then error="You are trying to resume the execution of an already running pipeline." 
- ASM_OUT=$(realpath ${OUT}/log/ASM_*.o*) - echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv + ASM_OUT=$(realpath ${OUT}/pipeline_info/ASM_*.o*) + echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv else # Workflow ran some processes sample_name=$(grep "nf-" ${line}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1) @@ -75,30 +75,30 @@ while read -r line; do fi # If process for sample retried and succeeded, ignore - if [[ -f "$(ls ${OUT}/log/process_logs/${sample_name}.${process}*out)" ]] \ + if [[ -f "$(ls ${OUT}/pipeline_info/process_logs/${sample_name}.${process}*out)" ]] \ && [[ $(cat ${line}/.exitcode) = @(0|143|137|104|134|139|71|255) ]]; then continue else - echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv + echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv fi fi done < <(nextflow log ${run_name} -filter 'status == "FAILED"') # If errors.tsv found.. 
-if [[ -f "${OUT}/log/errors.tsv" ]]; then +if [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then # Add column headers - sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/log/errors.tsv + sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/pipeline_info/errors.tsv # Remove duplicate lines and lines that have an empty first column - awk '!a[$0]++' ${OUT}/log/errors.tsv \ + awk '!a[$0]++' ${OUT}/pipeline_info/errors.tsv \ | awk -F '\t' '$1{print $0}' \ - > ${OUT}/log/errors_new.tsv + > ${OUT}/pipeline_info/errors_new.tsv # Delete original errors.tsv and rename errors_new.tsv - rm ${OUT}/log/errors.tsv + rm ${OUT}/pipeline_info/errors.tsv - mv ${OUT}/log/errors_new.tsv \ - ${OUT}/log/errors.tsv + mv ${OUT}/pipeline_info/errors_new.tsv \ + ${OUT}/pipeline_info/errors.tsv fi # Count lines in Summary.Illumina.GenomeCoverage.tab @@ -108,12 +108,12 @@ fi # E-mail completion status if [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]] \ - && [[ -f "${OUT}/log/errors.tsv" ]]; then + && [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then echo -e "Assembly and QA finished on $(date)\n${OUT}" | mail \ -s "${num_assemblies} assembled $(basename "${OUT}") [HPC]" \ -S smtp="smtpgw.cdc.gov" \ -a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \ - -a "${OUT}/log/errors.tsv" \ + -a "${OUT}/pipeline_info/errors.tsv" \ "${USER}@cdc.gov" elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then @@ -123,11 +123,11 @@ elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then -a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \ "${USER}@cdc.gov" -elif [[ -f "${OUT}/log/errors.tsv" ]]; then +elif [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then echo -e "Assembly and QA could not be completed on $(date)\n${OUT}" | mail \ -s "No assemblies found $(basename "${OUT}") [HPC]" \ -S smtp="smtpgw.cdc.gov" \ - -a "${OUT}/log/errors.tsv" \ + -a "${OUT}/pipeline_info/errors.tsv" \ "${USER}@cdc.gov" fi diff --git 
a/run_assembly.uge-nextflow b/run_assembly.uge-nextflow index 92cea15..b5f4fd1 100644 --- a/run_assembly.uge-nextflow +++ b/run_assembly.uge-nextflow @@ -88,7 +88,7 @@ prompt_if_previous_nextflow_run() { else # If user doesn't want to continue, ask for a different output path prompt_new_outdir - prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log" + prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log" fi fi } @@ -226,9 +226,9 @@ fi # Check for specific nextflow logfile for this workflow, which # means this nextflow workflow has been ran before in the specified outdir. -if [ -d "${OUT}/log" ] && \ +if [ -d "${OUT}/pipeline_info" ] && \ [ -f $(find "${OUT}" -maxdepth 2 -name "nextflow_log*" | grep -q '.') ]; then - prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log" + prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log" fi # Also check for unsupported behavior combining legacy bash and newer nextflow @@ -297,7 +297,7 @@ if [[ ${#submitted[@]} -ge 10 ]]; then fi # Set up log directory in OUT directory -mkdir -p ${OUT}/log +mkdir -p ${OUT}/pipeline_info # Get node number - <=230 = biolinux, >=231 = rosalind NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1') @@ -312,15 +312,15 @@ if [[ ${#submitted[@]} -ge 1 ]] && \ -q all.q \ -v IN=${IN} \ -v OUT=${OUT} \ - -o ${OUT}/log \ - -e ${OUT}/log \ + -o ${OUT}/pipeline_info \ + -e ${OUT}/pipeline_info \ -M ${USER}@cdc.gov \ -v LAB_HOME=${LAB_HOME} \ -N ASM_${#submitted[@]} \ -v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \ -v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ -v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow + ${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow elif [[ ${#submitted[@]} -ge 1 ]] && \ [[ ${HOSTNAME%%.*} == 'rosalind01' ]] || \ @@ -331,8 +331,8 @@ elif [[ ${#submitted[@]} -ge 1 ]] && \ -q all.q \ -v IN=${IN} \ -v OUT=${OUT} \ - -o 
${OUT}/log \ - -e ${OUT}/log \ + -o ${OUT}/pipeline_info \ + -e ${OUT}/pipeline_info \ -M ${USER}@cdc.gov \ -v LAB_HOME=${LAB_HOME} \ -N ASM_${#submitted[@]} \ @@ -340,7 +340,7 @@ elif [[ ${#submitted[@]} -ge 1 ]] && \ -v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \ -v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ -v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow + ${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow else echo -e "${RED_TXT}Biolinux/Aspen/Rosalind HPC is not detected.\nSubmission cancelled. ${COLOR_OFF}" From 03419e8cc01bc4d10f64883d53e9d8a48f3a9b5c Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:20:08 -0500 Subject: [PATCH 5/7] drop gtdb until gtdbtk is updated to newest version --- _run_assembly.uge-nextflow | 1 - 1 file changed, 1 deletion(-) diff --git a/_run_assembly.uge-nextflow b/_run_assembly.uge-nextflow index 7e37c71..496d2ee 100644 --- a/_run_assembly.uge-nextflow +++ b/_run_assembly.uge-nextflow @@ -23,7 +23,6 @@ nextflow \ -N ${USER}@cdc.gov \ -w ${OUT}/.work \ --blast_db ${LAB_HOME}/.databases/ncbi \ - --gtdb_db ${LAB_HOME}/.databases/GTDB/release207_v2 \ --kraken1_db /scicomp/reference-pure/kraken/OLD/1.0.0/kraken_db \ --kraken2_db ${LAB_HOME}/.databases/kraken2 \ -resume From 013303769cdd1c81986b2f8b6fd5c236906c6083 Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:35:30 -0500 Subject: [PATCH 6/7] update changelog --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 560fc12..2e5be7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## v2.0.1 - November 16, 2023 + +### `Added` + +### `Fixed` + +- Made scripts in the bin directory executable when added to GitHub so Nextflow can use them +- Changed log directory to pipeline_info directory in run_assembly.uge-nextflow scripts +- Dropped --gtdb_db from run_assembly.uge-nextflow scripts until GTDB-Tk module version is updated + +### `Updated` + +### `Deprecated` + ## v2.0.0 - November 15, 2023 ### `Added` From e68f3f9ab46c43aa98ccc0dc4712ca224c767faa Mon Sep 17 00:00:00 2001 From: gregorysprenger <42686628+gregorysprenger@users.noreply.github.com> Date: Thu, 16 Nov 2023 09:38:55 -0500 Subject: [PATCH 7/7] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e5be7e..3245493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Made scripts in the bin directory executable when added to GitHub so Nextflow can use them - Changed log directory to pipeline_info directory in run_assembly.uge-nextflow scripts -- Dropped --gtdb_db from run_assembly.uge-nextflow scripts until GTDB-Tk module version is updated +- Removed --gtdb_db from run_assembly.uge-nextflow scripts until GTDB-Tk module version is updated ### `Updated`