This repository has been archived by the owner on Feb 6, 2024. It is now read-only.

Commit

Merge pull request #7 from gregorysprenger/dev
v2.0.1
gregorysprenger authored Nov 16, 2023
2 parents 823fa8b + e68f3f9 commit 4c8c47b
Showing 12 changed files with 44 additions and 30 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,20 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.0.1 - November 16, 2023

### `Added`

### `Fixed`

- Made the scripts in the bin directory executable when added to GitHub so that Nextflow can run them (see the sketch below)
- Changed the log directory to the pipeline_info directory in the run_assembly.uge-nextflow scripts
- Removed --gtdb_db from the run_assembly.uge-nextflow scripts until the GTDB-Tk module version is updated
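
The executable-bit fix corresponds to committing a file-mode change (100644 → 100755) rather than editing file contents. A minimal sketch of one way to record that change, assuming a POSIX shell and a recent git client; the two file names are examples drawn from this commit's file list:

# Set the executable bit locally, then record the new mode in the
# index so the 100644 -> 100755 change is committed and Nextflow can
# invoke the bin/ helpers directly.
chmod +x bin/*.py bin/*.sh
git update-index --chmod=+x bin/bash_functions.sh bin/filter.contigs.py
git commit -m "Make bin/ helper scripts executable"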

### `Updated`

### `Deprecated`

## v2.0.0 - November 15, 2023

### `Added`
35 changes: 17 additions & 18 deletions _run_assembly.uge-nextflow
@@ -13,7 +13,7 @@ fi

module load nextflow
nextflow \
-log ${OUT}/log/nextflow_log.${SCRIPT_NAME}.txt \
-log ${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt \
run \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf \
-profile ${HPC} \
@@ -23,14 +23,13 @@ nextflow \
-N ${USER}@cdc.gov \
-w ${OUT}/.work \
--blast_db ${LAB_HOME}/.databases/ncbi \
--gtdb_db ${LAB_HOME}/.databases/GTDB/release207_v2 \
--kraken1_db /scicomp/reference-pure/kraken/OLD/1.0.0/kraken_db \
--kraken2_db ${LAB_HOME}/.databases/kraken2 \
-resume

# Check for errors and add to errors.tsv
# Get nextflow run name
run_name=$(grep "Launching" ${OUT}/log/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1)
run_name=$(grep "Launching" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1)

# Read each line from nextflow log, find info, and add to errors.tsv
while read -r line; do
@@ -40,8 +39,8 @@ while read -r line; do
# If process is already running, clean up error
if [[ "${line}" =~ ^Unable[[:space:]]to[[:space:]]acquire[[:space:]]lock.* ]]; then
error="You are trying to resume the execution of an already running pipeline."
ASM_OUT=$(realpath ${OUT}/log/ASM_*.o*)
echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv
ASM_OUT=$(realpath ${OUT}/pipeline_info/ASM_*.o*)
echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
else
# Workflow ran some processes
sample_name=$(grep "nf-" ${line}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1)
@@ -75,30 +74,30 @@
fi

# If process for sample retried and succeeded, ignore
if [[ -f "$(ls ${OUT}/log/process_logs/${sample_name}.${process}*out)" ]] \
if [[ -f "$(ls ${OUT}/pipeline_info/process_logs/${sample_name}.${process}*out)" ]] \
&& [[ $(cat ${line}/.exitcode) = @(0|143|137|104|134|139|71|255) ]]; then
continue
else
echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/log/errors.tsv
echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv
fi
fi
done < <(nextflow log ${run_name} -filter 'status == "FAILED"')
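
The retry check inside this loop relies on @(0|143|...), which bash treats as an extended-glob alternation on the right-hand side of = inside [[ ]]. A small self-contained sketch of that test; the exit-code list mirrors the script above, while the function name and example path are illustrative:

# Succeeds (exit 0) when the recorded exit code is one the pipeline
# already treats as handled: a clean exit or a kill signal that
# triggered a retry.
is_handled_exit() {
  local workdir="$1"
  local code
  code=$(cat "${workdir}/.exitcode" 2>/dev/null || echo "NA")
  # @(a|b|c) matches any one of the listed values; modern bash accepts
  # this inside [[ ]] without 'shopt -s extglob'.
  [[ "${code}" = @(0|143|137|104|134|139|71|255) ]]
}

# Example:
if is_handled_exit "/scratch/work/ab/0123456789abcdef"; then
  echo "retried and handled; skip the errors.tsv entry"
fi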

# If errors.tsv found..
if [[ -f "${OUT}/log/errors.tsv" ]]; then
if [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
# Add column headers
sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/log/errors.tsv
sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/pipeline_info/errors.tsv

# Remove duplicate lines and lines that have an empty first column
awk '!a[$0]++' ${OUT}/log/errors.tsv \
awk '!a[$0]++' ${OUT}/pipeline_info/errors.tsv \
| awk -F '\t' '$1{print $0}' \
> ${OUT}/log/errors_new.tsv
> ${OUT}/pipeline_info/errors_new.tsv

# Delete original errors.tsv and rename errors_new.tsv
rm ${OUT}/log/errors.tsv
rm ${OUT}/pipeline_info/errors.tsv

mv ${OUT}/log/errors_new.tsv \
${OUT}/log/errors.tsv
mv ${OUT}/pipeline_info/errors_new.tsv \
${OUT}/pipeline_info/errors.tsv
fi
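
The errors.tsv handling above follows a compact pattern: list failed tasks with nextflow log, append one tab-separated row per failure, then prepend a header and drop duplicate rows. A simplified sketch of that pattern using the same pipeline_info layout; the reduced three-column format and the NA fallback are illustrative, not the script's full six-column output:

#!/usr/bin/env bash
set -euo pipefail

OUT="$1"        # pipeline output directory
run_name="$2"   # Nextflow run name reported at launch
errors="${OUT}/pipeline_info/errors.tsv"

# `nextflow log <run> -filter ...` prints one task work directory per line;
# each work directory holds the task's .exitcode file.
while read -r workdir; do
  exit_code=$(cat "${workdir}/.exitcode" 2>/dev/null || echo "NA")
  echo -e "${workdir}\t${exit_code}\t${run_name}" >> "${errors}"
done < <(nextflow log "${run_name}" -filter 'status == "FAILED"')

# Prepend a header, then drop duplicate rows while keeping their order.
if [[ -f "${errors}" ]]; then
  sed -i '1i Work Directory\tExit Code\tRun Name' "${errors}"
  awk '!seen[$0]++' "${errors}" > "${errors}.tmp" && mv "${errors}.tmp" "${errors}"
fi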

# Count lines in Summary.Illumina.GenomeCoverage.tab
@@ -108,12 +107,12 @@

# E-mail completion status
if [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]] \
&& [[ -f "${OUT}/log/errors.tsv" ]]; then
&& [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
echo -e "Assembly and QA finished on $(date)\n${OUT}" | mail \
-s "${num_assemblies} assembled $(basename "${OUT}") [HPC]" \
-S smtp="smtpgw.cdc.gov" \
-a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \
-a "${OUT}/log/errors.tsv" \
-a "${OUT}/pipeline_info/errors.tsv" \
"${USER}@cdc.gov"

elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then
@@ -123,11 +122,11 @@ elif [[ -f "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" ]]; then
-a "${OUT}/qa/Summary.Illumina.GenomeCoverage.tab" \
"${USER}@cdc.gov"

elif [[ -f "${OUT}/log/errors.tsv" ]]; then
elif [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then
echo -e "Assembly and QA could not be completed on $(date)\n${OUT}" | mail \
-s "No assemblies found $(basename "${OUT}") [HPC]" \
-S smtp="smtpgw.cdc.gov" \
-a "${OUT}/log/errors.tsv" \
-a "${OUT}/pipeline_info/errors.tsv" \
"${USER}@cdc.gov"
fi

Empty file modified bin/bash_functions.sh
100644 → 100755
Empty file.
Empty file modified bin/extract.record.from.genbank.py
100644 → 100755
Empty file.
Empty file modified bin/filter.blast.py
100644 → 100755
Empty file.
Empty file modified bin/filter.contigs.py
100644 → 100755
Empty file.
Empty file modified bin/split.multifasta.py
100644 → 100755
Empty file.
Empty file modified bin/summarize_kraken.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion conf/profiles/aspen_hpc.config
@@ -15,7 +15,7 @@ process {
penv = params.sge_penv
queue = params.sge_queue
clusterOptions = params.sge_options
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' }
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }

// Default process_high label is 8h, 4h is sufficient
withLabel:process_high {
2 changes: 1 addition & 1 deletion conf/profiles/rosalind_hpc.config
@@ -16,7 +16,7 @@ process {
penv = params.sge_penv
queue = { task.time <= 4.h ? 'short.q' : task.time > 5.h ? 'all.q' : 'short.q' }
clusterOptions = params.sge_options
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,255] ? 'retry' : 'ignore' }
errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }

// Increase memory for Kraken 1
withName:READ_CLASSIFY_KRAKEN_ONE {
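
Both profile updates above add exit status 250 to the same retry list. A minimal standalone sketch of that errorStrategy pattern in a Nextflow config process scope; the maxRetries value is illustrative and not taken from this repository:

process {
    // Retry tasks killed by transient scheduler/resource conditions
    // (now including exit status 250); ignore other failures so a
    // single bad sample does not abort the whole run.
    errorStrategy = { task.exitStatus in [71,104,134,137,139,140,143,250,255] ? 'retry' : 'ignore' }
    maxRetries    = 2
}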
1 change: 1 addition & 0 deletions modules/local/estimate_genome_size_kmc/main.nf
@@ -2,6 +2,7 @@ process ESTIMATE_GENOME_SIZE_KMC {

tag { "${meta.id}" }
container "gregorysprenger/kmc@sha256:27603041f8c8818aa71a1d0386df17eddca59dbd6441b7e84b78b8a09dc137df"
label "process_medium"

input:
tuple val(meta), path(reads)
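
The new process_medium label only changes resource requests if the label is mapped to values somewhere in the pipeline's configuration. A sketch of the usual nf-core-style mapping, written with the same withLabel syntax used in the profiles above; the cpus, memory, and time values are conventional defaults assumed here, not values read from this repository's base config:

process {
    withLabel:process_medium {
        cpus   = 6
        memory = 36.GB
        time   = 8.h
    }
}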
20 changes: 10 additions & 10 deletions run_assembly.uge-nextflow
@@ -88,7 +88,7 @@ prompt_if_previous_nextflow_run() {
else
# If user doesn't want to continue, ask for a different output path
prompt_new_outdir
prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log"
prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
fi
fi
}
@@ -226,9 +226,9 @@ fi

# Check for specific nextflow logfile for this workflow, which
# means this nextflow workflow has been run before in the specified outdir.
if [ -d "${OUT}/log" ] && \
if [ -d "${OUT}/pipeline_info" ] && \
[ -f $(find "${OUT}" -maxdepth 2 -name "nextflow_log*" | grep -q '.') ]; then
prompt_if_previous_nextflow_run "${OUT}/log" "nextflow_log"
prompt_if_previous_nextflow_run "${OUT}/pipeline_info" "nextflow_log"
fi

# Also check for unsupported behavior combining legacy bash and newer nextflow
@@ -297,7 +297,7 @@ if [[ ${#submitted[@]} -ge 10 ]]; then
fi

# Set up log directory in OUT directory
mkdir -p ${OUT}/log
mkdir -p ${OUT}/pipeline_info

# Get node number - <=230 = biolinux, >=231 = rosalind
NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1')
@@ -312,15 +312,15 @@ if [[ ${#submitted[@]} -ge 1 ]] && \
-q all.q \
-v IN=${IN} \
-v OUT=${OUT} \
-o ${OUT}/log \
-e ${OUT}/log \
-o ${OUT}/pipeline_info \
-e ${OUT}/pipeline_info \
-M ${USER}@cdc.gov \
-v LAB_HOME=${LAB_HOME} \
-N ASM_${#submitted[@]} \
-v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \
-v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
-v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow
${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow

elif [[ ${#submitted[@]} -ge 1 ]] && \
[[ ${HOSTNAME%%.*} == 'rosalind01' ]] || \
@@ -331,16 +331,16 @@ elif [[ ${#submitted[@]} -ge 1 ]] && \
-q all.q \
-v IN=${IN} \
-v OUT=${OUT} \
-o ${OUT}/log \
-e ${OUT}/log \
-o ${OUT}/pipeline_info \
-e ${OUT}/pipeline_info \
-M ${USER}@cdc.gov \
-v LAB_HOME=${LAB_HOME} \
-N ASM_${#submitted[@]} \
-l max_runtime=72:00:00 \
-v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \
-v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
-v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \
${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow
${LAB_HOME}/workflows//wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow

else
echo -e "${RED_TXT}Biolinux/Aspen/Rosalind HPC is not detected.\nSubmission cancelled. ${COLOR_OFF}"
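
The two submission branches above differ mainly in host detection and the max_runtime request; the shared shape is a qsub call that exports the pipeline's environment and writes both stdout and stderr to the new pipeline_info directory. A condensed sketch of that pattern (queue, job name, notification address, and exported variables mirror the script; exact values are illustrative):

# Ensure the -o/-e target exists before submitting the wrapper job.
mkdir -p "${OUT}/pipeline_info"

qsub \
  -q all.q \
  -N "ASM_${#submitted[@]}" \
  -o "${OUT}/pipeline_info" \
  -e "${OUT}/pipeline_info" \
  -M "${USER}@cdc.gov" \
  -v IN="${IN}",OUT="${OUT}",LAB_HOME="${LAB_HOME}",NXF_SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \
  "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow"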
