add in processing scripts for human data (#40)

* refactor: make json script work with gsutil * feat: add submit.sh * feat: add helper delete_json script * refactor: run ShellCheck, shellfmt, and pylint on some existing scripts * feat: add new helper scripts * feat: refactor status script * feat: continue working on various helper scripts * fix: mo' problems, mo' fixes * feat: add in merge_atac_qc_human.R script, fix some parallel processing, add progress * fix: a couple minor script tweaks * feat: add in complete wrapper and server setup scripts * feat: fix wrapper script with better outputs * fix: add in fixes for the setup and wrapper scripts * feat: update batch scripts * feat: add R, fish, neovim, and qc2tsv install to server setup * feat: clean up scripts * feat: clean up scripts * fix: add samtools quickcheck * fix: use date for job log * fix: update echo command make_json_singleton.sh * feat: clean up src directory, get ready for submission * chore: add latest updates to README.md * feat: add --bar to all GNU parallel jobs * Added scripts used to process pass1ac data. Made minor changes to script to match consistent naming convention for pass data * fix: fix shebang in server setup script * chore: add merge_jsons.py utility script * fix: remove trailing slash from input dir * fix: point to correct script names in wrappers * fix: remove home dir path addition from wrapper script * fix: fix paths in scripts * Fix : remove printing out encode workflow level qc , added vial_label column * fix: fix paths in scripts --------- Co-authored-by: archanaraja <araja7@stanford.edu>
MoTrPAC · Jul 4, 2024 · 7eb6b28 · 7eb6b28
1 parent 21cf116
commit 7eb6b28
Show file tree

Hide file tree

Showing 30 changed files with 2,382 additions and 653 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,6 @@ config/
 metadata/
 .DS_Store
 test_data/
+.idea/
+.fleet/
+stash/
diff --git a/README.md b/README.md
diff --git a/patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch b/patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch
@@ -0,0 +1,18 @@
+Index: atac-seq-pipeline/atac.wdl
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/atac-seq-pipeline/atac.wdl b/atac-seq-pipeline/atac.wdl
+--- a/atac-seq-pipeline/atac.wdl	(revision 46f2a928db20ca4889160b6e2a2e980fac34d672)
++++ b/atac-seq-pipeline/atac.wdl	(date 1663706161074)
+@@ -1990,5 +1990,9 @@
+ 	}
+ 	runtime {
+ 		maxRetries : 0
++		cpu : 1
++		memory : '2 GB'
++		time : 1
++		disks : 'local-disk 10 SSD'
+ 	}
+ }
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,86 @@
+aiohttp==3.8.1
+aiosignal==1.2.0
+argcomplete==1.12.3
+async-timeout==4.0.2
+attrs==22.1.0
+autouri==0.3.0
+awscli==1.25.78
+boto3==1.24.77
+botocore==1.27.77
+bullet==2.2.0
+cachetools==5.2.0
+caper==2.2.2
+certifi==2022.9.14
+cffi==1.15.1
+charset-normalizer==2.1.1
+colorama==0.4.4
+coloredlogs==15.0.1
+contourpy==1.0.5
+croo==0.6.0
+cryptography==38.0.1
+cycler==0.11.0
+dateparser==1.1.1
+decorator==5.1.1
+docker==6.0.0
+docutils==0.16
+filelock==3.8.0
+fonttools==4.37.3
+frozenlist==1.3.1
+fsspec==2022.8.2
+gcsfs==2022.8.2
+google-api-core==2.10.1
+google-auth==2.11.1
+google-auth-oauthlib==0.5.3
+google-cloud-core==2.3.2
+google-cloud-storage==2.5.0
+google-crc32c==1.5.0
+google-resumable-media==2.3.3
+googleapis-common-protos==1.56.4
+graphviz==0.20.1
+humanfriendly==10.0
+idna==3.4
+importlib-metadata==4.12.0
+jmespath==1.0.1
+joblib==1.2.0
+kiwisolver==1.4.4
+lark-parser==0.12.0
+matplotlib==3.6.0
+miniwdl==1.3.3
+multidict==6.0.2
+ntplib==0.4.0
+numpy==1.23.3
+oauthlib==3.2.1
+packaging==21.3
+pandas==1.5.0
+Pillow==9.2.0
+protobuf==4.21.6
+psutil==5.9.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.21
+pygtail==0.12.0
+pyhocon==0.3.59
+pyOpenSSL==22.0.0
+pyparsing==2.4.7
+python-dateutil==2.8.2
+python-json-logger==2.0.4
+pytz==2022.2.1
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==5.4.1
+regex==2022.3.2
+requests==2.28.1
+requests-oauthlib==1.3.1
+rsa==4.7.2
+s3transfer==0.6.0
+scikit-learn==1.1.2
+scipy==1.9.1
+six==1.16.0
+threadpoolctl==3.1.0
+tqdm==4.64.1
+tzdata==2022.2
+tzlocal==4.2
+urllib3==1.26.12
+websocket-client==1.4.1
+xdg==5.1.1
+yarl==1.8.1
+zipp==3.8.1
diff --git a/src/align_stats.sh b/src/align_stats.sh
@@ -5,12 +5,18 @@
 # usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir]
 # make sure the vm has bc and samtools istalled
 # sudo apt install bc
-set -e 
+set -e
 
 cores=$1
 indir=$2
 bamdir=$3
 
+if [ -z "$4" ]; then
+  type="glob"
+else
+  type=$4
+fi
+
 #indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs
 #indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final
 # path to ENCODE outputs, anywhere upstream of the bam files
@@ -20,45 +26,86 @@ bamdir=$3
 
 #module load samtools
 
-cd ${indir}
+cwd=$(pwd)
+cd "$indir"
 mkdir -p idxstats # make outdir
 
-align_stats () {
-	local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
-	local viallabel=$(basename $bam | sed "s/_R1.*//")
-	local primary=idxstats/${viallabel}_primary.bam
-	# already sorted
-	# keep only primary alignments 
-	samtools view -b -F 0x900 ${bam} -o ${primary}
-	# index
-	samtools index ${primary}
-	samtools idxstats ${primary} > idxstats/${viallabel}_chrinfo.txt
-	rm ${primary} ${primary}.bai
+align_stats() {
+  local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
+  local viallabel
+  local primary
+
+  viallabel=$(basename "$bam" | sed "s/_R1.*//")
+  echo "$viallabel"
+
+  if [ -f idxstats/"${viallabel}"_chrinfo.txt ] && samtools quickcheck "$bam" 2&> /dev/null ; then
+    echo "Skipping $viallabel"
+    return
+  fi
 
-	# get counts
-	local total=$(awk '{sum+=$3;}END{print sum;}' idxstats/${viallabel}_chrinfo.txt)
-	local y=$(grep -E "^chrY" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
-	local x=$(grep -E "^chrX" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
-	local mt=$(grep -E "^chrM" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
-	local auto=$(grep -E "^chr[0-9]" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
-	local contig=$(grep -E -v "^chr" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
+  primary=idxstats/${viallabel}_primary.bam
+  # already sorted
+  # keep only primary alignments
+  samtools view -b -F 0x900 "$bam" -o "$primary"
+  # index
+  samtools index "$primary"
+  samtools idxstats "$primary" >"idxstats/${viallabel}_chrinfo.txt"
+  rm "$primary" "${primary}.bai"
 
-	# get fractions
-	local pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
-	local pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
-	local pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
-	local pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
-	local pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')
+  # get counts
+  local total
+  local y
+  local x
+  local mt
+  local auto
+  local contig
+  total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt")
+  y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
+  contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
 
-	# output to file 
-	echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' > idxstats/${viallabel}_chrinfo.csv
-	echo ${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig} >> idxstats/${viallabel}_chrinfo.csv
+  # get fractions
+  local pct_y
+  local pct_x
+  local pct_mt
+  local pct_auto
+  local pct_contig
+  pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')
+
+  echo $viallabel
+  echo $total
+  echo $pct_x
+  echo $pct_y
+  echo $pct_mt
+  echo $pct_auto
+  echo $pct_contig
+  # output to file
+  echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv"
+  echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv"
 }
 export -f align_stats
-#parallel --verbose --jobs ${cores} align_stats ::: $(find -name "*_R1.trim.bam")
-parallel --verbose --jobs ${cores} align_stats ::: $(ls ${bamdir}/*/*/align/rep*/*.trim.bam)
+
+if [ "$type" == "glob" ]; then
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
+elif [ "$type" == "file" ]; then
+  readarray -t raw_bam_list <<<"$bamdir"
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
+elif [ "$type" == "find" ]; then
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
+else
+  echo "type must be glob, file, or find"
+  exit 1
+fi
 
 # collapse
 #head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv
 #for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done
-cat idxstats/*_chrinfo.csv|grep -v "^viallabel"|sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
+cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
+cd "$cwd" || exit 1
+
diff --git a/src/align_stats_concat_only.sh b/src/align_stats_concat_only.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Nicole Gay
+# 13 May 2020
+# ATAC-seq alignment stats
+# usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir]
+# make sure the vm has bc and samtools istalled
+# sudo apt install bc
+set -e
+
+cores=$1
+indir=$2
+bamdir=$3
+
+if [ -z "$4" ]; then
+  type="glob"
+else
+  type=$4
+fi
+
+#indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs
+#indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final
+# path to ENCODE outputs, anywhere upstream of the bam files
+# also where output folder will be made
+#bamdir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output
+#path to the location of unfiltered bam files (*_R1.trim.bam)
+
+#module load samtools
+
+cd "$indir"
+mkdir -p idxstats # make outdir
+
+align_stats() {
+  local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
+  local viallabel
+
+  viallabel=$(basename "$bam" | sed "s/_R1.*//")
+  echo "Processing $viallabel"
+
+  # get counts
+  local total
+  local y
+  local x
+  local mt
+  local auto
+  local contig
+  total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt")
+  y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
+  auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
+  contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
+
+  # get fractions
+  local pct_y
+  local pct_x
+  local pct_mt
+  local pct_auto
+  local pct_contig
+  pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
+  pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')
+
+  # output to file
+  echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv"
+  echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv"
+}
+export -f align_stats
+
+if [ "$type" == "glob" ]; then
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
+elif [ "$type" == "file" ]; then
+  readarray -t raw_bam_list <<<"$bamdir"
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
+elif [ "$type" == "find" ]; then
+  parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
+else
+  echo "type must be glob, file, or find"
+  exit 1
+fi
+
+# collapse
+#head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv
+#for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done
+cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
diff --git a/src/atac-post-process-human-wrapper.sh b/src/atac-post-process-human-wrapper.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Wrapper script for running the ENCODE ATAC-seq pipeline
+
+###
+# working directory needs to be base directory (not src/)
+# USAGE: src/atac-seq-human-wrapper.sh
+###
+
+set -eu
+
+### VARIABLES TO SET ###
+NUM_CORES=12
+BATCH_PREFIX=batch1_20220710
+GCS_BUCKET=my_bucket
+ATAC_OUTPUT_DIR="gs://my_bucket/atac-seq/batch1_20220710/processed"
+SAMPLE_METADATA_FILENAME="batch1_20220710_sample_metadata.csv"
+# set this to the region you want to run in (e.g. us-central1), the region should be compatible with life sciences API
+CROMWELL_OUT_DIR="gs://my_bucket/pipelines/out"
+###
+
+# constants
+OUT_DIR=outputs/
+MOUNT_DIR=~/mnt/
+LOCAL_ATAC_OUT_DIR=${ATAC_OUTPUT_DIR#"gs://"}
+
+echo "Downloading outputs..."
+bash src/croo.sh $OUT_DIR/"$BATCH_PREFIX"_submissions.json $CROMWELL_OUT_DIR/atac/ $ATAC_OUTPUT_DIR/croo true
+
+echo "Mounting GCS bucket..."
+mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"
+mkdir -p "$MOUNT_DIR"/tmp
+gcsfuse --implicit-dirs "$GCS_BUCKET" "$MOUNT_DIR"/"$GCS_BUCKET"
+
+#get qc tsv report
+echo "Creating qc2tsv report..."
+mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"/qc/
+bash src/qc2tsv.sh $ATAC_OUTPUT_DIR/croo "$MOUNT_DIR"/"$GCS_BUCKET"/qc/"$BATCH_PREFIX"_qc.tsv
+
+# reorganize croo outputs for quantification
+echo "Copying files for quantification..."
+bash src/extract_atac_from_gcp_human.sh $NUM_CORES $ATAC_OUTPUT_DIR/ $ATAC_OUTPUT_DIR/croo
+
+# run samtools to generate genome alignment stats
+echo "Generating alignment stats..."
+bash src/align_stats.sh $NUM_CORES "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo
+
+# Create final qc report merging the metadata and workflow qc scores
+echo "Generating merged qc reports..."
+Rscript src/merge_atac_qc_human.R -w "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR/$SAMPLE_METADATA_FILENAME" -q "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo/final/qc/"$BATCH_PREFIX"_qc.tsv -a "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/merged_chr_info.csv -o "$MOUNT_DIR"/${ATAC_OUTPUT_DIR#"gs://"}/
+
+echo "Generating sample counts matrix..."
+bash src/encode_to_count_matrix_human.sh "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR" "$(pwd)"/src/ $NUM_CORES
+
+echo "Done!"