Skip to content

Commit

Permalink
add in processing scripts for human data (#40)
Browse files Browse the repository at this point in the history
* refactor: make json script work with gsutil

* feat: add submit.sh

* feat: add helper delete_json script

* refactor: run ShellCheck, shellfmt, and pylint on some existing scripts

* feat: add new helper scripts

* feat: refactor status script

* feat: continue working on various helper scripts

* fix: mo' problems, mo' fixes

* feat: add in merge_atac_qc_human.R script, fix some parallel processing, add progress

* fix: a couple minor script tweaks

* feat: add in complete wrapper and server setup scripts

* feat: fix wrapper script with better outputs

* fix: add in fixes for the setup and wrapper scripts

* feat: update batch scripts

* feat: add R, fish, neovim, and qc2tsv install to server setup

* feat: clean up scripts

* feat: clean up scripts

* fix: add samtools quickcheck

* fix: use date for job log

* fix: update echo command make_json_singleton.sh

* feat: clean up src directory, get ready for submission

* chore: add latest updates to README.md

* feat: add --bar to all GNU parallel jobs

* Added scripts used to process pass1ac data. Made minor changes to script to match consistent naming convention for pass data

* fix: fix shebang in server setup script

* chore: add merge_jsons.py utility script

* fix: remove trailing slash from input dir

* fix: point to correct script names in wrappers

* fix: remove home dir path addition from wrapper script

* fix: fix paths in scripts

* Fix : remove printing out encode workflow level qc , added vial_label column

* fix: fix paths in scripts

---------

Co-authored-by: archanaraja <araja7@stanford.edu>
  • Loading branch information
mihirsamdarshi and archanaraja committed Jul 4, 2024
1 parent 21cf116 commit 7eb6b28
Show file tree
Hide file tree
Showing 30 changed files with 2,382 additions and 653 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ config/
metadata/
.DS_Store
test_data/
.idea/
.fleet/
stash/
677 changes: 446 additions & 231 deletions README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Index: atac-seq-pipeline/atac.wdl
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/atac-seq-pipeline/atac.wdl b/atac-seq-pipeline/atac.wdl
--- a/atac-seq-pipeline/atac.wdl (revision 46f2a928db20ca4889160b6e2a2e980fac34d672)
+++ b/atac-seq-pipeline/atac.wdl (date 1663706161074)
@@ -1990,5 +1990,9 @@
}
runtime {
maxRetries : 0
+ cpu : 1
+ memory : '2 GB'
+ time : 1
+ disks : 'local-disk 10 SSD'
}
}
86 changes: 86 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
aiohttp==3.8.1
aiosignal==1.2.0
argcomplete==1.12.3
async-timeout==4.0.2
attrs==22.1.0
autouri==0.3.0
awscli==1.25.78
boto3==1.24.77
botocore==1.27.77
bullet==2.2.0
cachetools==5.2.0
caper==2.2.2
certifi==2022.9.14
cffi==1.15.1
charset-normalizer==2.1.1
colorama==0.4.4
coloredlogs==15.0.1
contourpy==1.0.5
croo==0.6.0
cryptography==38.0.1
cycler==0.11.0
dateparser==1.1.1
decorator==5.1.1
docker==6.0.0
docutils==0.16
filelock==3.8.0
fonttools==4.37.3
frozenlist==1.3.1
fsspec==2022.8.2
gcsfs==2022.8.2
google-api-core==2.10.1
google-auth==2.11.1
google-auth-oauthlib==0.5.3
google-cloud-core==2.3.2
google-cloud-storage==2.5.0
google-crc32c==1.5.0
google-resumable-media==2.3.3
googleapis-common-protos==1.56.4
graphviz==0.20.1
humanfriendly==10.0
idna==3.4
importlib-metadata==4.12.0
jmespath==1.0.1
joblib==1.2.0
kiwisolver==1.4.4
lark-parser==0.12.0
matplotlib==3.6.0
miniwdl==1.3.3
multidict==6.0.2
ntplib==0.4.0
numpy==1.23.3
oauthlib==3.2.1
packaging==21.3
pandas==1.5.0
Pillow==9.2.0
protobuf==4.21.6
psutil==5.9.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pygtail==0.12.0
pyhocon==0.3.59
pyOpenSSL==22.0.0
pyparsing==2.4.7
python-dateutil==2.8.2
python-json-logger==2.0.4
pytz==2022.2.1
pytz-deprecation-shim==0.1.0.post0
PyYAML==5.4.1
regex==2022.3.2
requests==2.28.1
requests-oauthlib==1.3.1
rsa==4.7.2
s3transfer==0.6.0
scikit-learn==1.1.2
scipy==1.9.1
six==1.16.0
threadpoolctl==3.1.0
tqdm==4.64.1
tzdata==2022.2
tzlocal==4.2
urllib3==1.26.12
websocket-client==1.4.1
xdg==5.1.1
yarl==1.8.1
zipp==3.8.1
111 changes: 79 additions & 32 deletions src/align_stats.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
# usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir]
# make sure the vm has bc and samtools istalled
# sudo apt install bc
set -e
set -e

cores=$1
indir=$2
bamdir=$3

if [ -z "$4" ]; then
type="glob"
else
type=$4
fi

#indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs
#indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final
# path to ENCODE outputs, anywhere upstream of the bam files
Expand All @@ -20,45 +26,86 @@ bamdir=$3

#module load samtools

cd ${indir}
cwd=$(pwd)
cd "$indir"
mkdir -p idxstats # make outdir

align_stats () {
local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
local viallabel=$(basename $bam | sed "s/_R1.*//")
local primary=idxstats/${viallabel}_primary.bam
# already sorted
# keep only primary alignments
samtools view -b -F 0x900 ${bam} -o ${primary}
# index
samtools index ${primary}
samtools idxstats ${primary} > idxstats/${viallabel}_chrinfo.txt
rm ${primary} ${primary}.bai
align_stats() {
local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
local viallabel
local primary

viallabel=$(basename "$bam" | sed "s/_R1.*//")
echo "$viallabel"

if [ -f idxstats/"${viallabel}"_chrinfo.txt ] && samtools quickcheck "$bam" 2&> /dev/null ; then
echo "Skipping $viallabel"
return
fi

# get counts
local total=$(awk '{sum+=$3;}END{print sum;}' idxstats/${viallabel}_chrinfo.txt)
local y=$(grep -E "^chrY" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
local x=$(grep -E "^chrX" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
local mt=$(grep -E "^chrM" idxstats/${viallabel}_chrinfo.txt | cut -f 3)
local auto=$(grep -E "^chr[0-9]" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
local contig=$(grep -E -v "^chr" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
primary=idxstats/${viallabel}_primary.bam
# already sorted
# keep only primary alignments
samtools view -b -F 0x900 "$bam" -o "$primary"
# index
samtools index "$primary"
samtools idxstats "$primary" >"idxstats/${viallabel}_chrinfo.txt"
rm "$primary" "${primary}.bai"

# get fractions
local pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
local pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
local pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
local pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
local pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')
# get counts
local total
local y
local x
local mt
local auto
local contig
total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt")
y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')

# output to file
echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' > idxstats/${viallabel}_chrinfo.csv
echo ${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig} >> idxstats/${viallabel}_chrinfo.csv
# get fractions
local pct_y
local pct_x
local pct_mt
local pct_auto
local pct_contig
pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')

echo $viallabel
echo $total
echo $pct_x
echo $pct_y
echo $pct_mt
echo $pct_auto
echo $pct_contig
# output to file
echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv"
echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv"
}
export -f align_stats
#parallel --verbose --jobs ${cores} align_stats ::: $(find -name "*_R1.trim.bam")
parallel --verbose --jobs ${cores} align_stats ::: $(ls ${bamdir}/*/*/align/rep*/*.trim.bam)

if [ "$type" == "glob" ]; then
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
elif [ "$type" == "file" ]; then
readarray -t raw_bam_list <<<"$bamdir"
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
elif [ "$type" == "find" ]; then
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
else
echo "type must be glob, file, or find"
exit 1
fi

# collapse
#head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv
#for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done
cat idxstats/*_chrinfo.csv|grep -v "^viallabel"|sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
cd "$cwd" || exit 1

86 changes: 86 additions & 0 deletions src/align_stats_concat_only.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# Nicole Gay
# 13 May 2020
# ATAC-seq alignment stats
# usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir]
# make sure the vm has bc and samtools istalled
# sudo apt install bc
set -e

cores=$1
indir=$2
bamdir=$3

if [ -z "$4" ]; then
type="glob"
else
type=$4
fi

#indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs
#indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final
# path to ENCODE outputs, anywhere upstream of the bam files
# also where output folder will be made
#bamdir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output
#path to the location of unfiltered bam files (*_R1.trim.bam)

#module load samtools

cd "$indir"
mkdir -p idxstats # make outdir

align_stats() {
local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM
local viallabel

viallabel=$(basename "$bam" | sed "s/_R1.*//")
echo "Processing $viallabel"

# get counts
local total
local y
local x
local mt
local auto
local contig
total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt")
y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3)
auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')
contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}')

# get fractions
local pct_y
local pct_x
local pct_mt
local pct_auto
local pct_contig
pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./')
pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./')
pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./')
pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./')
pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./')

# output to file
echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv"
echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv"
}
export -f align_stats

if [ "$type" == "glob" ]; then
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)"
elif [ "$type" == "file" ]; then
readarray -t raw_bam_list <<<"$bamdir"
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}"
elif [ "$type" == "find" ]; then
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")"
else
echo "type must be glob, file, or find"
exit 1
fi

# collapse
#head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv
#for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done
cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv
54 changes: 54 additions & 0 deletions src/atac-post-process-human-wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Wrapper script for running the ENCODE ATAC-seq pipeline

###
# working directory needs to be base directory (not src/)
# USAGE: src/atac-seq-human-wrapper.sh
###

set -eu

### VARIABLES TO SET ###
NUM_CORES=12
BATCH_PREFIX=batch1_20220710
GCS_BUCKET=my_bucket
ATAC_OUTPUT_DIR="gs://my_bucket/atac-seq/batch1_20220710/processed"
SAMPLE_METADATA_FILENAME="batch1_20220710_sample_metadata.csv"
# set this to the region you want to run in (e.g. us-central1), the region should be compatible with life sciences API
CROMWELL_OUT_DIR="gs://my_bucket/pipelines/out"
###

# constants
OUT_DIR=outputs/
MOUNT_DIR=~/mnt/
LOCAL_ATAC_OUT_DIR=${ATAC_OUTPUT_DIR#"gs://"}

echo "Downloading outputs..."
bash src/croo.sh $OUT_DIR/"$BATCH_PREFIX"_submissions.json $CROMWELL_OUT_DIR/atac/ $ATAC_OUTPUT_DIR/croo true

echo "Mounting GCS bucket..."
mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"
mkdir -p "$MOUNT_DIR"/tmp
gcsfuse --implicit-dirs "$GCS_BUCKET" "$MOUNT_DIR"/"$GCS_BUCKET"

#get qc tsv report
echo "Creating qc2tsv report..."
mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"/qc/
bash src/qc2tsv.sh $ATAC_OUTPUT_DIR/croo "$MOUNT_DIR"/"$GCS_BUCKET"/qc/"$BATCH_PREFIX"_qc.tsv

# reorganize croo outputs for quantification
echo "Copying files for quantification..."
bash src/extract_atac_from_gcp_human.sh $NUM_CORES $ATAC_OUTPUT_DIR/ $ATAC_OUTPUT_DIR/croo

# run samtools to generate genome alignment stats
echo "Generating alignment stats..."
bash src/align_stats.sh $NUM_CORES "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo

# Create final qc report merging the metadata and workflow qc scores
echo "Generating merged qc reports..."
Rscript src/merge_atac_qc_human.R -w "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR/$SAMPLE_METADATA_FILENAME" -q "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo/final/qc/"$BATCH_PREFIX"_qc.tsv -a "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/merged_chr_info.csv -o "$MOUNT_DIR"/${ATAC_OUTPUT_DIR#"gs://"}/

echo "Generating sample counts matrix..."
bash src/encode_to_count_matrix_human.sh "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR" "$(pwd)"/src/ $NUM_CORES

echo "Done!"
Loading

0 comments on commit 7eb6b28

Please sign in to comment.