-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add in processing scripts for human data (#40)
* refactor: make json script work with gsutil * feat: add submit.sh * feat: add helper delete_json script * refactor: run ShellCheck, shellfmt, and pylint on some existing scripts * feat: add new helper scripts * feat: refactor status script * feat: continue working on various helper scripts * fix: mo' problems, mo' fixes * feat: add in merge_atac_qc_human.R script, fix some parallel processing, add progress * fix: a couple minor script tweaks * feat: add in complete wrapper and server setup scripts * feat: fix wrapper script with better outputs * fix: add in fixes for the setup and wrapper scripts * feat: update batch scripts * feat: add R, fish, neovim, and qc2tsv install to server setup * feat: clean up scripts * feat: clean up scripts * fix: add samtools quickcheck * fix: use date for job log * fix: update echo command make_json_singleton.sh * feat: clean up src directory, get ready for submission * chore: add latest updates to README.md * feat: add --bar to all GNU parallel jobs * Added scripts used to process pass1ac data. Made minor changes to script to match consistent naming convention for pass data * fix: fix shebang in server setup script * chore: add merge_jsons.py utility script * fix: remove trailing slash from input dir * fix: point to correct script names in wrappers * fix: remove home dir path addition from wrapper script * fix: fix paths in scripts * Fix : remove printing out encode workflow level qc , added vial_label column * fix: fix paths in scripts --------- Co-authored-by: archanaraja <araja7@stanford.edu>
- Loading branch information
1 parent
21cf116
commit 7eb6b28
Showing
30 changed files
with
2,382 additions
and
653 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,6 @@ config/ | |
metadata/ | ||
.DS_Store | ||
test_data/ | ||
.idea/ | ||
.fleet/ | ||
stash/ |
18 changes: 18 additions & 0 deletions
18
patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Index: atac-seq-pipeline/atac.wdl | ||
IDEA additional info: | ||
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP | ||
<+>UTF-8 | ||
=================================================================== | ||
diff --git a/atac-seq-pipeline/atac.wdl b/atac-seq-pipeline/atac.wdl | ||
--- a/atac-seq-pipeline/atac.wdl (revision 46f2a928db20ca4889160b6e2a2e980fac34d672) | ||
+++ b/atac-seq-pipeline/atac.wdl (date 1663706161074) | ||
@@ -1990,5 +1990,9 @@ | ||
} | ||
runtime { | ||
maxRetries : 0 | ||
+ cpu : 1 | ||
+ memory : '2 GB' | ||
+ time : 1 | ||
+ disks : 'local-disk 10 SSD' | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
aiohttp==3.8.1 | ||
aiosignal==1.2.0 | ||
argcomplete==1.12.3 | ||
async-timeout==4.0.2 | ||
attrs==22.1.0 | ||
autouri==0.3.0 | ||
awscli==1.25.78 | ||
boto3==1.24.77 | ||
botocore==1.27.77 | ||
bullet==2.2.0 | ||
cachetools==5.2.0 | ||
caper==2.2.2 | ||
certifi==2022.9.14 | ||
cffi==1.15.1 | ||
charset-normalizer==2.1.1 | ||
colorama==0.4.4 | ||
coloredlogs==15.0.1 | ||
contourpy==1.0.5 | ||
croo==0.6.0 | ||
cryptography==38.0.1 | ||
cycler==0.11.0 | ||
dateparser==1.1.1 | ||
decorator==5.1.1 | ||
docker==6.0.0 | ||
docutils==0.16 | ||
filelock==3.8.0 | ||
fonttools==4.37.3 | ||
frozenlist==1.3.1 | ||
fsspec==2022.8.2 | ||
gcsfs==2022.8.2 | ||
google-api-core==2.10.1 | ||
google-auth==2.11.1 | ||
google-auth-oauthlib==0.5.3 | ||
google-cloud-core==2.3.2 | ||
google-cloud-storage==2.5.0 | ||
google-crc32c==1.5.0 | ||
google-resumable-media==2.3.3 | ||
googleapis-common-protos==1.56.4 | ||
graphviz==0.20.1 | ||
humanfriendly==10.0 | ||
idna==3.4 | ||
importlib-metadata==4.12.0 | ||
jmespath==1.0.1 | ||
joblib==1.2.0 | ||
kiwisolver==1.4.4 | ||
lark-parser==0.12.0 | ||
matplotlib==3.6.0 | ||
miniwdl==1.3.3 | ||
multidict==6.0.2 | ||
ntplib==0.4.0 | ||
numpy==1.23.3 | ||
oauthlib==3.2.1 | ||
packaging==21.3 | ||
pandas==1.5.0 | ||
Pillow==9.2.0 | ||
protobuf==4.21.6 | ||
psutil==5.9.2 | ||
pyasn1==0.4.8 | ||
pyasn1-modules==0.2.8 | ||
pycparser==2.21 | ||
pygtail==0.12.0 | ||
pyhocon==0.3.59 | ||
pyOpenSSL==22.0.0 | ||
pyparsing==2.4.7 | ||
python-dateutil==2.8.2 | ||
python-json-logger==2.0.4 | ||
pytz==2022.2.1 | ||
pytz-deprecation-shim==0.1.0.post0 | ||
PyYAML==5.4.1 | ||
regex==2022.3.2 | ||
requests==2.28.1 | ||
requests-oauthlib==1.3.1 | ||
rsa==4.7.2 | ||
s3transfer==0.6.0 | ||
scikit-learn==1.1.2 | ||
scipy==1.9.1 | ||
six==1.16.0 | ||
threadpoolctl==3.1.0 | ||
tqdm==4.64.1 | ||
tzdata==2022.2 | ||
tzlocal==4.2 | ||
urllib3==1.26.12 | ||
websocket-client==1.4.1 | ||
xdg==5.1.1 | ||
yarl==1.8.1 | ||
zipp==3.8.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash | ||
# Nicole Gay | ||
# 13 May 2020 | ||
# ATAC-seq alignment stats | ||
# usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir] | ||
# make sure the vm has bc and samtools istalled | ||
# sudo apt install bc | ||
set -e | ||
|
||
cores=$1 | ||
indir=$2 | ||
bamdir=$3 | ||
|
||
if [ -z "$4" ]; then | ||
type="glob" | ||
else | ||
type=$4 | ||
fi | ||
|
||
#indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs | ||
#indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final | ||
# path to ENCODE outputs, anywhere upstream of the bam files | ||
# also where output folder will be made | ||
#bamdir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output | ||
#path to the location of unfiltered bam files (*_R1.trim.bam) | ||
|
||
#module load samtools | ||
|
||
cd "$indir" | ||
mkdir -p idxstats # make outdir | ||
|
||
align_stats() { | ||
local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM | ||
local viallabel | ||
|
||
viallabel=$(basename "$bam" | sed "s/_R1.*//") | ||
echo "Processing $viallabel" | ||
|
||
# get counts | ||
local total | ||
local y | ||
local x | ||
local mt | ||
local auto | ||
local contig | ||
total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt") | ||
y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) | ||
x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) | ||
mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) | ||
auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') | ||
contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') | ||
|
||
# get fractions | ||
local pct_y | ||
local pct_x | ||
local pct_mt | ||
local pct_auto | ||
local pct_contig | ||
pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./') | ||
pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./') | ||
pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./') | ||
pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./') | ||
pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./') | ||
|
||
# output to file | ||
echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv" | ||
echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv" | ||
} | ||
export -f align_stats | ||
|
||
if [ "$type" == "glob" ]; then | ||
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)" | ||
elif [ "$type" == "file" ]; then | ||
readarray -t raw_bam_list <<<"$bamdir" | ||
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}" | ||
elif [ "$type" == "find" ]; then | ||
parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")" | ||
else | ||
echo "type must be glob, file, or find" | ||
exit 1 | ||
fi | ||
|
||
# collapse | ||
#head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv | ||
#for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done | ||
cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env bash | ||
# Wrapper script for running the ENCODE ATAC-seq pipeline | ||
|
||
### | ||
# working directory needs to be base directory (not src/) | ||
# USAGE: src/atac-seq-human-wrapper.sh | ||
### | ||
|
||
set -eu | ||
|
||
### VARIABLES TO SET ### | ||
NUM_CORES=12 | ||
BATCH_PREFIX=batch1_20220710 | ||
GCS_BUCKET=my_bucket | ||
ATAC_OUTPUT_DIR="gs://my_bucket/atac-seq/batch1_20220710/processed" | ||
SAMPLE_METADATA_FILENAME="batch1_20220710_sample_metadata.csv" | ||
# set this to the region you want to run in (e.g. us-central1), the region should be compatible with life sciences API | ||
CROMWELL_OUT_DIR="gs://my_bucket/pipelines/out" | ||
### | ||
|
||
# constants | ||
OUT_DIR=outputs/ | ||
MOUNT_DIR=~/mnt/ | ||
LOCAL_ATAC_OUT_DIR=${ATAC_OUTPUT_DIR#"gs://"} | ||
|
||
echo "Downloading outputs..." | ||
bash src/croo.sh $OUT_DIR/"$BATCH_PREFIX"_submissions.json $CROMWELL_OUT_DIR/atac/ $ATAC_OUTPUT_DIR/croo true | ||
|
||
echo "Mounting GCS bucket..." | ||
mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET" | ||
mkdir -p "$MOUNT_DIR"/tmp | ||
gcsfuse --implicit-dirs "$GCS_BUCKET" "$MOUNT_DIR"/"$GCS_BUCKET" | ||
|
||
#get qc tsv report | ||
echo "Creating qc2tsv report..." | ||
mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"/qc/ | ||
bash src/qc2tsv.sh $ATAC_OUTPUT_DIR/croo "$MOUNT_DIR"/"$GCS_BUCKET"/qc/"$BATCH_PREFIX"_qc.tsv | ||
|
||
# reorganize croo outputs for quantification | ||
echo "Copying files for quantification..." | ||
bash src/extract_atac_from_gcp_human.sh $NUM_CORES $ATAC_OUTPUT_DIR/ $ATAC_OUTPUT_DIR/croo | ||
|
||
# run samtools to generate genome alignment stats | ||
echo "Generating alignment stats..." | ||
bash src/align_stats.sh $NUM_CORES "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo | ||
|
||
# Create final qc report merging the metadata and workflow qc scores | ||
echo "Generating merged qc reports..." | ||
Rscript src/merge_atac_qc_human.R -w "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR/$SAMPLE_METADATA_FILENAME" -q "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo/final/qc/"$BATCH_PREFIX"_qc.tsv -a "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/merged_chr_info.csv -o "$MOUNT_DIR"/${ATAC_OUTPUT_DIR#"gs://"}/ | ||
|
||
echo "Generating sample counts matrix..." | ||
bash src/encode_to_count_matrix_human.sh "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR" "$(pwd)"/src/ $NUM_CORES | ||
|
||
echo "Done!" |
Oops, something went wrong.