From 7eb6b2872bf82fb180adee3fcc4e002a00eb102e Mon Sep 17 00:00:00 2001 From: Mihir Samdarshi Date: Wed, 3 Jul 2024 23:35:14 -0700 Subject: [PATCH] add in processing scripts for human data (#40) * refactor: make json script work with gsutil * feat: add submit.sh * feat: add helper delete_json script * refactor: run ShellCheck, shellfmt, and pylint on some existing scripts * feat: add new helper scripts * feat: refactor status script * feat: continue working on various helper scripts * fix: mo' problems, mo' fixes * feat: add in merge_atac_qc_human.R script, fix some parallel processing, add progress * fix: a couple minor script tweaks * feat: add in complete wrapper and server setup scripts * feat: fix wrapper script with better outputs * fix: add in fixes for the setup and wrapper scripts * feat: update batch scripts * feat: add R, fish, neovim, and qc2tsv install to server setup * feat: clean up scripts * feat: clean up scripts * fix: add samtools quickcheck * fix: use date for job log * fix: update echo command make_json_singleton.sh * feat: clean up src directory, get ready for submission * chore: add latest updates to README.md * feat: add --bar to all GNU parallel jobs * Added scripts used to process pass1ac data. Made minor changes to script to match consistent naming convention for pass data * fix: fix shebang in server setup script * chore: add merge_jsons.py utility script * fix: remove trailing slash from input dir * fix: point to correct script names in wrappers * fix: remove home dir path addition from wrapper script * fix: fix paths in scripts * Fix : remove printing out encode workflow level qc , added vial_label column * fix: fix paths in scripts --------- Co-authored-by: archanaraja --- .gitignore | 3 + README.md | 677 ++++++++++++------ ...untime_attributes_to_atac-seq_v1_7_0.patch | 18 + requirements.txt | 86 +++ src/align_stats.sh | 111 ++- src/align_stats_concat_only.sh | 86 +++ src/atac-post-process-human-wrapper.sh | 54 ++ ...r.sh => atac-post-process-pass-wrapper.sh} | 39 +- src/croo.sh | 77 +- src/croo_wrapper.sh | 10 - src/delete_json.sh | 23 + src/encode_rep_names_from_croo.py | 123 +++- src/encode_to_count_matrix.sh | 12 +- ...encode_to_count_matrix_all_tissues_pass.sh | 79 ++ src/encode_to_count_matrix_human.sh | 89 +++ src/extract_atac_from_gcp_human.sh | 122 ++++ src/extract_atac_from_gcp_pass.sh | 121 ++++ src/extract_rep_names_from_encode.sh | 26 +- src/get_execution_status.py | 181 +++++ src/human_encode_rep_names_from_croo.py | 95 +++ src/make_json_singleton.sh | 127 ++-- src/merge_atac_qc.R | 135 ++-- src/merge_atac_qc_human.R | 111 +++ src/merge_jsons.py | 66 ++ src/meta_pass_data_dict.txt | 68 -- src/pass_extract_atac_from_gcp.sh | 106 --- src/qc2tsv.sh | 18 +- src/run_pipeline.sh | 75 ++ src/server_setup.sh | 239 +++++++ src/submit.sh | 58 ++ 30 files changed, 2382 insertions(+), 653 deletions(-) create mode 100644 patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch create mode 100644 requirements.txt create mode 100644 src/align_stats_concat_only.sh create mode 100644 src/atac-post-process-human-wrapper.sh rename src/{atac-post-process_wrapper.sh => atac-post-process-pass-wrapper.sh} (54%) mode change 100755 => 100644 src/croo.sh delete mode 100755 src/croo_wrapper.sh create mode 100644 src/delete_json.sh create mode 100644 src/encode_to_count_matrix_all_tissues_pass.sh create mode 100644 src/encode_to_count_matrix_human.sh create mode 100644 src/extract_atac_from_gcp_human.sh create mode 100644 src/extract_atac_from_gcp_pass.sh 
create mode 100644 src/get_execution_status.py
create mode 100644 src/human_encode_rep_names_from_croo.py
create mode 100644 src/merge_atac_qc_human.R
create mode 100644 src/merge_jsons.py
delete mode 100644 src/meta_pass_data_dict.txt
delete mode 100644 src/pass_extract_atac_from_gcp.sh
create mode 100644 src/run_pipeline.sh
create mode 100644 src/server_setup.sh
create mode 100644 src/submit.sh

diff --git a/.gitignore b/.gitignore
index fe57464..624a80b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@ config/
 metadata/
 .DS_Store
 test_data/
+.idea/
+.fleet/
+stash/
diff --git a/README.md b/README.md
index b9b3b7a..95c826a 100644
--- a/README.md
+++ b/README.md
@@ -1,70 +1,85 @@
-# MoTrPAC ATAC-Seq QC and Analysis Pipeline
+# MoTrPAC ATAC-Seq QC and Analysis Pipeline [![DOI](https://zenodo.org/badge/221818534.svg)](https://zenodo.org/badge/latestdoi/221818534)
-This repository provides MoTrPAC-specific supplements to the [ENCODE ATAC-seq pipeline](https://github.com/ENCODE-DCC/atac-seq-pipeline). For additional details not directly related to running the ENCODE ATAC-seq pipeline or processing the results, see the most recent version of the MoTrPAC ATAC-seq QC and Analysis Pipeline MOP, available [here](https://docs.google.com/document/d/1vnB7ITAKnaZYc3v_FCdaDu3z-JXeDncRk5GnqzQVwRw/edit#heading=h.tjbixx8yyd33).
+This repository provides MoTrPAC-specific supplements to the [ENCODE ATAC-seq pipeline](https://github.com/ENCODE-DCC/atac-seq-pipeline).
-This documentation is intended to help individuals who are preparing ATAC-seq data for submission to the BIC or processing pilot samples with the full pipeline. For simplicity, this documentation explains how to run the full pipeline on a computer compatible with `Conda` environments. Users working on the cloud or in other environments can follow ENCODE's documentation as necessary. Post-processing scripts are intended to be useful to all users, regardless of environment.
+For additional details not directly related to running the ENCODE ATAC-seq pipeline or processing the results, see the most
+recent version of the MoTrPAC ATAC-seq QC and Analysis Pipeline MOP, available
+[here](https://docs.google.com/document/d/1vnB7ITAKnaZYc3v_FCdaDu3z-JXeDncRk5GnqzQVwRw/edit#heading=h.tjbixx8yyd33).
->**NOTE:** MoTrPAC uses ENCODE ATAC-seq pipeline **version 1.7.0** for consistency within the consortium and reproducibilty outside of the consortium.
+This documentation is intended to help individuals who are preparing ATAC-seq data for submission to the BIC or
+processing pilot samples with the full pipeline. For simplicity, this documentation explains how to run the full
+pipeline on a computer compatible with `Conda` environments. Users working on the cloud or in other environments can
+follow ENCODE's documentation as necessary. Post-processing scripts are intended to be useful to all users, regardless
+of environment.
-### Important references:
-- GitHub repository for the ENCODE ATAC-seq pipeline: https://github.com/ENCODE-DCC/atac-seq-pipeline
-- ENCODE ATAC-seq pipeline documentation: https://www.encodeproject.org/atac-seq/
-- ENCODE data quality standards: https://www.encodeproject.org/atac-seq/#standards
-- ENCODE terms and definitions: https://www.encodeproject.org/data-standards/terms/
+> **NOTE:** MoTrPAC uses ENCODE ATAC-seq pipeline **version 1.7.0** for consistency within the consortium and
+> reproducibility outside of the consortium.
-### Table of Contents:
+### Important references
-1. 
[Prepare ATAC-seq data for submission to the BIC](#1-prepare-atac-seq-data-for-submission-to-the-bic) +* [GitHub repository for the ENCODE ATAC-seq pipeline](https://github.com/ENCODE-DCC/atac-seq-pipeline) - 1.1 Clone this repository - - 1.2 Generate and format FASTQs - - 1.3 Collect additional documents - - 1.4 Submit data - - 1.5 [**For GET: Download pipeline outputs FROM BIC**](#15-for-get-download-pipeline-outputs-from-bic) - -2. [Install and test ENCODE ATAC-seq pipeline and dependencies](#2-install-and-test-encode-atac-seq-pipeline-and-dependencies) - - 2.1 Clone the ENCODE repository +* [ENCODE ATAC-seq pipeline documentation](https://www.encodeproject.org/atac-seq/) - 2.2 Install the `Conda` environment with all software dependencies +* [ENCODE data quality standards](https://www.encodeproject.org/atac-seq/#standards) - 2.3 Initialize `Caper` +* [ENCODE terms and definitions](https://www.encodeproject.org/data-standards/terms/) - 2.4 Run a test sample - - 2.5 [Install genome databases](#25-install-genome-databases) - - 2.5.1 Install the hg38 genome database - - 2.5.2 Install the custom rn6 genome database +## Table of Contents + +1. [Prepare ATAC-seq data for submission to the BIC](#1-prepare-atac-seq-data-for-submission-to-the-bic) + + 1.1 Clone this repository + + 1.2 Generate and format FASTQs + + 1.3 Collect additional documents + + 1.4 Submit data + + 1.5 [**For GET: Download pipeline outputs FROM BIC**](#15--get-sites-only--download-pipeline-outputs-from-bic) + +2. [Install and test ENCODE ATAC-seq pipeline and dependencies](#2-install-and-test-encode-atac-seq-pipeline-and-dependencies) + + 2.1 Clone the ENCODE repository + + 2.2 Install the `Conda` environment with all software dependencies + + 2.3 Initialize `Caper` + + 2.4 Run a test sample + + 2.5 [Install genome databases](#25-install-genome-databases) + + 2.5.1 Install the hg38 genome database + + 2.5.2 Install the custom rn6 genome database 3. [Run the ENCODE ATAC-seq pipeline](#3-run-the-encode-atac-seq-pipeline) - - 3.1 Generate configuration files - - 3.2 Run the pipeline - + + 3.1 Generate configuration files + + 3.2 Run the pipeline + 4. [Organize outputs](#4-organize-outputs) - 4.1 Collect important outputs with `Croo` - - 4.2 Generate a spreadsheet of QC metrics for all samples with `qc2tsv` + 4.1 Collect important outputs with `Croo` -5. [Flag problematic samples](#5-flag-problematic-samples) + 4.2 Generate a spreadsheet of QC metrics for all samples with `qc2tsv` + +5. [Flag problematic samples](#5-flag-problematic-samples) 6. [Post-processing scripts](#6-post-processing-scripts) +## 1. Prepare ATAC-seq data for submission to the BIC + +### 1.1 Clone this repository -## 1. Prepare ATAC-seq data for submission to the BIC +This documentation will assume you clone it in a folder called `ATAC_PIPELINE` in your home directory. `~/ATAC_PIPELINE` +is also the recommended destination folder for when you clone ENCODE's repository later. -### 1.1 Clone this repository -This documentation will assume you clone it in a folder called `ATAC_PIPELINE` in your home directory. `~/ATAC_PIPELINE` is also the recommended destination folder for when you clone ENCODE's repository later. ```bash cd ~ mkdir ATAC_PIPELINE @@ -72,96 +87,163 @@ cd ATAC_PIPELINE git clone https://github.com/MoTrPAC/motrpac-atac-seq-pipeline.git ``` -### 1.2 Generate and format FASTQs -Each GET site (Stanford and MSSM) is responsible for sequencing the library and obtaining the demultiplexed FASTQ files for each sample. 
If sequencing is performed with NovaSeq, raw data is output as BCL files, which must be demultiplexed and converted to FASTQ files with `bcl2fastq` (version 2.20.0). `bcl2fastq v2.20.0` can be downloaded directly from Illumina [here](https://support.illumina.com/downloads/bcl2fastq-conversion-software-v2-20.html). +### 1.2 Generate and format FASTQs + +Each GET site (Stanford and MSSM) is responsible for sequencing the library and obtaining the demultiplexed FASTQ files +for each sample. If sequencing is performed with NovaSeq, raw data is output as BCL files, which must be demultiplexed +and converted to FASTQ files with `bcl2fastq` (version 2.20.0). `bcl2fastq v2.20.0` can be downloaded directly from +Illumina [here](https://support.illumina.com/downloads/bcl2fastq-conversion-software-v2-20.html). + +Prepare a sample sheet for demultiplexing. Find an example [here](examples/SampleSheet.csv). -Prepare a sample sheet for demultiplexing. Find an example [here](examples/SampleSheet.csv). -- The sample sheet must not include the `Adapter` or `AdapterRead2` settings. This will prevent `bcl2fastq` from automatically performing adapter trimming, which provides us with FASTQ files that include the fullest form of the raw data. Adapter trimming is performed downstream -- `Sample_Name` and `Sample_ID` should correspond to vial labels; FASTQ files must follow the naming convention `${viallabel}_R?.fastq.gz` before submission to the BIC +* The sample sheet must not include the `Adapter` or `AdapterRead2` settings. This will prevent `bcl2fastq` from + automatically performing adapter trimming, which provides us with FASTQ files that include the fullest form of the raw + data. Adapter trimming is performed downstream +* `Sample_Name` and `Sample_ID` should correspond to vial labels; FASTQ files must follow the naming + convention `${viallabel}_R?.fastq.gz` before submission to the BIC + +[src/bcl2fastq.sh](src/bcl2fastq.sh) provides code both to run `bcl2fastq` and rename files. It can be run as follows: -[src/bcl2fastq.sh](src/bcl2fastq.sh) provides code both to run `bcl2fastq` and rename files. It can be run as follows: 1. Define the following paths: - - `bclfolder`: Path to sequencing output directory, e.g `/lab/data/NOVASEQ_BATCH1/181205_NB551514_0071_AHFHLGAFXY` - - `samplesheet`: Path to the sample sheet, e.g. `${bclfolder}/SampleSheet.csv` - - `outdir`: Path to root output folder, e.g. `/lab/data/NOVASEQ_BATCH1` -2. If applicable, load the correct version of `bcl2fastq`. For example, on Stanford SCG, run `module load bcl2fastq2/2.20.0.422`. + +* `bclfolder`: Path to sequencing output directory, e.g `/lab/data/NOVASEQ_BATCH1/181205_NB551514_0071_AHFHLGAFXY` +* `samplesheet`: Path to the sample sheet, e.g. `${bclfolder}/SampleSheet.csv` +* `outdir`: Path to root output folder, e.g. `/lab/data/NOVASEQ_BATCH1` + +2. If applicable, load the correct version of `bcl2fastq`. For example, on Stanford SCG, + run `module load bcl2fastq2/2.20.0.422`. 3. Run [src/bcl2fastq.sh](src/bcl2fastq.sh): -``` + +```bash bash ~/ATAC_PIPELINE/motrpac-atac-seq-pipeline/src/bcl2fastq.sh ${bclfolder} ${samplesheet} ${outdir} ``` -This makes two new directories: -1. `${outdir}/bcl2fastq`: Outputs of `bcl2fastq` -2. `${outdir}/fastq_raw`: Merged and re-named FASTQ files, ready for submission to the BIC -Alternatively, run the `bcl2fastq` command independently, and use your own scripts to merge and rename FASTQ files before submission to the BIC: +This makes two new directories: + +1. 
`${outdir}/bcl2fastq`: Outputs of `bcl2fastq` +2. `${outdir}/fastq_raw`: Merged and re-named FASTQ files, ready for submission to the BIC + +Alternatively, run the `bcl2fastq` command independently, and use your own scripts to merge and rename FASTQ files +before submission to the BIC: + ```bash bcl2fastq \ --sample-sheet /path/to/SampleSheet.csv --runfolder-dir $seqDir \ --output-dir $outDir ``` -This command will generate two FASTQ files (one for each read in the pair) per sample per lane, e.g. `${viallabel}_L${lane}_R{1,2}_001.fastq.gz`. - -### 1.3 Collect additional documents -- Collect the laneBarcode HTML report in `${outdir}/bcl2fastq/Reports/html/*/all/all/all/laneBarcode.html`. This report must be included in the BIC data submission, -- Generate `sample_metadata_YYYYMMDD.csv`. See [this table](https://docs.google.com/document/d/1vnB7ITAKnaZYc3v_FCdaDu3z-JXeDncRk5GnqzQVwRw/edit#heading=h.sqhy9p63uf9b) for a list of metrics that must be included in this file. -- Generate `file_manifest_YYYYMMDD.csv`. See the [GET CAS-to-BIC Data Transfer Guidelines](https://docs.google.com/document/d/1W1b5PVp2yjam4FU2IidGagqdA7lYpkTaD_LMeaN_n_k) for details about the format of this document. - -### 1.4 Submit data -Refer to the [GET CAS-to-BIC Data Transfer Guidelines](https://docs.google.com/document/d/1W1b5PVp2yjam4FU2IidGagqdA7lYpkTaD_LMeaN_n_k) for details about the directory structure for ATAC-seq data submissions. The following files are required: -- `file_manifest_YYYYMMDD.csv` -- `sample_metadata_YYYYMMDD.csv` -- `readme_YYYYMMDD.txt` -- `laneBarcode.html` -- `fastq_raw/*.fastq.gz` - -### 1.5 For GET: Download pipeline outputs FROM BIC -After the BIC has finished running the ENCODE ATAC-seq pipeline on a batch of submitted data, use [`pass_extract_atac_from_gcp.sh`](src/pass_extract_atac_from_gcp.sh) to download the important subset of outputs from GCP. Inside the script, change the `download_dir` and `gsurl` paths to point to the gsutil source and the local destination, respectively. Then run the script with the number of cores available for parallelization as an argument, e.g.: + +This command will generate two FASTQ files (one for each read in the pair) per sample per lane, +e.g. `${viallabel}_L${lane}_R{1,2}_001.fastq.gz`. + +### 1.3 Collect additional documents + +* Collect the laneBarcode HTML report in `${outdir}/bcl2fastq/Reports/html/*/all/all/all/laneBarcode.html`. This report + must be included in the BIC data submission, +* Generate `sample_metadata_YYYYMMDD.csv`. + See [this table](https://docs.google.com/document/d/1vnB7ITAKnaZYc3v_FCdaDu3z-JXeDncRk5GnqzQVwRw/edit#heading=h.sqhy9p63uf9b) + for a list of metrics that must be included in this file. +* Generate `file_manifest_YYYYMMDD.csv`. See + the [GET CAS-to-BIC Data Transfer Guidelines](https://docs.google.com/document/d/1W1b5PVp2yjam4FU2IidGagqdA7lYpkTaD_LMeaN_n_k) + for details about the format of this document. + +### 1.4 Submit data + +Refer to +the [GET CAS-to-BIC Data Transfer Guidelines](https://docs.google.com/document/d/1W1b5PVp2yjam4FU2IidGagqdA7lYpkTaD_LMeaN_n_k) +for details about the directory structure for ATAC-seq data submissions. 
The following files are required:
+
+* `file_manifest_YYYYMMDD.csv`
+* `sample_metadata_YYYYMMDD.csv`
+* `readme_YYYYMMDD.txt`
+* `laneBarcode.html`
+* `fastq_raw/*.fastq.gz`
+
+### 1.5 (GET Sites only) Download pipeline outputs FROM BIC
+
+After the BIC has finished running the ENCODE ATAC-seq pipeline on a batch of submitted data,
+use [`extract_atac_from_gcp_pass.sh`](src/extract_atac_from_gcp_pass.sh) to download the important subset of outputs
+from GCP. Inside the script, change the `download_dir` and `gsurl` paths to point to the gsutil source and the local
+destination, respectively. Then run the script with the number of cores available for parallelization as an argument,
+e.g.:
+
```bash
bash extract_atac_from_gcp_pass.sh 10
```

## 2. Install and test ENCODE ATAC-seq pipeline and dependencies

-All steps in this section must only be performed once. After dependencies are installed and genome databases are built, skip to [here](#3-run-the-encode-atac-seq-pipeline).
-The ENCODE pipeline supports many cloud platforms and cluster engines. It also supports `docker`, `singularity`, and `Conda` to resolve complicated software dependencies for the pipeline. There are special instructions for two major Stanford HPC servers (SCG4 and Sherlock).
+All steps in this section must only be performed once. After dependencies are installed and genome databases are built,
+skip to [here](#3-run-the-encode-atac-seq-pipeline).
+
+The ENCODE pipeline supports many cloud platforms and cluster engines. It also supports `docker`, `singularity`,
+and `Conda` to resolve complicated software dependencies for the pipeline. There are special instructions for two major
+Stanford HPC servers (SCG4 and Sherlock).
+
+While the BIC runs this pipeline on Google Cloud Platform, this documentation is tailored for consortium users who use
+non-cloud computing environments, including clusters and personal computers. Therefore, this documentation describes
+the `Conda` implementation. Refer to ENCODE's documentation for alternatives.
+
+### Quick start
-While the BIC runs this pipeline on Google Cloud Platform, this documentation is tailored for consortium users who use non-cloud computing environments, including clusters and personal computers. Therefore, this documentation describes the `Conda` implementation. Refer to ENCODE's documentation for alternatives.
+If you have a fresh VM in Google Cloud Platform and a service account key with Google Cloud Life Sciences API
+permissions, the easiest way to set up a clean server is by using [the server setup script](src/server_setup.sh) in
+the `src` directory.
+
+This script will install all dependencies, run a database in a Docker container, and perform the rest of the steps
+described in this section *except [step 2.5](#25-install-genome-databases)*.
+
+First, create a JSON file with your service account key somewhere on the VM. Then, use the following commands to clone
+this repository and run the server setup script. 
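+(If you still need to create the key file, one can typically be generated with the standard `gcloud` CLI; the sketch
+below is an assumption for illustration only, with `pipeline-runner` and `my-gcp-project` as placeholder names.)
+
+```bash
+# sketch: create a JSON key for an existing service account (account and project names are placeholders)
+gcloud iam service-accounts keys create /path/to/key.json \
+  --iam-account=pipeline-runner@my-gcp-project.iam.gserviceaccount.com
+```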
+
+```bash
+git clone https://github.com/MoTrPAC/motrpac-atac-seq-pipeline.git
+cd motrpac-atac-seq-pipeline
+bash src/server_setup.sh us-central1 my-gcp-project gs://my_bucket/outputs gs://my_bucket/loc /path/to/key.json
+```

### 2.1 Clone the ENCODE repository
+
Clone the v1.7.0 ENCODE repository and this repository in a folder in your home directory:
+
```bash
cd ~/ATAC_PIPELINE
git clone --single-branch --branch v1.7.0 https://github.com/ENCODE-DCC/atac-seq-pipeline.git
```
-**IMPORTANT: The following change needs to be made to ~/ATAC_PIPELINE/atac-seq-pipeline/atac.wdl.**
+
+**IMPORTANT: The following change needs to be made to ~/ATAC\_PIPELINE/atac-seq-pipeline/atac.wdl.**\
At the end of `~/ATAC_PIPELINE/atac-seq-pipeline/atac.wdl`, find this block of code:
-```
+
+```wdl
task raise_exception {
-    String msg
-    command {
-        echo -e "\n* Error: ${msg}\n" >&2
-        exit 2
-    }
-    output {
-        String error_msg = '${msg}'
-    }
-    runtime {
-        maxRetries : 0
-    }
+    String msg
+    command {
+        echo -e "\n* Error: ${msg}\n" >&2
+        exit 2
+    }
+    output {
+        String error_msg = '${msg}'
+    }
+    runtime {
+        maxRetries: 0
+    }
}
```
+
Replace the `runtime` parameters in the `raise_exception` task with these:
-```
+
+```wdl
runtime {
-    maxRetries : 0
-    cpu : 1
-    memory : '2 GB'
-    time : 1
-    disks : 'local-disk 10 SSD'
-  }
+    maxRetries: 0
+    cpu: 1
+    memory: '2 GB'
+    time: 1
+    disks: 'local-disk 10 SSD'
+  }
```
+
**If you do not make this change, you will get the following error when you try to run the pipeline:**
+
```bash
Task raise_exception has an invalid runtime attribute memory = !! NOT FOUND !!
@@ -183,87 +265,125 @@ Task raise_exception has an invalid runtime attribute memory = !! NOT FOUND !!
 ]
```

-### 2.2 Install the `Conda` environment with all software dependencies
-Install `conda` by following [these instructions](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/docs/install_conda.md). Perform Step 5 in a `screen` or `tmux` session, as it can take some time.
+### 2.2 (Optional) Install the `Conda` environment with all software dependencies
+
+If you did not use the server setup script, you will need to install the `Conda` environment with all software dependencies.
+
+Install `conda` by following [these instructions](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/docs/install_conda.md).
+
+Perform Step 5 of the ENCODE instructions in a `screen` or `tmux` session, as it can take some time.

### 2.3 Initialize `Caper`
+
Installing the `Conda` environment also installs `Caper`. Make sure it works:
+
```bash
conda activate encode-atac-seq-pipeline
caper
```
-If you see an error like `caper: command not found`, then add the following line to the bottom of ~/.bashrc and re-login.
-```
+
+If you see an error like `caper: command not found`, then add the following
+line to the bottom of ~/.bashrc and re-login.
+
+```bash
export PATH=$PATH:~/.local/bin
```
-Choose a platform from the following table and initialize `Caper`. This will create a default `Caper` configuration file `~/.caper/default.conf`, which have only required parameters for each platform. There are special platforms for Stanford Sherlock/SCG users.
+
+Choose a platform from the following table and initialize `Caper`. This will
+create a default `Caper` configuration file `~/.caper/default.conf`, which
+has only the required parameters for each platform. There are special platforms
+for Stanford Sherlock/SCG users. 
+ ```bash -$ caper init [PLATFORM] +caper init [PLATFORM] ``` -**Platform**|**Description** -:--------|:----- -sherlock | Stanford Sherlock cluster (SLURM) -scg | Stanford SCG cluster (SLURM) -gcp | Google Cloud Platform -aws | Amazon Web Service -local | General local computer -sge | HPC with Sun GridEngine cluster engine -pbs | HPC with PBS cluster engine -slurm | HPC with SLURM cluster engine - -Edit `~/.caper/default.conf` according to your chosen platform. Find instruction for each item in the following table. -> **IMPORTANT**: ONCE YOU HAVE INITIALIZED THE CONFIGURATION FILE `~/.caper/default.conf` WITH YOUR CHOSEN PLATFORM, THEN IT WILL HAVE ONLY REQUIRED PARAMETERS FOR THE CHOSEN PLATFORM. DO NOT LEAVE ANY PARAMETERS UNDEFINED OR CAPER WILL NOT WORK CORRECTLY. - -**Parameter**|**Description** -:--------|:----- -tmp-dir | **IMPORTANT**: A directory to store all cached files for inter-storage file transfer. DO NOT USE `/tmp`. -slurm-partition | SLURM partition. Define only if required by a cluster. You must define it for Stanford Sherlock. -slurm-account | SLURM partition. Define only if required by a cluster. You must define it for Stanford SCG. -sge-pe | Parallel environment of SGE. Find one with `$ qconf -spl` or ask you admin to add one if not exists. -aws-batch-arn | ARN for AWS Batch. -aws-region | AWS region (e.g. us-west-1) -out-s3-bucket | Output bucket path for AWS. This should start with `s3://`. -gcp-prj | Google Cloud Platform Project -out-gcs-bucket | Output bucket path for Google Cloud Platform. This should start with `gs://`. - -An important optional parameter is `db`. If you would like to enable call-catching (i.e. re-use ouputs from previous workflows, which is particularly useful if a workflow fails halfway through a pipeline), add the following lines to `~/.caper/default.conf`: -``` -db=file -java-heap-run=4G +| **Platform** | **Description** | +|:-------------|:---------------------------------------| +| sherlock | Stanford Sherlock cluster (SLURM) | +| scg | Stanford SCG cluster (SLURM) | +| gcp | Google Cloud Platform | +| aws | Amazon Web Service | +| local | General local computer | +| sge | HPC with Sun GridEngine cluster engine | +| pbs | HPC with PBS cluster engine | +| slurm | HPC with SLURM cluster engine | + +Edit `~/.caper/default.conf` according to your chosen platform. Find +instruction for each item in the following table. + +> **IMPORTANT**: ONCE YOU HAVE INITIALIZED THE CONFIGURATION FILE +`~/.caper/default.conf` WITH YOUR CHOSEN PLATFORM, THEN IT WILL HAVE ONLY +> REQUIRED PARAMETERS FOR THE CHOSEN PLATFORM. DO NOT LEAVE ANY PARAMETERS +> UNDEFINED OR CAPER WILL NOT WORK CORRECTLY. + +| **Parameter** | **Description** | +|:----------------|:---------------------------------------------------------------------------------------------------------| +| tmp-dir | **IMPORTANT**: A directory to store all cached files for inter-storage file transfer. DO NOT USE `/tmp`. | +| slurm-partition | SLURM partition. Define only if required by a cluster. You must define it for Stanford Sherlock. | +| slurm-account | SLURM partition. Define only if required by a cluster. You must define it for Stanford SCG. | +| sge-pe | Parallel environment of SGE. Find one with `$ qconf -spl` or ask you admin to add one if not exists. | +| aws-batch-arn | ARN for AWS Batch. | +| aws-region | AWS region (e.g. us-west-1) | +| out-s3-bucket | Output bucket path for AWS. This should start with `s3://`. 
|
| gcp-prj | Google Cloud Platform Project |
| out-gcs-bucket | Output bucket path for Google Cloud Platform. This should start with `gs://`. |
+
+An important optional parameter is `db`. If you would like to enable call-caching (i.e. re-use outputs from previous
+workflows, which is particularly useful if a workflow fails halfway through a pipeline), add the following lines
+to `~/.caper/default.conf`:
+
+```conf
+ db=file
+ java-heap-run=4G
 ```

-### 2.4 Run a test sample
-Follow [these platform-specific instructions](https://github.com/ENCODE-DCC/caper/blob/master/README.md#activating-conda-environment) to run a test sample. Use the following variable assignments:
+### 2.4 Run a test sample
+
+Follow [these platform-specific instructions](https://github.com/ENCODE-DCC/caper/blob/master/README.md#activating-conda-environment)
+to run a test sample. Use the following variable assignments:
+
```bash
+# the next variable is only needed if using conda to manage Python packages
 PIPELINE_CONDA_ENV=encode-atac-seq-pipeline
 WDL=~/ATAC_PIPELINE/atac-seq-pipeline/atac.wdl
 INPUT_JSON=https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ_subsampled_caper.json
-```
-Note that `Caper` writes all outputs to the current working directory, so first `cd` to the desired output directory before using `caper run` or `caper server`.
-
-Here is an example of how the test workflow is run on Stanford SCG (SLURM):
 ```
+
+Note that `Caper` writes all outputs to the current working directory, so first `cd` to the desired output directory
+before using `caper run` or `caper server`.
+
+### On Stanford SCG/SLURM
+
+Here is an example of how the test workflow is run on Stanford SCG (SLURM):
+
+```bash
+# only needed if using conda to manage Python packages
 conda activate ${PIPELINE_CONDA_ENV}
 JOB_NAME=encode_test
 sbatch -A ${ACCOUNT} -J ${JOB_NAME} --export=ALL --mem 2G -t 4-0 --wrap "caper run ${WDL} -i ${INPUT_JSON}"
 ```

 ### 2.5 Install genome databases
- 
+
 #### 2.5.1 Install the hg38 genome database
-Specify a destination directory and install the ENCODE hg38 reference with the following command. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time.
-```bash
+
+Specify a destination directory and install the ENCODE hg38 reference with the following command. We recommend not to
+run this installer on a login node of your cluster. It will take >8GB memory and >2h time.
+
+```bash
+# only needed if using conda to manage Python packages
 conda activate encode-atac-seq-pipeline
+
 outdir=/path/to/reference/genome/hg38
 bash ~/ATAC_PIPELINE/atac-seq-pipeline/scripts/download_genome_data.sh hg38 ${outdir}
 ```
- 
-#### 2.5.2 Install the custom rn6 genome database
+
+#### 2.5.2 Install the custom rn6 genome database
 Find this section in `~/ATAC_PIPELINE/atac-seq-pipeline/scripts/build_genome_data.sh`:
-```
+
+```bash
...

elif [[ "${GENOME}" == "YOUR_OWN_GENOME" ]]; then
@@ -288,44 +408,85 @@ elif [[ "${GENOME}" == "YOUR_OWN_GENOME" ]]; then

fi
...
```
+
Above it, add this block:
-```
+
+```bash
+...
elif [[ "${GENOME}" == "motrpac_rn6" ]]; then
  REGEX_BFILT_PEAK_CHR_NAME=".*"
  MITO_CHR_NAME="chrM"
  REF_FA="http://mitra.stanford.edu/montgomery/projects/motrpac/atac/SCG/motrpac_references/rn6_release96/Rattus_norvegicus.Rnor_6.0.dna.toplevel.standardized.fa.gz"
  TSS="http://mitra.stanford.edu/montgomery/projects/motrpac/atac/SCG/motrpac_references/rn6_release96/Rattus_norvegicus.Rnor_6.0.96_protein_coding.tss.bed.gz"
  BLACKLIST=
+...
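+  # note: BLACKLIST is left empty here, i.e. no blacklist region file is configured for this custom rn6 build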
``` **NOTE:** The TSS reference file was generated from the Ensembl GTF using [this script](src/tss_from_gtf.py). -Now run the script to build the custom genome database. Specify a destination directory and install the MoTrPAC rn6 reference with the following command. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. +Now run the script to build the custom genome database. Specify a destination directory and install the MoTrPAC rn6 +reference with the following command. We recommend not to run this installer on a login node of your cluster. It will +take >8GB memory and >2h time. + ```bash +# only needed if using conda to manage Python packages conda activate encode-atac-seq-pipeline + outdir=/path/to/reference/genome/motrpac_rn6 bash ~/ATAC_PIPELINE/atac-seq-pipeline/scripts/build_genome_data.sh motrpac_rn6 ${outdir} ``` - -## 3. Run the ENCODE ATAC-seq pipeline -MoTrPAC will run the ENCODE pipeline both with singletons for human samples and replicates for rat samples. In both cases, many iterations of the pipeline will need to be run for each batch of sequencing data. This repository provides scripts to automate this process, for both rat and human samples. -Running the pipeline with replicates outputs all of the same per-sample information generated by running the pipeline with a single sample but improves power for peak calling and outputs a higher-confidence peak set called using all replicates. This generates a single peak set for every exercise protocol/timepoint/tissue/sex combination in the PASS study, which will be useful for downstream analyses. +## 3. Run the ENCODE ATAC-seq pipeline + +MoTrPAC will run the ENCODE pipeline both with singletons for human samples and replicates for rat samples. In both +cases, many iterations of the pipeline will need to be run for each batch of sequencing data. This repository provides +scripts to automate this process, for both rat and human samples. + +Running the pipeline with replicates outputs all of the same per-sample information generated by running the pipeline +with a single sample but improves power for peak calling and outputs a higher-confidence peak set called using all +replicates. This generates a single peak set for every exercise protocol/timepoint/tissue/sex combination in the PASS +study, which will be useful for downstream analyses. ### 3.1 Generate configuration files -A configuration (config) file in JSON format that specifies input parameters is required to run the pipeline. Find comprehensive documentation of definable parameters [here](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/docs/input.md). -Please click the appropriate link below for detailed instructions on how to automate the generation of config files for pipelines with singletons or replicates. This is particularly important for PASS data, as this repository provides a script to automatically group replicates in the same condition (protocol/timepoint/tissue/sex). +A configuration (config) file in JSON format that specifies input parameters is required to run the pipeline. Find +comprehensive documentation of definable +parameters [here](https://github.com/ENCODE-DCC/atac-seq-pipeline/blob/master/docs/input.md). + +Please click the appropriate link below for detailed instructions on how to automate the generation of config files for +pipelines with singletons or replicates. 
This is particularly important for PASS data, as this repository provides a
+script to automatically group replicates in the same condition (protocol/timepoint/tissue/sex).

* [Prepare config files for replicates (PASS/rat)](docs/replicate_config.md)
-* [Prepare config files for singletons (CASS/human)](docs/single_config.md)
+* [Prepare config files for singletons (CASS/human)](docs/single_config.md)
+
+### 3.2 Run the pipeline
+
+Actually running the pipeline is straightforward. However, the command is different depending on the environment in
+which you set up the pipeline. Refer back to environment-specific
+instructions [here](https://github.com/ENCODE-DCC/caper/blob/master/README.md#activating-conda-environment).
+
+An `atac` directory containing all of the pipeline outputs is created in the output directory (note the default output
+directory is the current working directory). One arbitrarily-named subdirectory for each config file (assuming the
+command is run in a loop for several samples) is written in `atac`.
+
+#### For GCP
-### 3.2 Run the pipeline
-Actually running the pipeline is straightforward. However, the command is different depending on the environment in which you set up the pipeline. Refer back to environment-specific instructions [here](https://github.com/ENCODE-DCC/caper/blob/master/README.md#activating-conda-environment).
+You can use the submit.sh script to submit a batch of pipelines to the GCP job queue. `${JSON_DIR}` is the path to all
+of the config files, generated in [Step 3.1](#31-generate-configuration-files):
-An `atac` directory containing all of the pipeline outputs is created in the output directory (note the default output directory is the current working directory). One arbitrarily-named subdirectory for each config file (assuming the command is run in a loop for several samples) is written in `atac`.
+```bash
+JSON_DIR=/path/to/generated/json/files
+JSON_OUT=/path/to/output/file.json
+bash src/submit.sh atac-seq-pipeline/atac.wdl ${JSON_DIR} ${JSON_OUT}
+```
+
+#### For Stanford SCG/SLURM
+
+Here is an example of a bash script that submits a batch of pipelines to the Stanford SCG job queue. `${JSON_DIR}` is
+the path to all of the config files, generated in [Step 3.1](#31-generate-configuration-files):
-Here is an example of code that submits a batch of pipelines to the Stanford SCG job queue. `${JSON_DIR}` is the path to all of the config files, generated in [Step 3.1](#31-generate-configuration-files):
```bash
+# if using conda to manage Python packages
 conda activate encode-atac-seq-pipeline

 ATACSRC=~/ATAC_PIPELINE/atac-seq-pipeline
@@ -342,11 +503,34 @@ for json in $(ls ${JSON_DIR}); do
 done
 ```
+### 3.3 (Optional) Monitoring execution status
+
+If you ran the pipeline on GCP, you can monitor the status of your jobs by using the [`get_execution_status.py`](src/get_execution_status.py)
+script. This script will print the status of all jobs in the output directory. The argument to the script is the path
+of the file that you generated in [Step 3.2](#32-run-the-pipeline). For example:
+
+```bash
+python src/get_execution_status.py /path/to/output/file.json
+```
+
 ## 4. Organize outputs
-### 4.1 Collect important outputs with `Croo`
-`Croo` is a tool ENCODE developed to simplify the pipeline outputs. It was installed along with the `Conda` environment. Run it on each sample in the batch. See **Table 4.1** for a description of outputs generated by this process. 
+### 4.1 Mount the output directory + +If running on GCP, mount the output directory to your local machine. This will allow you to access the pipeline outputs +from your local machine. If running on the SCG, you can skip this step. + +```bash +gcsfuse --implicit-dirs --dir-mode 777 --file-mode 777 gs://my_bucket/outputs /mnt/${OUTDIR} ``` + +### 4.2 Collect important outputs with `croo` + +`croo` is a tool ENCODE developed to simplify the pipeline outputs. It was installed along with the `Conda` environment. +Run it on each sample in the batch. See **Table 4.1** for a description of outputs generated by this process. + +```bash +# only needed if using conda to manage Python packages conda activate encode-atac-seq-pipeline cd ${OUTDIR}/atac @@ -356,81 +540,112 @@ for dir in $(ls */metadata.json | sed "s:/metadata\.json::"); do cd .. done ``` - -**Table 4.1.** Important files in `Croo`-organized ENCODE ATAC-seq pipeline output. - -| Subdirectory or file | Description | -|-------------------------------------------|-----------------------------------------| -| `qc/*` | Components of the merged QC spreadhseet (see Step 4.2) | -| `signal/*/*fc.signal.bigwig` | MACS2 peak-calling signal (fold-change), useful for visualizing "read pileups" in a genome browser | -| `signal/*/*pval.signal.bigwig` | MACS2 peak-calling signal (P-value), useful for visualizing "read pileups" in a genome browser. P-value track is more dramatic than the fold-change track | -| `align/*/*.trim.merged.bam` | Unfiltered BAM files | -| `align/*/*.trim.merged.nodup.no_chrM_MT.bam` | Filtered BAM files, used as input for peak calling | -| `align/*/*.tagAlign.gz` | [tagAlign](https://genome.ucsc.edu/FAQ/FAQformat.html#format15) files from filtered BAMs | -| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.hammock.gz` | Hammock file of `overlap` peaks, optimized for viewing peaks in a genome browser | -| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.gz` | BED file of `overlap` peaks. **Generally, use this as your final peak set** | -| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.bb` | [bigBed](https://genome.ucsc.edu/goldenPath/help/bigBed.html) file of `overlap` peaks useful for visualizing peaks in a genome browser | -| `peak/idr_reproducibility/ idr.optimal_peak.narrowPeak.gz` | `IDR` peaks. More conservative than `overlap` peaks | - -[ENCODE recommends](https://www.encodeproject.org/atac-seq/) using the `overlap` peak sets when one prefers a low false negative rate but potentially higher false positives; they recommend using the `IDR` peaks when one prefers low false positive rates. - -### 4.2 Generate a spreadsheet of QC metrics for all samples with `qc2tsv` -This is most useful if you ran the pipeline for multiple samples. **Step 4.1** generates a `qc/qc.json` file for each pipeline run. After installing `qc2tsv` within the `encode-atac-seq-pipeline` `Conda` environment (`pip install qc2tsv`), run the following command to compile a spreadsheet with QC from all samples: -``` + +**Table 4.1.** Important files in `Croo`-organized ENCODE ATAC-seq pipeline output. 
+
| Subdirectory or file | Description |
|----------------------|-------------|
| `qc/*` | Components of the merged QC spreadsheet (see Step 4.3) |
| `signal/*/*fc.signal.bigwig` | MACS2 peak-calling signal (fold-change), useful for visualizing "read pileups" in a genome browser |
| `signal/*/*pval.signal.bigwig` | MACS2 peak-calling signal (P-value), useful for visualizing "read pileups" in a genome browser. P-value track is more dramatic than the fold-change track |
| `align/*/*.trim.merged.bam` | Unfiltered BAM files |
| `align/*/*.trim.merged.nodup.no_chrM_MT.bam` | Filtered BAM files, used as input for peak calling |
| `align/*/*.tagAlign.gz` | [tagAlign](https://genome.ucsc.edu/FAQ/FAQformat.html#format15) files from filtered BAMs |
| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.hammock.gz` | Hammock file of `overlap` peaks, optimized for viewing peaks in a genome browser |
| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.gz` | BED file of `overlap` peaks. **Generally, use this as your final peak set** |
| `peak/overlap_reproducibility/ overlap.optimal_peak.narrowPeak.bb` | [bigBed](https://genome.ucsc.edu/goldenPath/help/bigBed.html) file of `overlap` peaks useful for visualizing peaks in a genome browser |
| `peak/idr_reproducibility/ idr.optimal_peak.narrowPeak.gz` | `IDR` peaks. More conservative than `overlap` peaks |
+
+[ENCODE recommends](https://www.encodeproject.org/atac-seq/) using the `overlap` peak sets when one prefers a low false
+negative rate but potentially higher false positives; they recommend using the `IDR` peaks when one prefers low false
+positive rates.
+
+### 4.3 Generate a spreadsheet of QC metrics for all samples with `qc2tsv`
+
+This is most useful if you ran the pipeline for multiple samples. **Step 4.2** generates a `qc/qc.json` file for each
+pipeline run. After installing `qc2tsv` within the `encode-atac-seq-pipeline` `Conda`
+environment (`pip install qc2tsv`), run the following command to compile a spreadsheet with QC from all samples:
+
+```bash
 cd ${outdir}/atac
 qc2tsv $(find -path "*/call-qc_report/execution/glob-*/qc.json") --collapse-header > spreadsheet.tsv
 ```

+**Table 4.2** provides definitions for a limited number of metrics included in the JSON QC reports. The full JSON report
+includes >100 metrics per sample; some lines are duplicates, and many metrics are irrelevant for running the pipeline
+with a single biological replicate.

**Table 4.2. 
Description of relevant QC metrics.** -| Metric | Definition/Notes | -|--------|------------------| -| replication.reproducibility.overlap.N_opt | Number of optimal overlap_reproducibility peaks | -| replication.reproducibility.overlap.opt_set | Peak set corresponding to optimal overlap_reproducibility peaks | -| replication.reproducibility.idr.N_opt | Number of optimal idr_reproducibility peaks | -| replication.reproducibility.idr.opt_set | Peak set corresponding to optimal idr_reproducibility peaks | -| replication.num_peaks.num_peaks | Number of peaks called in each replicate | -| peak_enrich.frac_reads_in_peaks.macs2.frip | Replicate-level FRiP in raw MACS2 peaks | -| peak_enrich.frac_reads_in_peaks.overlap.{opt_set}.frip | Many FRiP values are reported. In order to get the FRiP corresponding to the overlap_reproducibility peak set, you need to cross-reference the `replication.reproducibility.overlap.opt_set` metric with these column names to extract the appropriate FRiP. For example, if `replication.reproducibility.overlap.opt_set` is `pooled-pr1_vs_pooled-pr2`, then you need to extract the FRiP value from the `peak_enrich.frac_reads_in_peaks.overlap.pooled-pr1_vs_pooled-pr2.frip` column. See **insert script name** to see how to do this in an automated way | -| peak_enrich.frac_reads_in_peaks.idr.{opt_set}.frip | Cross-reference with `replication.reproducibility.idr.opt_set`. See `peak_enrich.frac_reads_in_peaks.overlap.{opt_set}.frip` | -| align.samstat.total_reads | Total number of alignments* (including multimappers)| -| align.samstat.pct_mapped_reads | Percent of reads that mapped| -| align.samstat.pct_properly_paired_reads |Percent of reads that are properly paired| -| align.dup.pct_duplicate_reads |Fraction (not percent) of read pairs that are duplicates **after** filtering alignments for quality| -| align.frac_mito.frac_mito_reads | Fraction of reads that align to chrM **after** filtering alignments for quality and removing duplicates | -| align.nodup_samstat.total_reads | Number of alignments* after applying all filters | -| align.frag_len_stat.frac_reads_in_nfr | Fraction of reads in nucleosome-free-region. Should be a value greater than 0.4 | -| align.frag_len_stat.nfr_over_mono_nuc_reads | Reads in nucleosome-free-region versus reads in mononucleosomal peak. Should be a value greater than 2.5 | -| align.frag_len_stat.nfr_peak_exists | Does a nucleosome-free-peak exist? Should be `true` | -| align.frag_len_stat.mono_nuc_peak_exists | Does a mononucleosomal-peak exist? Should be `true` | -| align.frag_len_stat.di_nuc_peak_exists | Does a dinucleosomal-peak exist? Ideally `true`, but not condemnable if `false` | -| lib_complexity.lib_complexity.NRF | Non-reduandant fraction. Measure of library complexity, i.e. degree of duplicates. Ideally >0.9 | -| lib_complexity.lib_complexity.PBC1 | PCR bottlenecking coefficient 1. Measure of library complexity. Ideally >0.9 | -| lib_complexity.lib_complexity.PBC2 | PCR bottlenecking coefficient 2. Measure of library complexity. Ideally >3 | -| align_enrich.tss_enrich.tss_enrich | TSS enrichment | - -*Note: Alignments are per read, so for PE reads, there are two alignments per fragment if each PE read aligns once. - -## 5. Flag problematic samples -The following metrics are not strictly exclusion criteria for MoTrPAC samples, but samples should be flagged if any of these conditions are met. Some of these metrics reflect the [ENCODE ATAC-seq data standards](https://www.encodeproject.org/atac-seq/#standards). 
+
| Metric | Definition/Notes |
|--------|------------------|
| replication.reproducibility.overlap.N\_opt | Number of optimal overlap\_reproducibility peaks |
| replication.reproducibility.overlap.opt\_set | Peak set corresponding to optimal overlap\_reproducibility peaks |
| replication.reproducibility.idr.N\_opt | Number of optimal idr\_reproducibility peaks |
| replication.reproducibility.idr.opt\_set | Peak set corresponding to optimal idr\_reproducibility peaks |
| replication.num\_peaks.num\_peaks | Number of peaks called in each replicate |
| peak\_enrich.frac\_reads\_in\_peaks.macs2.frip | Replicate-level FRiP in raw MACS2 peaks |
| peak\_enrich.frac\_reads\_in\_peaks.overlap.{opt\_set}.frip | Many FRiP values are reported. In order to get the FRiP corresponding to the overlap\_reproducibility peak set, you need to cross-reference the `replication.reproducibility.overlap.opt_set` metric with these column names to extract the appropriate FRiP. For example, if `replication.reproducibility.overlap.opt_set` is `pooled-pr1_vs_pooled-pr2`, then you need to extract the FRiP value from the `peak_enrich.frac_reads_in_peaks.overlap.pooled-pr1_vs_pooled-pr2.frip` column. See **insert script name** to see how to do this in an automated way |
| peak\_enrich.frac\_reads\_in\_peaks.idr.{opt\_set}.frip | Cross-reference with `replication.reproducibility.idr.opt_set`. See `peak_enrich.frac_reads_in_peaks.overlap.{opt_set}.frip` |
| align.samstat.total\_reads | Total number of alignments\* (including multimappers) |
| align.samstat.pct\_mapped\_reads | Percent of reads that mapped |
| align.samstat.pct\_properly\_paired\_reads | Percent of reads that are properly paired |
| align.dup.pct\_duplicate\_reads | Fraction (not percent) of read pairs that are duplicates **after** filtering alignments for quality |
| align.frac\_mito.frac\_mito\_reads | Fraction of reads that align to chrM **after** filtering alignments for quality and removing duplicates |
| align.nodup\_samstat.total\_reads | Number of alignments\* after applying all filters |
| align.frag\_len\_stat.frac\_reads\_in\_nfr | Fraction of reads in nucleosome-free-region. Should be a value greater than 0.4 |
| align.frag\_len\_stat.nfr\_over\_mono\_nuc\_reads | Reads in nucleosome-free-region versus reads in mononucleosomal peak. Should be a value greater than 2.5 |
| align.frag\_len\_stat.nfr\_peak\_exists | Does a nucleosome-free-peak exist? Should be `true` |
| align.frag\_len\_stat.mono\_nuc\_peak\_exists | Does a mononucleosomal-peak exist? Should be `true` |
| align.frag\_len\_stat.di\_nuc\_peak\_exists | Does a dinucleosomal-peak exist? Ideally `true`, but not condemnable if `false` |
| lib\_complexity.lib\_complexity.NRF | Non-redundant fraction. Measure of library complexity, i.e. degree of duplicates. Ideally >0.9 |
| lib\_complexity.lib\_complexity.PBC1 | PCR bottlenecking coefficient 1. Measure of library complexity. 
Ideally >0.9 | +| lib\_complexity.lib\_complexity.PBC2 | PCR bottlenecking coefficient 2. Measure of library complexity. Ideally >3 | +| align\_enrich.tss\_enrich.tss\_enrich | TSS enrichment | + +*Note: Alignments are per read, so for PE reads, there are two alignments per fragment if each PE read aligns once. + +## 5. Flag problematic samples + +The following metrics are not strictly exclusion criteria for MoTrPAC samples, but samples should be flagged if any of +these conditions are met. Some of these metrics reflect +the [ENCODE ATAC-seq data standards](https://www.encodeproject.org/atac-seq/#standards). **Table 5.1 Criteria to flag problematic samples.** -| Description | In terms of Table 2 metrics | Comments | -|-------------|-------------------------------|----------| -|< 50 million filtered, non-duplicated, non-mitochondrial paired-end reads in the filtered BAM file (i.e. 25M pairs)| align.nodup_samstat.total_reads < 50M | This is the most stringent criterion and may be relaxed | -|Alignment rate < 80%| align.samstat.pct_mapped_reads < 80%|| -|Fraction of reads in `overlap` peaks < 0.1| peak_enrich.frac_reads_in_peaks.overlap.{opt_set}.frip < 0.1|This is more relaxed than the ENCODE recommendation. Note that replicate-level FRiP in raw peaks can be assessed with peak_enrich.frac_reads_in_peaks.macs2.frip | -|Number of peaks in `overlap` peak set < 80,000|replication.reproducibility.overlap.N_opt < 80000|This is more relaxed than the ENCODE recommendation| -|A nucleosome-free region is not present| align.frag_len_stat.nfr_peak_exists == false|This should be enforced more strictly| -|A mononucleosome peak is not present|align.frag_len_stat.mono_nuc_peak_exists == false|This should be enforced more strictly| -|TSS enrichment < ?|align_enrich.tss_enrich.tss_enrich|This cutoff needs to be evaluated retrospectively. We will probably have tissue-specific recommendations | - -## 6. Post-processing scripts -- [extract_rep_names_from_encode.sh](src/extract_rep_names_from_encode.sh): generate rep-to-viallabel map to interpret QC report -- [pass_extract_atac_from_gcp.sh](src/pass_extract_atac_from_gcp.sh): download relevant files from ENCODE output -- [encode_to_count_matrix.sh](src/encode_to_count_matrix.sh): use `narrowkpeak.gz` and `tagAlign` files to generate a peak x sample raw counts matrix -- [align_stats.sh](src/align_stats.sh): calculate % of primary alignments aligning to chrX, chrY, chrM, autosomes, and contigs -- [merge_atac_qc.R](src/merge_atac_qc.R): merge wet lab QC, curated pipeline QC, and alignment stats +| Description | In terms of Table 2 metrics | Comments | +|---------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| < 50 million filtered, non-duplicated, non-mitochondrial paired-end reads in the filtered BAM file (i.e. 25M pairs) | align.nodup\_samstat.total\_reads < 50M | This is the most stringent criterion and may be relaxed | +| Alignment rate < 80% | align.samstat.pct\_mapped\_reads < 80% || +| Fraction of reads in `overlap` peaks < 0.1 | peak\_enrich.frac\_reads\_in\_peaks.overlap.{opt\_set}.frip < 0.1 | This is more relaxed than the ENCODE recommendation. 
Note that replicate-level FRiP in raw peaks can be assessed with peak\_enrich.frac\_reads\_in\_peaks.macs2.frip |
| Number of peaks in `overlap` peak set < 80,000 | replication.reproducibility.overlap.N\_opt < 80000 | This is more relaxed than the ENCODE recommendation |
| A nucleosome-free region is not present | align.frag\_len\_stat.nfr\_peak\_exists == false | This should be enforced more strictly |
| A mononucleosome peak is not present | align.frag\_len\_stat.mono\_nuc\_peak\_exists == false | This should be enforced more strictly |
| TSS enrichment < ? | align\_enrich.tss\_enrich.tss\_enrich | This cutoff needs to be evaluated retrospectively. We will probably have tissue-specific recommendations |
+
+## 6. Post-processing scripts
+
+Use the post-processing wrapper scripts to generate the QC report and to generate the final count matrix.
+
+* [atac-post-process-pass-wrapper.sh](src/atac-post-process-pass-wrapper.sh): wrapper script to generate QC report and final
+  count matrix for PASS samples
+* [atac-post-process-human-wrapper.sh](src/atac-post-process-human-wrapper.sh): wrapper script to generate QC report and final
+  count matrix for human samples
+
+For each of these wrappers, make sure you fill in the appropriate variables at the top of the script before running.
+
+### 6.1. Individual scripts
+
+* [extract\_rep\_names\_from\_encode.sh](src/extract_rep_names_from_encode.sh): generate rep-to-viallabel map to
+  interpret QC report
+* [extract\_atac\_from\_gcp\_pass.sh](src/extract_atac_from_gcp_pass.sh): download relevant files for PASS samples from
+  ENCODE output
+* [extract\_atac\_from\_gcp\_human.sh](src/extract_atac_from_gcp_human.sh): download relevant files for human samples
+  from ENCODE output
+* [encode\_to\_count\_matrix.sh](src/encode_to_count_matrix.sh): use `narrowPeak.gz` and `tagAlign` files to generate a
+  peak x sample raw counts matrix for PASS samples
+* [encode\_to\_count\_matrix\_human.sh](src/encode_to_count_matrix_human.sh): use `narrowPeak.gz` and `tagAlign` files
+  to generate a peak x sample raw counts matrix for human samples
+* [align\_stats.sh](src/align_stats.sh): calculate % of primary alignments aligning to chrX, chrY, chrM, autosomes, and
+  contigs
+* [merge\_atac\_qc.R](src/merge_atac_qc.R): merge wet lab QC, curated pipeline QC, and alignment stats
diff --git a/patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch b/patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch
new file mode 100644
index 0000000..ad40977
--- /dev/null
+++ b/patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch
@@ -0,0 +1,18 @@
+Index: atac-seq-pipeline/atac.wdl
+IDEA additional info:
+Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
+<+>UTF-8
+===================================================================
+diff --git a/atac-seq-pipeline/atac.wdl b/atac-seq-pipeline/atac.wdl
+--- a/atac-seq-pipeline/atac.wdl (revision 46f2a928db20ca4889160b6e2a2e980fac34d672)
++++ b/atac-seq-pipeline/atac.wdl (date 1663706161074)
+@@ -1990,5 +1990,9 @@
+ }
+ runtime {
+ maxRetries : 0
++ cpu : 1
++ memory : '2 GB'
++ time : 1
++ disks : 'local-disk 10 SSD'
+ }
+ }
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..144b236
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,86 @@
+aiohttp==3.8.1
+aiosignal==1.2.0
+argcomplete==1.12.3
+async-timeout==4.0.2
+attrs==22.1.0
+autouri==0.3.0
+awscli==1.25.78
+boto3==1.24.77
+botocore==1.27.77
+bullet==2.2.0
+cachetools==5.2.0
+caper==2.2.2
+certifi==2022.9.14 
+cffi==1.15.1 +charset-normalizer==2.1.1 +colorama==0.4.4 +coloredlogs==15.0.1 +contourpy==1.0.5 +croo==0.6.0 +cryptography==38.0.1 +cycler==0.11.0 +dateparser==1.1.1 +decorator==5.1.1 +docker==6.0.0 +docutils==0.16 +filelock==3.8.0 +fonttools==4.37.3 +frozenlist==1.3.1 +fsspec==2022.8.2 +gcsfs==2022.8.2 +google-api-core==2.10.1 +google-auth==2.11.1 +google-auth-oauthlib==0.5.3 +google-cloud-core==2.3.2 +google-cloud-storage==2.5.0 +google-crc32c==1.5.0 +google-resumable-media==2.3.3 +googleapis-common-protos==1.56.4 +graphviz==0.20.1 +humanfriendly==10.0 +idna==3.4 +importlib-metadata==4.12.0 +jmespath==1.0.1 +joblib==1.2.0 +kiwisolver==1.4.4 +lark-parser==0.12.0 +matplotlib==3.6.0 +miniwdl==1.3.3 +multidict==6.0.2 +ntplib==0.4.0 +numpy==1.23.3 +oauthlib==3.2.1 +packaging==21.3 +pandas==1.5.0 +Pillow==9.2.0 +protobuf==4.21.6 +psutil==5.9.2 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.21 +pygtail==0.12.0 +pyhocon==0.3.59 +pyOpenSSL==22.0.0 +pyparsing==2.4.7 +python-dateutil==2.8.2 +python-json-logger==2.0.4 +pytz==2022.2.1 +pytz-deprecation-shim==0.1.0.post0 +PyYAML==5.4.1 +regex==2022.3.2 +requests==2.28.1 +requests-oauthlib==1.3.1 +rsa==4.7.2 +s3transfer==0.6.0 +scikit-learn==1.1.2 +scipy==1.9.1 +six==1.16.0 +threadpoolctl==3.1.0 +tqdm==4.64.1 +tzdata==2022.2 +tzlocal==4.2 +urllib3==1.26.12 +websocket-client==1.4.1 +xdg==5.1.1 +yarl==1.8.1 +zipp==3.8.1 diff --git a/src/align_stats.sh b/src/align_stats.sh index 2aadafd..db3089a 100644 --- a/src/align_stats.sh +++ b/src/align_stats.sh @@ -5,12 +5,18 @@ # usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir] # make sure the vm has bc and samtools istalled # sudo apt install bc -set -e +set -e cores=$1 indir=$2 bamdir=$3 +if [ -z "$4" ]; then + type="glob" +else + type=$4 +fi + #indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs #indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final # path to ENCODE outputs, anywhere upstream of the bam files @@ -20,45 +26,86 @@ bamdir=$3 #module load samtools -cd ${indir} +cwd=$(pwd) +cd "$indir" mkdir -p idxstats # make outdir -align_stats () { - local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM - local viallabel=$(basename $bam | sed "s/_R1.*//") - local primary=idxstats/${viallabel}_primary.bam - # already sorted - # keep only primary alignments - samtools view -b -F 0x900 ${bam} -o ${primary} - # index - samtools index ${primary} - samtools idxstats ${primary} > idxstats/${viallabel}_chrinfo.txt - rm ${primary} ${primary}.bai +align_stats() { + local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM + local viallabel + local primary + + viallabel=$(basename "$bam" | sed "s/_R1.*//") + echo "$viallabel" + + if [ -f idxstats/"${viallabel}"_chrinfo.txt ] && samtools quickcheck "$bam" 2&> /dev/null ; then + echo "Skipping $viallabel" + return + fi - # get counts - local total=$(awk '{sum+=$3;}END{print sum;}' idxstats/${viallabel}_chrinfo.txt) - local y=$(grep -E "^chrY" idxstats/${viallabel}_chrinfo.txt | cut -f 3) - local x=$(grep -E "^chrX" idxstats/${viallabel}_chrinfo.txt | cut -f 3) - local mt=$(grep -E "^chrM" idxstats/${viallabel}_chrinfo.txt | cut -f 3) - local auto=$(grep -E "^chr[0-9]" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}') - local contig=$(grep -E -v "^chr" idxstats/${viallabel}_chrinfo.txt | cut -f 3 | awk '{sum+=$1;}END{print sum;}') + primary=idxstats/${viallabel}_primary.bam + # already sorted + # keep only primary alignments + samtools view -b -F 0x900 "$bam" -o "$primary" + # index + samtools index "$primary" + 
samtools idxstats "$primary" >"idxstats/${viallabel}_chrinfo.txt" + rm "$primary" "${primary}.bai" - # get fractions - local pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./') - local pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./') - local pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./') - local pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./') - local pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./') + # get counts + local total + local y + local x + local mt + local auto + local contig + total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt") + y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') + contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') - # output to file - echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' > idxstats/${viallabel}_chrinfo.csv - echo ${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig} >> idxstats/${viallabel}_chrinfo.csv + # get fractions + local pct_y + local pct_x + local pct_mt + local pct_auto + local pct_contig + pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./') + pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./') + pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./') + pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./') + pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./') + + echo $viallabel + echo $total + echo $pct_x + echo $pct_y + echo $pct_mt + echo $pct_auto + echo $pct_contig + # output to file + echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv" + echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv" } export -f align_stats -#parallel --verbose --jobs ${cores} align_stats ::: $(find -name "*_R1.trim.bam") -parallel --verbose --jobs ${cores} align_stats ::: $(ls ${bamdir}/*/*/align/rep*/*.trim.bam) + +if [ "$type" == "glob" ]; then + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)" +elif [ "$type" == "file" ]; then + readarray -t raw_bam_list <<<"$bamdir" + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}" +elif [ "$type" == "find" ]; then + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_"$(date "+%b%d%Y_%H%M%S")"_align_stats_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")" +else + echo "type must be glob, file, or find" + exit 1 +fi # collapse #head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv #for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done -cat idxstats/*_chrinfo.csv|grep -v "^viallabel"|sed 
'1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv +cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv +cd "$cwd" || exit 1 + diff --git a/src/align_stats_concat_only.sh b/src/align_stats_concat_only.sh new file mode 100644 index 0000000..16045b8 --- /dev/null +++ b/src/align_stats_concat_only.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Nicole Gay +# 13 May 2020 +# ATAC-seq alignment stats +# usage: bash align_stats.sh [NUM_CORES] [indir] [bamdir] +# make sure the vm has bc and samtools istalled +# sudo apt install bc +set -e + +cores=$1 +indir=$2 +bamdir=$3 + +if [ -z "$4" ]; then + type="glob" +else + type=$4 +fi + +#indir=/projects/motrpac/PASS1A/ATAC/NOVASEQ_BATCH2/outputs +#indir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final +# path to ENCODE outputs, anywhere upstream of the bam files +# also where output folder will be made +#bamdir=~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output +#path to the location of unfiltered bam files (*_R1.trim.bam) + +#module load samtools + +cd "$indir" +mkdir -p idxstats # make outdir + +align_stats() { + local bam=$1 # 90141015805_R1.trim.bam - UNFILTERED BAM + local viallabel + + viallabel=$(basename "$bam" | sed "s/_R1.*//") + echo "Processing $viallabel" + + # get counts + local total + local y + local x + local mt + local auto + local contig + total=$(awk '{sum+=$3;}END{print sum;}' "idxstats/${viallabel}_chrinfo.txt") + y=$(grep -E "^chrY" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + x=$(grep -E "^chrX" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + mt=$(grep -E "^chrM" "idxstats/${viallabel}_chrinfo.txt" | head -1 | cut -f 3) + auto=$(grep -E "^chr[0-9]" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') + contig=$(grep -E -v "^chr" "idxstats/${viallabel}_chrinfo.txt" | cut -f 3 | awk '{sum+=$1;}END{print sum;}') + + # get fractions + local pct_y + local pct_x + local pct_mt + local pct_auto + local pct_contig + pct_y=$(echo "scale=5; ${y}/${total}*100" | bc -l | sed 's/^\./0./') + pct_x=$(echo "scale=5; ${x}/${total}*100" | bc -l | sed 's/^\./0./') + pct_mt=$(echo "scale=5; ${mt}/${total}*100" | bc -l | sed 's/^\./0./') + pct_auto=$(echo "scale=5; ${auto}/${total}*100" | bc -l | sed 's/^\./0./') + pct_contig=$(echo "scale=5; ${contig}/${total}*100" | bc -l | sed 's/^\./0./') + + # output to file + echo 'viallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >"idxstats/${viallabel}_chrinfo.csv" + echo "${viallabel},${total},${pct_x},${pct_y},${pct_mt},${pct_auto},${pct_contig}" >>"idxstats/${viallabel}_chrinfo.csv" +} +export -f align_stats + +if [ "$type" == "glob" ]; then + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(ls "${bamdir%/}"/*/*/align/rep*/*_R1.trim.bam)" +elif [ "$type" == "file" ]; then + readarray -t raw_bam_list <<<"$bamdir" + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "${raw_bam_list[@]}" +elif [ "$type" == "find" ]; then + parallel --joblog ~/mnt/tmp/"$(basename bamdir)"_align_stats_concat_joblog.log --progress --bar --verbose --jobs "$cores" align_stats ::: "$(find -name "*_R1.trim.bam" "$bamdir")" +else + echo "type must be glob, file, or find" + exit 1 +fi + +# collapse 
+#head -1 $(find -name "*_chrinfo.csv" | head -1) > merged_chr_info.csv +#for file in $(find -name "*_chrinfo.csv"); do sed -e '1d' $file >> merged_chr_info.csv; done +cat idxstats/*_chrinfo.csv | grep -v "^viallabel" | sed '1iviallabel,total_primary_alignments,pct_chrX,pct_chrY,pct_chrM,pct_auto,pct_contig' >merged_chr_info.csv diff --git a/src/atac-post-process-human-wrapper.sh b/src/atac-post-process-human-wrapper.sh new file mode 100644 index 0000000..e4212e3 --- /dev/null +++ b/src/atac-post-process-human-wrapper.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Wrapper script for running the ENCODE ATAC-seq pipeline + +### +# working directory needs to be base directory (not src/) +# USAGE: src/atac-seq-human-wrapper.sh +### + +set -eu + +### VARIABLES TO SET ### +NUM_CORES=12 +BATCH_PREFIX=batch1_20220710 +GCS_BUCKET=my_bucket +ATAC_OUTPUT_DIR="gs://my_bucket/atac-seq/batch1_20220710/processed" +SAMPLE_METADATA_FILENAME="batch1_20220710_sample_metadata.csv" +# set this to the region you want to run in (e.g. us-central1), the region should be compatible with life sciences API +CROMWELL_OUT_DIR="gs://my_bucket/pipelines/out" +### + +# constants +OUT_DIR=outputs/ +MOUNT_DIR=~/mnt/ +LOCAL_ATAC_OUT_DIR=${ATAC_OUTPUT_DIR#"gs://"} + +echo "Downloading outputs..." +bash src/croo.sh $OUT_DIR/"$BATCH_PREFIX"_submissions.json $CROMWELL_OUT_DIR/atac/ $ATAC_OUTPUT_DIR/croo true + +echo "Mounting GCS bucket..." +mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET" +mkdir -p "$MOUNT_DIR"/tmp +gcsfuse --implicit-dirs "$GCS_BUCKET" "$MOUNT_DIR"/"$GCS_BUCKET" + +#get qc tsv report +echo "Creating qc2tsv report..." +mkdir -p "$MOUNT_DIR"/"$GCS_BUCKET"/qc/ +bash src/qc2tsv.sh $ATAC_OUTPUT_DIR/croo "$MOUNT_DIR"/"$GCS_BUCKET"/qc/"$BATCH_PREFIX"_qc.tsv + +# reorganize croo outputs for quantification +echo "Copying files for quantification..." +bash src/extract_atac_from_gcp_human.sh $NUM_CORES $ATAC_OUTPUT_DIR/ $ATAC_OUTPUT_DIR/croo + +# run samtools to generate genome alignment stats +echo "Generating alignment stats..." +bash src/align_stats.sh $NUM_CORES "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo + +# Create final qc report merging the metadata and workflow qc scores +echo "Generating merged qc reports..." +Rscript src/merge_atac_qc_human.R -w "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR/$SAMPLE_METADATA_FILENAME" -q "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/croo/final/qc/"$BATCH_PREFIX"_qc.tsv -a "$MOUNT_DIR"/$LOCAL_ATAC_OUT_DIR/merged_chr_info.csv -o "$MOUNT_DIR"/${ATAC_OUTPUT_DIR#"gs://"}/ + +echo "Generating sample counts matrix..." +bash src/encode_to_count_matrix_human.sh "$MOUNT_DIR/$LOCAL_ATAC_OUT_DIR" "$(pwd)"/src/ $NUM_CORES + +echo "Done!" 
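A minimal invocation sketch for the human post-processing wrapper above, assuming the variables in its "VARIABLES TO SET" block (NUM_CORES, BATCH_PREFIX, GCS_BUCKET, ATAC_OUTPUT_DIR, SAMPLE_METADATA_FILENAME, CROMWELL_OUT_DIR) have already been edited for the batch, and that gcsfuse, gsutil, croo, qc2tsv, samtools, bedtools, GNU parallel, and Rscript are installed; the log file name here is illustrative only:

# run from the repository root (not src/); the wrapper takes no arguments
# the tee redirection is an assumption for record-keeping, the script itself only prints to stdout
bash src/atac-post-process-human-wrapper.sh 2>&1 | tee batch1_20220710_post_process.log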
diff --git a/src/atac-post-process_wrapper.sh b/src/atac-post-process-pass-wrapper.sh similarity index 54% rename from src/atac-post-process_wrapper.sh rename to src/atac-post-process-pass-wrapper.sh index 24d3568..4a8b002 100644 --- a/src/atac-post-process_wrapper.sh +++ b/src/atac-post-process-pass-wrapper.sh @@ -1,65 +1,56 @@ -#!/bin/sh +#!/bin/bash set -Eeuxo pipefail #Usage: bash atac-post-process_wrapper.sh # On gcp , make sure that the bucket containing the inputs is mounted before running the script ############################################## #Change the paths to USER DEFINED VARIABLES IN THIS SECTION ONLY BEFORE RUNNING THE SCRIPT -pipeline_output_path=gs://rna-seq_araja/PASS/atac-seq/stanford/PASS1B/atac download_dir=gs://rna-seq_araja/PASS/atac-seq/stanford/batch8_20210113/Output/final croo_output_path=gs://rna-seq_araja/PASS/atac-seq/stanford/batch8_20210113/Output -qc2tsv_report_name="atac_qc.tsv" -final_qc_report_name="merged_atac_qc.tsv" -copy_cores=12 -copy_dest="gcp" -align_cores=12 +qc2tsv_report_name=atac_qc.tsv indir=~/test_mnt/PASS/atac-seq/stanford in_mnt_dir=~/test_mnt/PASS/atac-seq/stanford/batch8_20210113/Output/final in_bam_dir=~/test_mnt/PASS/atac-seq/stanford/batch8_20210113/Output #path to raw unfiltered bam files -script_dir=/home/araja7/atac-pp/motrpac-atac-seq-pipeline/src -batch_wfids="/home/araja7/atac-pp/stanford/batch8/batch8_wfids.txt" -gcp_project="motrpac-portal" -sample_metadata="sample_metadata_20200928.csv" -merge_counts_cores=3 -batch_filelist="/home/araja7/atac-pp/stanford/batch_list.txt" -batch_count=1 -final_results_dir="batch8_20210113/Output/final" +gcp_project=motrpac-portal +sample_metadata=sample_metadata_20200928.csv +batch_wfids=/home/araja7/atac-pp/stanford/batch8/ +batch_filelist=/home/araja7/atac-pp/stanford/batch_list.txt +final_results_dir=batch8_20210113/Output/final mode=0 -#final_results_dir="test_merge" +batch_count=1 +num_cores=12 ############################################## - #get replicates to sample mapping file -python3 ${script_dir}/encode_rep_names_from_croo.py ${croo_output_path} ${download_dir}/ ${batch_wfids} ${gcp_project} +python3 src/encode_rep_names_from_croo.py ${croo_output_path} ${download_dir}/ ${batch_wfids} ~/mnt/rna-seq_araja/atac-seq/wfids/${batch_wfids} ${gcp_project} #bash src/extract_rep_names_from_encode.sh ${pipeline_output_path}/ ${download_dir}/ echo "Success! Done creation of sample mapping file" #get qc tsv report -bash ${script_dir}/qc2tsv.sh ${croo_output_path} ${qc2tsv_report_name} +bash src/qc2tsv.sh ${croo_output_path} ${qc2tsv_report_name} echo "Success! Done creation of qc2tsv report" # reorganize croo outputs for quantification -bash ${script_dir}/pass_extract_atac_from_gcp.sh ${copy_cores} ${download_dir} ${croo_output_path} ${copy_dest} +bash src/extract_atac_from_gcp_pass.sh ${num_cores} ${croo_output_path} ${download_dir} echo "Success! Done copying files for quantification" # run samtools to generate genome alignment stats -bash ${script_dir}/align_stats.sh ${align_cores} ${in_mnt_dir} ${in_bam_dir} +bash src/align_stats.sh ${num_cores} ${in_mnt_dir} ${in_bam_dir} echo "Success! 
Done generating alignment stats" # Create final qc report merging the metadata and workflow qc scores - -Rscript ${script_dir}/merge_atac_qc.R -w ~/${in_mnt_dir}/${sample_metadata} -q ~/${in_mnt_dir}/qc/${qc2tsv_report_name} -m ${in_mnt_dir}/rep_to_sample_map.csv -a ${in_mnt_dir}/merged_chr_info.csv -o ${in_mnt_dir}/ +Rscript src/merge_atac_qc.R -w ~/${in_mnt_dir}/${sample_metadata} -q ~/${in_mnt_dir}/qc/${qc2tsv_report_name} -m ${in_mnt_dir}/rep_to_sample_map.csv -a ${in_mnt_dir}/merged_chr_info.csv -o ${in_mnt_dir}/ echo "Success! Done generating merged qc reports" #generate counts matrix per batch if [[ "$batch_count" == "1" ]]; then - bash ${script_dir}/encode_to_count_matrix.sh ${indir} ${script_dir} ${batch_filelist} ${merge_counts_cores} ${final_results_dir} ${mode} + bash src/encode_to_count_matrix.sh ${indir} src ${batch_filelist} ${num_cores} ${final_results_dir} ${mode} else echo "Skipping this step as the batch count is greater than 1, this step has to be run outside of wrapper" exit diff --git a/src/croo.sh b/src/croo.sh old mode 100755 new mode 100644 index 9722867..ad6023c --- a/src/croo.sh +++ b/src/croo.sh @@ -1,24 +1,65 @@ -#!/bin/sh -#Usage : src/croo.sh >>croo_copy_jobs.txt +#!/bin/bash + +set -eux +#Usage : src/croo.sh >>croo_copy_jobs.txt #Contributors : Archana Raja, Nicole Gay -workflow_id_list=$1 -gcp_path=$2 -outpath=$3 -for i in `cat ${workflow_id_list}`; do - sample_dir=${gcp_path}/$i - #out_dir=gs://motrpac-portal-transfer-stanford/Output/atac-seq/batch_20200318/$i - out_dir=${outpath}/$i - #as long as the descrption is hyphenated and don't contain any spaces or special characters below would work - descrip=$(gsutil cat ${sample_dir}/call-qc_report/glob-*/qc.json | grep "description" | sed -e 's/.*": "//' -e 's/".*//') - empty="No description" - if [ "${descrip}" = "No description" ] - then - descrip=$(gsutil cat ${sample_dir}/call-qc_report/glob-*/qc.json | grep "title" | sed -e 's/.*": "//' -e 's/".*//') +if [ $# -lt 3 ]; then + echo "Usage: ./croo.sh [WORKFLOW_SUBMISSION_MAP] [GCP_PATH] [OUT_PATH]" + echo + echo "Example: croo.sh out.json gs://my-bucket/my_workflow/outputs/croo gs://my-bucket/my_workflow/processed" + echo + echo "[WORKFLOW_SUBMISSION_MAP]: A JSON array of workflow ids to process" + echo "[GCP_PATH]: The directory with the outputs of the pipeline" + echo "[OUT_PATH]: The location to output the croo files to" + echo "[PARSE_FROM_ID_LIST] (Optional): Whether to use the workflow id list to parse the files to copy.
If false/not set, will use the QC json to create a folder name" + echo + exit 1 +fi + +WORKFLOW_SUBMISSION_MAP=$1 +GCP_PATH=${2%/} +OUT_PATH=${3%/} +PARSE_FROM_ID_LIST=$4 + +function run_croo() { + local line=$1 + local sample_dir + local out_dir + local descrip + + sample_dir=$GCP_PATH/${line%/} + out_dir=$OUT_PATH/${sample_dir#gs://} + + if [[ "$PARSE_FROM_ID_LIST" ]]; then + out_dir="$out_dir"/$(jq -r '.[] | select(.workflow_id == "'"$line"'") | .label' "$WORKFLOW_SUBMISSION_MAP") + echo "out_dir: $out_dir" + else + # as long as the description is hyphenated and doesn't contain any spaces or special characters, the below will work + descrip=$(gsutil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "description" | sed -e 's/.*": "//' -e 's/".*//') + + if [ "$descrip" = "No description" ]; then + descrip=$(gsutil cat "$sample_dir"/call-qc_report/glob-*/qc.json | grep "title" | sed -e 's/.*": "//' -e 's/".*//') else - descrip=${descrip} + descrip=$descrip fi - echo croo --method copy ${sample_dir}/metadata.json --out-dir ${out_dir}/${descrip}/ -done + out_dir="$out_dir"/"${descrip/gs:\/\///}"/ + fi + + croo --method copy "$sample_dir"/metadata.json --out-dir "$out_dir" +} + +export GCP_PATH +export OUT_PATH +export WORKFLOW_SUBMISSION_MAP +export PARSE_FROM_ID_LIST +export -f run_croo +cores=10 + +# shellcheck disable=SC2046 +parallel --joblog ~/mnt/tmp/"${WORKFLOW_SUBMISSION_MAP%%.*}"_croo.log --progress --bar --verbose --jobs "$cores" run_croo ::: $(jq -r '.[].workflow_id' "$WORKFLOW_SUBMISSION_MAP") +#for line in $(jq -r '.[].workflow_id' "$WORKFLOW_SUBMISSION_MAP"); do +# run_croo "$line" +#done diff --git a/src/croo_wrapper.sh b/src/croo_wrapper.sh deleted file mode 100755 index 1f00f66..0000000 --- a/src/croo_wrapper.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -indir="gs://rna-seq_araja/PASS/atac-seq/stanford/PASS1B/atac" -outdir="gs://rna-seq_araja/PASS/atac-seq/stanford/batch10_20210113/Output" -wfids="hipp_wfids.txt" -croo_job_name="croo_copy_jobs_hippo.sh" - -bash motrpac-atac-seq-pipeline-test/src/croo.sh ${wfids} ${indir} ${outdir} >>${croo_job_name} - -echo "Success! Done creating croo copy jobs" diff --git a/src/delete_json.sh b/src/delete_json.sh new file mode 100644 index 0000000..4ff67ac --- /dev/null +++ b/src/delete_json.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Gets a list of all the workflows that have been run, and deletes the corresponding input json files + +if [ $# -lt 2 ]; then + echo "Usage: ./delete_json.sh [SUBMITTED_LABELS] [JSON_DIR]" + echo + echo "Example: delete_json.sh labels.txt jsons" + echo + echo "[SUBMITTED_LABELS]: a file containing a list of labels of the submitted jobs" + echo "[JSON_DIR]: The directory where the JSON files are stored" + echo + exit 1 +fi + +SUBMITTED_LABELS=$1 +JSON_DIR=$2 + +readarray -t lines <"$SUBMITTED_LABELS" + +for line in "${lines[@]}"; do + echo "Deleting $line.json" + rm "${JSON_DIR%/}"/"$line".json || true +done diff --git a/src/encode_rep_names_from_croo.py b/src/encode_rep_names_from_croo.py index 05cfd22..94dd2b8 100644 --- a/src/encode_rep_names_from_croo.py +++ b/src/encode_rep_names_from_croo.py @@ -1,46 +1,95 @@ -import pandas as pd -import os,sys,argparse +import argparse + import gcsfs +import pandas as pd +from tqdm import tqdm + + +# This script generates a replicate to sample mapping file to interpret the qc report.
It +# was developed as an alternative to extract_rep_names_from_encode.sh which might not +# work if the pipeline run_dir had outputs for a specific condition in multiple +# workflow ids, this happens when a workflow is rerun on gcp, where the metadata.json +# file generated by caper points to the completed paths from failed workflow ids. This +# script uses the c roo.filetable.*.tsv to generate the mapping file + +# Usage: python3 encode_rep_names_from_croo.py +# +# -#This script generates replicate to sample mapping file to interpret the qc report. It was developed as an alternative to extract_rep_names_from_encode.sh -#which might not work if the pipeline run_dir had outputs for a specific condition in multiple workflow ids, this happens when a workflow is rerun on gcp, -#where the metadata.json file generated by caper points to the completed paths from failed wfids. -#This script uses the croo.filetable.*.tsv to generate the mapping file -#Usage: python3 encode_rep_names_from_croo.py -# - -def main(gcp_path,output_path,wfids,gcp_project): +def main(gcp_path: str, output_path: str, workflow_id_fp: str, gcp_project: str): fs = gcsfs.GCSFileSystem(project=gcp_project) - out_df = pd.DataFrame() - wfl = [line.strip() for line in open(wfids, 'r')] - cond_l=[] - rep_l=[] - sample_list=[] - for i in wfl: - in_path=gcp_path+"/"+i+"/*/croo.filetable*.tsv" - f="gs://"+fs.glob(in_path)[0] - df_in=pd.read_csv(f,sep="\t",header=None) - filtered=df_in[df_in[0].str.contains("Raw BAM from aligner")] - cond=f.split("/")[-2] + wfl = [line.strip() for line in open(workflow_id_fp, "r")] + + cond_l = [] + rep_l = [] + + sample_list = [] + + for i in (pbar := tqdm(wfl)): + workflow_path = f"{gcp_path.rstrip('/')}/{i}" + pbar.set_description(f"Processing {i} at {workflow_path}") + + in_path = fs.glob(f"{workflow_path}/**/croo.filetable*.tsv") + pbar.write(f"Found {len(in_path)} filetable files: {in_path}") + + if len(in_path) == 0: + print(f"Skip {i} since no filetable found or exit script?") + print(f"Enter y/n") + user_input = input() + user_input = user_input.lower() + if user_input == "y" or user_input == "yes": + continue + else: + exit() + + filetable_filename = f"gs://{in_path[0]}" + + df_in = pd.read_csv(filetable_filename, sep="\t", header=None) + filtered = df_in[df_in[0].str.contains("Raw BAM from aligner")] + + cond = filetable_filename.split("/")[-2] + for vals in filtered[1]: + split_vals = vals.split("/") cond_l.append(cond) - rep=vals.split("/")[-2] + rep = split_vals[-2] rep_l.append(rep) - sample=(vals.split("/")[-1]).split("_R1.trim.bam")[0] - sample_list.append(sample) - out_df[0] = cond_l - out_df[1] = rep_l - out_df[2] = sample_list - out_file=output_path+"rep_to_sample_map.csv" - out_df.to_csv(out_file,index=False,header=False) + sample = (split_vals[-1]).split("_R1.trim.bam")[0] + sample_list.append(sample) + + pbar.write(f"Found {len(filtered[1])} values, finished {i}\n") + + out_df = pd.DataFrame() + out_df["cond_l"] = cond_l + out_df["rep_l"] = rep_l + out_df["sample_list"] = sample_list + + print(out_df.head()) + out_file = f"{output_path.rstrip('/')}/rep_to_sample_map.csv" + out_df.to_csv(out_file, index=False, header=False) + print("Success! 
Finished writing the mapping file") -if __name__ == '__main__': - parser = argparse.ArgumentParser(description = 'This script generates the replicate to sample mapping file to interpret the qc report') - parser.add_argument('gcp_path',help='location of atac-seq croo outputs',type=str) - parser.add_argument('output_path', help='output path, where you want the outputs to be written', type=str) - parser.add_argument('wfids', help='file containing list of workflow ids,usually all workflows for a specific tissue') - parser.add_argument('gcp_project', help='file containing list of workflow ids') - args = parser.parse_args() - main(args.gcp_path,args.output_path,args.wfids,args.gcp_project) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="This script generates the replicate to sample mapping file to " + "interpret the qc report" + ) + parser.add_argument("gcp_path", help="location of atac-seq croo outputs", type=str) + parser.add_argument( + "output_path", + help="output path, where you want the outputs to be written", + type=str, + ) + parser.add_argument( + "wfids", + help="file containing list of workflow ids,usually all workflows for a specific " + "tissue", + ) + parser.add_argument("gcp_project", help="file containing list of workflow ids") + + args = parser.parse_args() + + main(args.gcp_path, args.output_path, args.wfids, args.gcp_project) diff --git a/src/encode_to_count_matrix.sh b/src/encode_to_count_matrix.sh index f58658f..8117e82 100644 --- a/src/encode_to_count_matrix.sh +++ b/src/encode_to_count_matrix.sh @@ -66,7 +66,7 @@ echo "Success! done concatenating peak files from all tissues" #cat $(find -path "./peak/*narrowPeak.gz") > ${outdir}/overlap.optimal_peak.narrowPeak.bed.gz #truncate peaks to 200 bp around summit -python ${srcdir}/truncate_narrowpeak_200bp_summit.py --infile ${outdir}/overlap.optimal_peak.narrowPeak.bed.gz --outfile ${outdir}/overlap.optimal_peak.narrowPeak.200.bed.gz +python3 ${srcdir}/truncate_narrowpeak_200bp_summit.py --infile ${outdir}/overlap.optimal_peak.narrowPeak.bed.gz --outfile ${outdir}/overlap.optimal_peak.narrowPeak.200.bed.gz echo "Success! finished truncating peaks" # sort and merge peaks --> master peak file @@ -78,6 +78,8 @@ mkdir -p ${indir}/${final_results_dir}/counts_matrix intersect_tag () { local tag=$1 local results_dir=$(ls|grep "final") #assumes the results folder has the word final + #below loction was used for pass1ac + #local results_dir=$(ls|grep "pass1c_merged_counts_v2") #echo "results dir is" ${results_dir} #echo "tag is" ${tag} local viallabel=$(basename $tag | sed "s/_.*//") @@ -90,11 +92,9 @@ for i in `cat ${batch_file}`;do tag_align=$(ls ${indir}/$i/Output/final/tagalign/*tagAlign.gz) echo ${tag_align} echo ${final_results_dir} - parallel --verbose --jobs ${cores} intersect_tag ::: $(echo ${tag_align}) + parallel --verbose --progress --bar --jobs ${cores} intersect_tag ::: $(echo ${tag_align}) done -#tag_align=$(find -path "./tagalign/*tagAlign.gz") -#parallel --verbose --jobs ${cores} intersect_tag ::: $(echo ${tag_align}) echo -e $'chrom\tstart\tend' > ${indir}/${final_results_dir}/index cat ${outdir}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed >> ${indir}/${final_results_dir}/index @@ -105,8 +105,8 @@ cat ${outdir}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed >> ${indir}/ cd ${indir}/${final_results_dir}/counts_matrix ls *|awk -F "." 
'{print $2}'|awk '{print substr($1,8,2)}'|cut -f1|sort|uniq >>${indir}/${final_results_dir}/tmp_tids.txt for i in `cat ${indir}/${final_results_dir}/tmp_tids.txt`;do - paste ${indir}/${final_results_dir}/index counts.*$i??.txt >${indir}/${final_results_dir}/T$i.atac.counts.txt - gzip ${indir}/${final_results_dir}/T$i.atac.counts.txt + paste ${indir}/${final_results_dir}/index counts.*$i??.txt >${indir}/${final_results_dir}/t$i.atac.counts.txt + gzip ${indir}/${final_results_dir}/t$i.atac.counts.txt done rm ${indir}/${final_results_dir}/tmp_tids.txt rm ${indir}/${final_results_dir}/index diff --git a/src/encode_to_count_matrix_all_tissues_pass.sh b/src/encode_to_count_matrix_all_tissues_pass.sh new file mode 100644 index 0000000..a4974de --- /dev/null +++ b/src/encode_to_count_matrix_all_tissues_pass.sh @@ -0,0 +1,79 @@ +#!/bin/bash +set -Eeuxo pipefail +trap "echo ERR trap fired!" ERR +# Author: Nicole Gay, Anna Scherbina +# Updated: 7 May 2020 +# Script : encode_to_count_matrix.sh +# Purpose: generate peak x sample read counts matrix from tagAlign and hammock files +# Run pass_extract_from_gcp.sh before this + +#get a merged peak file + +#module load miniconda/3 # for python3 +#module load bedtools + +############################################################ +## USER-DEFINED VARIABLES +#indir=~/test_mnt/PASS/atac-seq/stanford/batch5_20200929/Output/final # same as ${base} from running pass_extract_from_gcp.sh +#indir=~/test_mnt/PASS/atac-seq/stanford +indir=~/test_mnt/PASS/atac-seq +srcdir=~/motrpac-atac-seq-pipeline/src # directory with truncate_narrowpeak_200bp_summit.py +#batch_file=/home/araja7/motrpac-atac-seq-pipeline_code_dev/src/test_batch.txt +batch_file=/home/araja7/pp/test_batch.txt +final_results_dir=test_merge_pass1ac +#3 worked on gcp , 8 crashes the system current gcp vm with 60gb ram +cores=3 # number of cores allocated for parallelization +# need ~25G per core. 10G was too low +############################################################ + +outdir=${indir}/${final_results_dir}/merged_peaks +mkdir -p ${outdir} +cd ${indir} +#concatenate peaks (narrowpeak.gz) +for i in `cat ${batch_file}`;do + cat ${indir}/$i/Output/final/peak/*narrowPeak.gz >>${outdir}/overlap.optimal_peak.narrowPeak.bed.gz +done +echo "Success! done concatenating peak files from all tissues" + +#truncate peaks to 200 bp around summit +python ${srcdir}/truncate_narrowpeak_200bp_summit.py --infile ${outdir}/overlap.optimal_peak.narrowPeak.bed.gz --outfile ${outdir}/overlap.optimal_peak.narrowPeak.200.bed.gz +echo "Success! finished truncating peaks" + +# sort and merge peaks --> master peak file +zcat ${outdir}/overlap.optimal_peak.narrowPeak.200.bed.gz | bedtools sort | bedtools merge > ${outdir}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed +echo "Success! 
Finished sorting and merging" + +# intersect with tagalign files +mkdir -p ${indir}/${final_results_dir}/counts_matrix +intersect_tag () { + local tag=$1 + local results_dir=$(ls|grep "final") + #echo "results dir is" ${results_dir} + #echo "tag is" ${tag} + local viallabel=$(basename $tag | sed "s/_.*//") + echo ${viallabel} > ${results_dir}/counts_matrix/counts.${viallabel}.txt + bedtools coverage -nonamecheck -counts -a ${results_dir}/merged_peaks/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed -b ${tag} | cut -f4 >> ${results_dir}/counts_matrix/counts.${viallabel}.txt +} +export -f intersect_tag + +for i in `cat ${batch_file}`;do + tag_align=$(ls $i/Output/final/tagalign/*tagAlign.gz) + echo ${tag_align} + echo ${final_results_dir} + parallel --verbose --jobs ${cores} intersect_tag ::: $(echo ${tag_align}) +done + +echo -e $'chrom\tstart\tend' > ${indir}/${final_results_dir}/index +cat ${outdir}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed >> ${indir}/${final_results_dir}/index + +#split the results counts matrix by tissue +#to do : reimplement in python +cd ${indir}/${final_results_dir}/counts_matrix +ls *|awk -F "." '{print $2}'|awk '{print substr($1,8,2)}'|cut -f1|sort|uniq >>${indir}/${final_results_dir}/tmp_tids.txt +for i in `cat ${indir}/${final_results_dir}/tmp_tids.txt`;do + paste ${indir}/${final_results_dir}/index counts.*$i??.txt >${indir}/${final_results_dir}/t$i.atac.counts.txt + gzip ${indir}/${final_results_dir}/t$i.atac.counts.txt +done +rm ${indir}/${final_results_dir}/tmp_tids.txt +rm ${indir}/${final_results_dir}/index +echo "Success generating counts matrix" diff --git a/src/encode_to_count_matrix_human.sh b/src/encode_to_count_matrix_human.sh new file mode 100644 index 0000000..fa30c45 --- /dev/null +++ b/src/encode_to_count_matrix_human.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -Eeuxo pipefail +# Author: Nicole Gay, Anna Scherbina +# Updated: 7 May 2020 +# Script : encode_to_count_matrix.sh ${in_dir} ${src_dir} ${cores} ${final_results_dir} +# Purpose: generate peak x sample read counts matrix from tagAlign and hammock files +# Run pass_extract_from_gcp.sh before this + +#get a merged peak file + +############################################################ +## USER-DEFINED VARIABLES +#Examples for user defined input arguments +#in_dir=~/test_mnt/PASS/atac-seq/stanford #base path to the location of outputs from all batches +#src_dir=~/motrpac-atac-seq-pipeline/src # directory with truncate_narrowpeak_200bp_summit.py +#batch_file=/home/araja7/motrpac-atac-seq-pipeline_code_dev/src/test_batch.txt #file containing the list of batches to merge the peak files +#example contents of batch file +#batch5_2020092 +#final_results_dir=pass1b_atac_final +#3 worked on gcp , 8 crashes the system current gcp vm with 60gb ram +#cores=3 # number of cores allocated for parallelization +# need ~25G per core. 10G was too low +in_dir=$1 +src_dir=$2 +cores=$3 +############################################################ + +#make the same code usable for generating counts from single or multiple batches + +in_dir=${in_dir%/} +src_dir=${src_dir%/} + +OUT_DIR=${in_dir}/merged_peaks +echo "$OUT_DIR" +mkdir -p "$OUT_DIR" + +cd "$in_dir" + +#concatenate peaks (narrowpeak.gz) +cat "${in_dir}"/croo/final/peak/*narrowPeak.gz >>"${OUT_DIR}/overlap.optimal_peak.narrowPeak.bed.gz" +echo "Success! 
done concatenating peak files from all tissues" + +# concatenate peaks (narrowpeak.gz) +#cat $(find -path "./peak/*narrowPeak.gz") > ${OUT_DIR}/overlap.optimal_peak.narrowPeak.bed.gz + +#truncate peaks to 200 bp around summit +python "${src_dir}/truncate_narrowpeak_200bp_summit.py" --infile "${OUT_DIR}/overlap.optimal_peak.narrowPeak.bed.gz" --outfile "${OUT_DIR}/overlap.optimal_peak.narrowPeak.200.bed.gz" +echo "Success! finished truncating peaks" + +# sort and merge peaks --> master peak file +zcat "${OUT_DIR}/overlap.optimal_peak.narrowPeak.200.bed.gz" | bedtools sort | bedtools merge >"${OUT_DIR}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed" +echo "Success! Finished sorting and merging" + +# intersect with tagalign files +mkdir -p "${in_dir}/counts_matrix" + +intersect_tag() { + local TAG=$1 + local VIAL_LABEL + + VIAL_LABEL=$(basename "$TAG" | sed "s/_.*//") + echo "$VIAL_LABEL" >"${in_dir}/counts_matrix/counts.${VIAL_LABEL}.txt" + bedtools coverage -nonamecheck -counts -a "${in_dir}/merged_peaks/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed" -b "$TAG" | cut -f4 >>"${in_dir}/counts_matrix/counts.${VIAL_LABEL}.txt" +} + +export in_dir +export -f intersect_tag + +# shellcheck disable=SC2046 +parallel --verbose --progress --bar --jobs "$cores" intersect_tag ::: $(ls "${in_dir}"/tagalign/*tagAlign.gz) + +echo -e $'chrom\tstart\tend' >"${in_dir}/index" +cat "${OUT_DIR}/overlap.optimal_peak.narrowPeak.200.sorted.merged.bed" >>"${in_dir}/index" + +#split the results counts matrix by tissue +#to do : reimplement in python + +cd "${in_dir}/counts_matrix" +ls ./* | awk -F "." '{print $2}' | awk '{print substr($1,8,2)}' | cut -f1 | sort | uniq >>"${in_dir}/tmp_tids.txt" + +while IFS= read -r line; do + paste "${in_dir}"/index counts.*"${line}"??.txt >"${in_dir}/T${line}.atac.counts.txt" + gzip "${in_dir}/T${line}.atac.counts.txt" +done <"${in_dir}/tmp_tids.txt" + +rm "${in_dir}/tmp_tids.txt" +rm "${in_dir}/index" + +echo "Success generating counts matrix" diff --git a/src/extract_atac_from_gcp_human.sh b/src/extract_atac_from_gcp_human.sh new file mode 100644 index 0000000..abc058e --- /dev/null +++ b/src/extract_atac_from_gcp_human.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Nicole Gay, modified by Archana to work on copying outputs to GCP and locally +# 6 May 2020 +# +# description: copy necessary outputs from GCP to final destination on GCP or local cluster +# +# Usage: bash pass_extract_atac_from_gcp.sh ${NUM_CORES} ${DOWNLOAD_DIR_without_trailing_slash} ${gcp_path_to_atac_croo_outputs_without_trailing_slash} ${copy_dest} +# +### Expected numbers of files: +## +## PASS1A: +# signal : 98 * {N_tissues} +# peak : 40 * {N_tissues} +# qc : 1 +# tagAlign : 80 * {N_tissues} +## +## PASS1B: +# signal : 62 * {N_tissues} +# peak : 24 * {N_tissues} +# qc : 12 +# tagAlign : 52 * {N_tissues} + +if [ $# -lt 3 ]; then + echo "Usage: ./pass_extract_atac_from_gcp.sh [NUM_CORES] [CROO_OUTPUT_PATH] [DOWNLOAD_DIR]" + echd + echo "Example: pass_extract_atac_from_gcp.sh 4 gs://my-bucket/my_workflow/outputs/croo gs://my-bucket/my_workflow/processed" + echo + echo "[NUM_CORES]: The number of cores to use" + echo "[DOWNLOAD_DIR]: The directory to output the files to" + echo "[CROO_OUTPUT_PATH]: The location of the croo outputs" + echo + exit 1 +fi + +NUM_CORES=$1 # number of processes to run in parallel +DOWNLOAD_DIR=$2 # gcp or local output directory path without trailing slash +CROO_OUTPUT_PATH=$3 # gcp path to croo outputs without trailing slash + +if [[ $CROO_OUTPUT_PATH == gs://* ]]; then + 
copy_dest="gcp" +else + copy_dest="local" +fi + +CROO_OUTPUT_PATH=${CROO_OUTPUT_PATH%/} +DOWNLOAD_DIR=${DOWNLOAD_DIR%/} + +if [[ "$copy_dest" == "gcp" ]]; then + + # individual tagAlign files + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/align/rep?/*tagAlign.gz" "${DOWNLOAD_DIR}/tagalign/" + + # individual signal track (p-value) + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig" "${DOWNLOAD_DIR}/signal/" +else + cd "$DOWNLOAD_DIR" || (echo "ERROR: could not cd to $DOWNLOAD_DIR" && exit 1) + mkdir -p qc peak signal tagalign + + # rep-to-sample map + gsutil cp -n "${CROO_OUTPUT_PATH}/rep_to_sample_map.csv" . + # merged QC + gsutil cp -n "${CROO_OUTPUT_PATH}/*qc*" qc + + # individual tagAlign files + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/align/rep?/*tagAlign.gz" tagalign + + # individual signal track (p-value) + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig" signal +fi + +gs_copy() { + local dir=$1 + local subdir + local condition + + subdir=$(gsutil ls "$dir") + condition=$(basename "${subdir%/}") + + # merged peak file + gsutil cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz peak/${condition}.overlap.optimal_peak.narrowPeak.gz" + gsutil cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz" + + # pooled signal track + if [[ $condition != *"GET-STDRef-Set"* ]]; then + gsutil cp -n "${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig signal/${condition}.pooled.pval.signal.bigwig" + fi + + # qc.html + gsutil cp -n "${subdir}qc/qc.html" "qc/${condition}.qc.html" +} + +gs_copy_gcp() { + local dir=$1 + local subdir + local condition + local out_dir + + subdir=$(gsutil ls "$dir" | grep -v 'rep_to_sample_map.csv\|tagalign') + condition=$(basename "${subdir%/}") + out_dir="$(dirname "$dir")/final" + + # merged peak file + gsutil -m cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz" "${out_dir}/peak/${condition}.overlap.optimal_peak.narrowPeak.gz" + gsutil -m cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz" "${out_dir}/peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz" + + # pooled signal track + if [[ $condition != *"GET-STDRef-Set"* ]]; then + gsutil -m cp -n "${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig" "${out_dir}/signal/${condition}.pooled.pval.signal.bigwig" + fi + + # qc.html + gsutil cp -n "${subdir}qc/qc.html" "${out_dir}/qc/${condition}.qc.html" +} + +if [[ "$copy_dest" == "gcp" ]]; then + export -f gs_copy_gcp + parallel --progress --bar --verbose --jobs "$NUM_CORES" gs_copy_gcp ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" +else + export -f gs_copy + parallel --progress --bar --verbose --jobs "$NUM_CORES" gs_copy ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" +fi diff --git a/src/extract_atac_from_gcp_pass.sh b/src/extract_atac_from_gcp_pass.sh new file mode 100644 index 0000000..e0d69ac --- /dev/null +++ b/src/extract_atac_from_gcp_pass.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Nicole Gay, modified by Archana to work on copying outputs to GCP and locally +# 6 May 2020 +# +# description: copy necessary outputs from GCP to final destination on GCP or local cluster +# +# Usage: bash pass_extract_atac_from_gcp.sh ${NUM_CORES} ${CROO_OUTPUT_PATH} ${DOWNLOAD_DIR} +# +### Expected numbers of files: +## +## PASS1A: +# signal : 98 * {N_tissues} +# peak : 
40 * {N_tissues} +# qc : 1 +# tagAlign : 80 * {N_tissues} +## +## PASS1B: +# signal : 62 * {N_tissues} +# peak : 24 * {N_tissues} +# qc : 12 +# tagAlign : 52 * {N_tissues} + +if [ $# -lt 3 ]; then + echo "Usage: ./pass_extract_atac_from_gcp.sh [NUM_CORES] [CROO_OUTPUT_PATH] [DOWNLOAD_DIR]" + echd + echo "Example: pass_extract_atac_from_gcp.sh 4 gs://my-bucket/my_workflow/outputs/croo gs://my-bucket/my_workflow/processed" + echo + echo "[NUM_CORES]: The number of cores to use" + echo "[DOWNLOAD_DIR]: The directory to output the files to" + echo "[CROO_OUTPUT_PATH]: The location of the croo outputs" + echo + exit 1 +fi + +NUM_CORES=$1 # number of processes to run in parallel +DOWNLOAD_DIR=$2 # gcp or local output directory path without trailing slash +CROO_OUTPUT_PATH=$3 # gcp path to croo outputs without trailing slash + +if [[ $CROO_OUTPUT_PATH == gs://* ]]; then + copy_dest="gcp" +else + copy_dest="local" +fi + +CROO_OUTPUT_PATH=${CROO_OUTPUT_PATH%/} +DOWNLOAD_DIR=${DOWNLOAD_DIR%/} + +if [[ "$copy_dest" == "gcp" ]]; then + + # individual tagAlign files + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/align/rep?/*tagAlign.gz" "${DOWNLOAD_DIR}/tagalign/" + + # individual signal track (p-value) + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig" "${DOWNLOAD_DIR}/signal/" +else + cd "$DOWNLOAD_DIR" || (echo "ERROR: could not cd to $DOWNLOAD_DIR" && exit 1) + mkdir -p qc peak signal tagalign + + # rep-to-sample map + gsutil cp -n "${CROO_OUTPUT_PATH}/rep_to_sample_map.csv" . + # merged QC + gsutil cp -n "${CROO_OUTPUT_PATH}/*qc*" qc + + # individual tagAlign files + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/align/rep?/*tagAlign.gz" tagalign + + # individual signal track (p-value) + gsutil -m cp -n "${CROO_OUTPUT_PATH}/*/*/signal/rep?/*pval.signal.bigwig" signal +fi + +gs_copy() { + local dir=$1 + local subdir + local condition + + subdir=$(gsutil ls "$dir") + condition=$(basename "${subdir%/}") + + # merged peak file + gsutil cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz" "peak/${condition}.overlap.optimal_peak.narrowPeak.gz" + gsutil cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz" "peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz" + + # pooled signal track + if [[ $condition != *"GET-STDRef-Set"* ]]; then + gsutil cp -n "${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig" "signal/${condition}.pooled.pval.signal.bigwig" + fi + + # qc.html + gsutil cp -n "${subdir}qc/qc.html" "qc/${condition}.qc.html" +} + +gs_copy_gcp() { + local dir=$1 + local subdir + local condition + local out_dir + + subdir=$(gsutil ls "$dir" | grep -v 'rep_to_sample_map.csv\|tagalign') + condition=$(basename "${subdir%/}") + out_dir="$(dirname "$dir")/final" + + # merged peak file + gsutil -m cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz" "${out_dir}/peak/${condition}.overlap.optimal_peak.narrowPeak.gz" + gsutil -m cp -n "${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz" "${out_dir}/peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz" + + # pooled signal track + if [[ $condition != *"GET-STDRef-Set"* ]]; then + gsutil -m cp -n "${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig" "${out_dir}/signal/${condition}.pooled.pval.signal.bigwig" + fi + + # qc.html + gsutil cp -n "${subdir}qc/qc.html" "${out_dir}/qc/${condition}.qc.html" +} + +if [[ "$copy_dest" == "gcp" ]]; then + export -f gs_copy_gcp + parallel 
--progress --bar --verbose --jobs "$NUM_CORES" gs_copy_gcp ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" +else + export -f gs_copy + parallel --progress --bar --verbose --jobs "$NUM_CORES" gs_copy ::: "$(gsutil ls "$CROO_OUTPUT_PATH" | grep -E "/$" | grep -v "final")" +fi diff --git a/src/extract_rep_names_from_encode.sh b/src/extract_rep_names_from_encode.sh index 466a341..1ab0da4 100644 --- a/src/extract_rep_names_from_encode.sh +++ b/src/extract_rep_names_from_encode.sh @@ -4,11 +4,16 @@ # pull replicate names from ATAC pipeline stucture #Usage : bash src/extract_rep_names_from_encode.sh -run_dir=$1 -out_gcp_path=$2 +RUN_DIR=$1 +OUT_GCP_PATH=$2 -for dir in $(gsutil ls ${run_dir}); do - # get condition name +random_string=$(openssl rand -hex 8) +mkdir "$random_string" +cd "$random_string" || exit + +function get_rep_names() { + local dir=$1 + # get condition name gsutil cp $(gsutil ls ${dir}call-qc_report/glob*/qc.json) . condition=$(grep "description" qc.json | sed 's/.*: "//' | sed 's/".*//') if [[ $condition == *" "* ]]; then @@ -29,9 +34,14 @@ for dir in $(gsutil ls ${run_dir}); do echo "${condition},rep${rep},${viallabel}" >> rep_to_sample_map.csv rep=$((rep + 1)) done -done +} + +export -f get_rep_names + +parallel --progress --bar --verbose --jobs 12 get_rep_names ::: $(gsutil ls ${RUN_DIR}) #copy outputs to gcp and delete the local copies -gsutil -m cp -r rep_to_sample_map.csv ${out_gcp_path}/ -rm -rf rep_to_sample_map.csv -rm -rf qc.json +gsutil -m cp -r rep_to_sample_map.csv ${OUT_GCP_PATH}/ + +cd .. +rm -rf "$random_string" diff --git a/src/get_execution_status.py b/src/get_execution_status.py new file mode 100644 index 0000000..10f024c --- /dev/null +++ b/src/get_execution_status.py @@ -0,0 +1,181 @@ +import asyncio +import json +import argparse +from datetime import datetime +from pathlib import Path +from typing import Dict + +from aiohttp import ClientSession +from tqdm import tqdm + + +async def get_status( + workflow_id: str, session: ClientSession +) -> Dict[str, Dict[str, str]]: + """ + This function gets the status of a workflow. 
+ :param workflow_id: The workflow ID + :param session: The aiohttp session + :return: + """ + res = await session.get( + f"http://localhost:8000/api/workflows/v1/query", + params={"id": workflow_id, "additionalQueryResultFields": "labels"}, + ) + res_json = await res.json() + # get the status of the workflow + wf_metadata = res_json["results"][0] + status = wf_metadata["status"] + # and the label of the workflow (the submitted JSON filename or other custom label) + labels = wf_metadata.get("labels", {}) + label = labels.get("caper-str-label", None) + return {workflow_id: {"status": status, "label": label}} + + +async def get_metadata(workflow_id, session: ClientSession): + res = await session.get( + f"http://localhost:8000/api/workflows/v1/{workflow_id}/metadata" + ) + res_json = await res.json() + calls = res_json["calls"] + calls_dict = {} + # iterate through the various call attempts + for call_name, call_attempts in calls.items(): + attempts = (0, 1) + # find the latest attempt and its index (sometimes the latest attempt comes out + # of order, although this is rare, we need to account for it) + for i, attempt in enumerate(call_attempts): + # compare the latest attempt to the current attempt + is_latest = max(attempts[1], attempt["attempt"]) + if is_latest: + attempts = (i, attempt["attempt"]) + + # set the latest attempt + latest_task_attempt = call_attempts[attempts[0]] + + # get the status of the latest attempt + task_status = latest_task_attempt["executionStatus"] + # we only want to fetch the rest of the metadata if the task is running + if task_status != "Done": + task_dict = { + "attempt": latest_task_attempt["attempt"], + "cromwell_status": task_status, + "backend_status": latest_task_attempt.get("backendStatus"), + "logs": latest_task_attempt.get("backendLogs", {}).get("log", None), + } + task_execution_events = latest_task_attempt.get("executionEvents", None) + if task_execution_events: + task_dict["execution_events"] = task_execution_events + else: + task_dict = task_status + + calls_dict[call_name] = task_dict + + return calls_dict + + +async def loop_execution_status(submission_map_fp: str): + """ + This function loops over the workflow IDs in the submission map and prints the status + continually until all workflows are done. 
+ + :param submission_map_fp: The path to the submission map file (created by submit.sh) + """ + submit_fp = Path(submission_map_fp) + # load all the workflow IDs that we want to monitor + with open(submit_fp, "r") as f: + submission_map = json.load(f) + + num_submissions = len(submission_map) + + # we want to keep track of whether we are on the first loop to overwrite stdout + first_loop = True + # keep track of the number of previous lines, again for overwriting stdout + prev_line_count = 0 + prev_submissions = set() + pbar = tqdm(total=num_submissions, desc="Workflow status", unit=" workflows") + + while True: + # get the status of all the workflows + async with ClientSession() as session: + futures = [] + for pair in submission_map: + t = asyncio.create_task(get_status(pair["workflow_id"], session)) + futures.append(t) + # gather all the futures + finished = await asyncio.gather(*futures) + + # create a single dictionary of all the workflow IDs and their statuses and + # filter workflows that are not running + status_dict = { + k: v + for d in finished + for k, v in d.items() + if v.get("status") != "Succeeded" and v.get("status") != "Failed" + } + + iteration_submissions = set(status_dict.keys()) + pbar.update(num_submissions - len(status_dict)) + finished_submissions = list(prev_submissions.difference(iteration_submissions)) + if len(finished_submissions) > 0: + pbar.write(f"Finished workflows: {finished_submissions} at {datetime.now()}") + + # break if there are no more workflows that have not "succeeded" or "failed" + if len(status_dict) == 0: + print("\x1b[1A\x1b[2K" * prev_line_count) + print("All workflows are done!") + break + + # get metadata of non-succeeded workflows only + async with ClientSession() as session: + futures = [] + # submit our futures + for k in status_dict.keys(): + t = asyncio.create_task(get_metadata(k, session)) + futures.append(t) + # gather all the futures + finished = await asyncio.gather(*futures) + + # add a calls key for metadata about the latest calls + for k, v in zip(status_dict.keys(), finished): + status_dict[k]["calls"] = v + + # sort the workflows by their status + sorted_status_dict = { + k: v + for k, v in sorted(status_dict.items(), key=lambda item: item[1]["status"]) + } + + fmt_status_dict = json.dumps(sorted_status_dict, indent=4) + + # we want to erase what we wrote previously by sending the escape sequence + # the number of times we wrote to stdout previously + if not first_loop: + print("\x1b[1A\x1b[2K" * prev_line_count) + + # print the status of the workflows + print(fmt_status_dict) + # update the number of lines we wrote to stdout + first_loop = False + prev_line_count = len(fmt_status_dict.splitlines()) + 1 + # pause for a bit + await asyncio.sleep(15) + + +def main(): + parser = argparse.ArgumentParser( + description="This script renders the execution " "status of the pipeline" + ) + parser.add_argument( + "workflow_id_file", help="Workflow ID map, generated by submit.sh", type=str + ) + args = parser.parse_args() + try: + asyncio.run(loop_execution_status(args.workflow_id_file)) + except KeyboardInterrupt: + print("KeyboardInterrupt received, exiting...") + exit() + + +if __name__ == "__main__": + main() diff --git a/src/human_encode_rep_names_from_croo.py b/src/human_encode_rep_names_from_croo.py new file mode 100644 index 0000000..94dd2b8 --- /dev/null +++ b/src/human_encode_rep_names_from_croo.py @@ -0,0 +1,95 @@ +import argparse + +import gcsfs +import pandas as pd +from tqdm import tqdm + + +# This script generates 
replicate to sample mapping file to interpret the qc report. It +# was developed as an alternative to extract_rep_names_from_encode.sh which might not +# work if the pipeline run_dir had outputs for a specific condition in multiple +# workflow ids, this happens when a workflow is rerun on gcp, where the metadata.json +# file generated by caper points to the completed paths from failed workflow ids. This +# script uses the c roo.filetable.*.tsv to generate the mapping file + +# Usage: python3 encode_rep_names_from_croo.py +# +# + + +def main(gcp_path: str, output_path: str, workflow_id_fp: str, gcp_project: str): + fs = gcsfs.GCSFileSystem(project=gcp_project) + wfl = [line.strip() for line in open(workflow_id_fp, "r")] + + cond_l = [] + rep_l = [] + + sample_list = [] + + for i in (pbar := tqdm(wfl)): + workflow_path = f"{gcp_path.rstrip('/')}/{i}" + pbar.set_description(f"Processing {i} at {workflow_path}") + + in_path = fs.glob(f"{workflow_path}/**/croo.filetable*.tsv") + pbar.write(f"Found {len(in_path)} filetable files: {in_path}") + + if len(in_path) == 0: + print(f"Skip {i} since no filetable found or exit script?") + print(f"Enter y/n") + user_input = input() + user_input = user_input.lower() + if user_input == "y" or user_input == "yes": + continue + else: + exit() + + filetable_filename = f"gs://{in_path[0]}" + + df_in = pd.read_csv(filetable_filename, sep="\t", header=None) + filtered = df_in[df_in[0].str.contains("Raw BAM from aligner")] + + cond = filetable_filename.split("/")[-2] + + for vals in filtered[1]: + split_vals = vals.split("/") + cond_l.append(cond) + rep = split_vals[-2] + rep_l.append(rep) + sample = (split_vals[-1]).split("_R1.trim.bam")[0] + sample_list.append(sample) + + pbar.write(f"Found {len(filtered[1])} values, finished {i}\n") + + out_df = pd.DataFrame() + out_df["cond_l"] = cond_l + out_df["rep_l"] = rep_l + out_df["sample_list"] = sample_list + + print(out_df.head()) + out_file = f"{output_path.rstrip('/')}/rep_to_sample_map.csv" + out_df.to_csv(out_file, index=False, header=False) + + print("Success! 
Finished writing the mapping file") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="This script generates the replicate to sample mapping file to " + "interpret the qc report" + ) + parser.add_argument("gcp_path", help="location of atac-seq croo outputs", type=str) + parser.add_argument( + "output_path", + help="output path, where you want the outputs to be written", + type=str, + ) + parser.add_argument( + "wfids", + help="file containing list of workflow ids,usually all workflows for a specific " + "tissue", + ) + parser.add_argument("gcp_project", help="file containing list of workflow ids") + + args = parser.parse_args() + + main(args.gcp_path, args.output_path, args.wfids, args.gcp_project) diff --git a/src/make_json_singleton.sh b/src/make_json_singleton.sh index f7c1790..6f0b49a 100644 --- a/src/make_json_singleton.sh +++ b/src/make_json_singleton.sh @@ -3,59 +3,92 @@ # Write JSON files for ENCODE ATAC pipeline input # Assumes that reads are paired-end -set -e +set -eu -json_base=$1 -fastq_dir=$2 -json_dir=$3 +if [ $# -lt 3 ]; then + echo "Usage: ./make_json.sh [JSON_BASE] [FASTQ_DIR] [JSON_DIR]" + echo + echo "Example: make_json.sh base.json /home/user/fastq jsons" + echo + echo "[JSON_BASE]: The base JSON file to use" + echo "[FASTQ_DIR]: This directory with raw FASTQ files" + echo "[JSON_DIR]: The directory to output the JSON files to" + echo + exit 1 +fi -mkdir -p ${json_dir} +JSON_BASE=$1 +FASTQ_DIR=$2 +JSON_DIR=$3 -samples=$(ls $fastq_dir | grep "fastq.gz" | grep -v "Undetermined" | grep -E "R1|R2" | sed "s/_R.*//" | sort | uniq) -echo $samples +mkdir -p "$JSON_DIR" + +# assign the ls command based on the FastQ directory's prefix +if [[ $FASTQ_DIR == gs://* ]]; then + ls_command="gsutil ls" +else + ls_command="ls" +fi + +samples=$($ls_command "$FASTQ_DIR" | grep "fastq.gz" | grep -v "Undetermined" | grep -E "R1|R2" | sed "s/_R.*//" | sort | uniq) +echo "$samples" + +create_sample_json() { + sample=$1 + + # name JSON file from FASTQ sample name + json_file=$JSON_DIR/$(basename "$sample").json + + printf "{\n \"atac.description\": \"%s\",\n" "$(basename "$sample")" >"$json_file" + + # standard parameters for this project + cat "$JSON_BASE" >>"$json_file" + + # add paths to FASTQ files + echo " \"atac.fastqs_rep1_R1\": [" >>"$json_file" + + lanes=$($ls_command "$FASTQ_DIR" | grep "$sample" | grep -c "R1") + counter=1 + for r1 in $($ls_command "$FASTQ_DIR" | grep "$sample" | grep "R1"); do + if [ "$counter" = "$lanes" ]; then + echo " \"$r1\"" >>"$json_file" + else + echo " \"$r1\"," >>"$json_file" + fi + counter=$((counter + 1)) + done + printf " ],\n" >>"$json_file" + + echo " \"atac.fastqs_rep1_R2\": [" >>"$json_file" + + counter=1 + for r2 in $($ls_command "$FASTQ_DIR" | grep "$sample" | grep "R2"); do + if [ "$counter" = "$lanes" ]; then + echo " \"$r2\"" >>"$json_file" + else + echo " \"$r2\"," >>"$json_file" + fi + counter=$((counter + 1)) + done + printf " ]\n}" >>"$json_file" +} + +N_JOBS=6 for i in $samples; do + create_sample_json "$i" & - # name JSON file from FASTQ sample name - json_file=${json_dir}/${i}.json - - echo "{" > ${json_file} - echo " \"atac.description\" : \"${i}\"," >> ${json_file} - - # standard parameters for this project - cat ${json_base} >> ${json_file} - - # add paths to FASTQ files - echo " \"atac.fastqs_rep1_R1\" : [" >> ${json_file} - - lanes=$(ls ${fastq_dir} | grep "$i" | grep "R1" | wc -l) - counter=1 - for r1 in $(ls ${fastq_dir} | grep "$i" | grep "R1"); do - if [ "$counter" = "$lanes" ]; then - echo " 
\"${fastq_dir}/${r1}\"" >> ${json_file} - else - echo " \"${fastq_dir}/${r1}\"," >> ${json_file} - fi - counter=$((counter +1)) - done - echo " ]," >> ${json_file} - echo >> ${json_file} - - echo " \"atac.fastqs_rep1_R2\" : [" >> ${json_file} - - counter=1 - for r2 in $(ls ${fastq_dir} | grep "$i" | grep "R2"); do - if [ "$counter" = "$lanes" ]; then - echo " \"${fastq_dir}/${r2}\"" >> ${json_file} - else - echo " \"${fastq_dir}/${r2}\"," >> ${json_file} - fi - counter=$((counter +1)) - done - echo " ]" >> ${json_file} - echo >> ${json_file} - - echo "}" >> ${json_file} + # allow to execute up to $N jobs in parallel + if [[ $(jobs -r -p | wc -l) -ge $N_JOBS ]]; then + # now there are $N jobs already running, so wait here for any job + # to be finished so there is a place to start next one. + wait -n + fi done +# no more jobs to be started but wait for pending jobs +# (all need to be finished) +wait + +echo "all done" diff --git a/src/merge_atac_qc.R b/src/merge_atac_qc.R index 35860f1..0f75006 100644 --- a/src/merge_atac_qc.R +++ b/src/merge_atac_qc.R @@ -9,105 +9,120 @@ library(data.table) library(optparse) option_list <- list( - make_option(c("-w", "--sample_meta"), help="Absolute path to wetlab sample metadata file, e.g. sample_metadata_YYYYMMDD.csv"), - make_option(c("-q", "--atac_qc"), help="Absolute path to pipeline qc metrics file output of qc2tsv tool, e.g. atac_qc.tsv"), - make_option(c("-m", "--sample_mapping_file"), help="Absolute path to replicate to sample mapping file, e.g. rep_to_sample_map.csv"), - make_option(c("-a", "--align_stats"), help="Absolute path to genome alignment stats file, e.g. merged_chr_info.csv"), - make_option(c("-o", "--outdir", help="Absolute path to output directory for the merged qc reports")) + make_option(c("-w", "--sample_meta"), help = "Absolute path to wetlab sample metadata file, e.g. sample_metadata_YYYYMMDD.csv"), + make_option(c("-q", "--atac_qc"), help = "Absolute path to pipeline qc metrics file output of qc2tsv tool, e.g. atac_qc.tsv"), + make_option(c("-m", "--sample_mapping_file"), help = "Absolute path to replicate to sample mapping file, e.g. rep_to_sample_map.csv"), + make_option(c("-a", "--align_stats"), help = "Absolute path to genome alignment stats file, e.g. merged_chr_info.csv"), + make_option(c("-o", "--outdir", help = "Absolute path to output directory for the merged qc reports")) ) -opt <- parse_args(OptionParser(option_list=option_list)) +opt_parse_inst <- OptionParser(option_list = option_list) -wet = fread(opt$sample_meta, sep=',', header=TRUE) -wet = unique(wet) # remove duplicate rows -encode = fread(opt$atac_qc, sep='\t', header=T) -rep_to_sample_map = fread(opt$sample_mapping_file, sep=',', header=F) -align_stat = fread(opt$align_stats,sep=',',header=T) +opt <- parse_args(opt_parse_inst) + +if (is.null(opt$sample_meta) | + is.null(opt$atac_qc) | + is.null(opt$sample_mapping_file) | + is.null(opt$align_stats) | + is.null(opt$outdir)) { + message("\033[31mERROR! 
Please provide all required arguments") + message("\033[34mExample: Rscript src/merge_atac_qc.R -w -q -m -a -o ") + print_help(opt_parse_inst) + quit("no") +} + +wet <- fread(opt$sample_meta, sep = ',', header = TRUE) +wet <- unique(wet) # remove duplicate rows +encode <- fread(opt$atac_qc, sep = '\t', header = T) +rep_to_sample_map <- fread(opt$sample_mapping_file, sep = ',', header = F) +align_stat <- fread(opt$align_stats, sep = ',', header = T) ################################################################################### ## fix ENCODE QC ################################################################################### # refstd are singletons; treat them differently -refstd = encode[grepl('STDRef', general.title)] -refstd[,general.description := general.title] -encode = encode[!grepl('STDRef', general.title)] +refstd <- encode[grepl('STDRef', general.title)] +refstd[, general.description := general.title] +encode <- encode[!grepl('STDRef', general.title)] -# format "general" to fix auto-format from qc2tsv +# format "general" to fix auto-format from qc2tsv # fix description -t1 = encode[1,general.description] -if(grepl(' ',t1)){ - t1 = encode[1,general.title] - encode[1,general.description := t1] +t1 <- encode[1, general.description] +if (grepl(' ', t1)) { + t1 <- encode[1, general.title] + encode[1, general.description := t1] } -for (i in 1:nrow(encode)){ - if(encode[i,general.description] == ''){ - encode[i,general.description := t1] - }else{ - t1 = encode[i,general.description] - if(grepl(' ',t1)){ - t1 = encode[i,general.title] - encode[i,general.description := t1] +for (i in seq_len(nrow(encode))) { + if (encode[i, general.description] == '') { + encode[i, general.description := t1] + }else { + t1 <- encode[i, general.description] + if (grepl(' ', t1)) { + t1 <- encode[i, general.title] + encode[i, general.description := t1] } } } # fix other "general" cols -cols = colnames(encode)[grepl('general',colnames(encode))] -cols = cols[cols != 'general.description'] -for (col in cols){ +cols <- colnames(encode)[grepl('general', colnames(encode))] +cols <- cols[cols != 'general.description'] +for (col in cols) { print(col) - t1 = encode[1,get(col)] - for (i in 1:nrow(encode)){ - if(encode[i,get(col)] == ''){ - encode[i,(col) := t1] - }else{ - t1 = encode[i,get(col)] + t1 <- encode[1, get(col)] + for (i in seq_len(nrow(encode))) { + if (is.na(encode[i, get(col)]) | as.character(encode[i, get(col)]) == '') { + encode[i, (col) := t1] + } else { + t1 <- encode[i, get(col)] } } } -# separate workflow-level and sample-level QC -workflow_level = colnames(encode)[unlist(encode[, lapply(.SD, function(x) any(is.na(x) | x == ''))])] -# add refstd back in -encode = rbindlist(list(encode, refstd)) -workflow_qc = encode[replicate == 'rep1',c('general.description',workflow_level),with=F] -viallabel_qc = encode[,colnames(encode)[!colnames(encode)%in%workflow_level],with=F] - +# separate workflow-level and sample-level QC +workflow_level <- colnames(encode)[unlist(encode[, lapply(.SD, function(x) any(is.na(x) | as.character(x) == ''))])] +# add refstd back in +encode <- rbindlist(list(encode, refstd)) +workflow_qc <- encode[replicate == 'rep1', c('general.description', workflow_level), with = F] +viallabel_qc <- encode[, colnames(encode)[!colnames(encode) %in% workflow_level], with = F] # match rep to viallabel -colnames(rep_to_sample_map) = c('general.description','replicate','viallabel') -dt = merge(viallabel_qc, rep_to_sample_map, by=c('general.description','replicate')) 
-stopifnot(nrow(dt)==nrow(viallabel_qc)) +colnames(rep_to_sample_map) <- c('general.description', 'replicate', 'viallabel') +dt <- merge(viallabel_qc, rep_to_sample_map, by = c('general.description', 'replicate')) +stopifnot(nrow(dt) == nrow(viallabel_qc)) ################################################################################### ## merge all sample-level QC ################################################################################### -# merge with wet lab QC -m1 = merge(dt, wet, by.x='viallabel', by.y='vial_label') +# merge with wet lab QC +print(dt$viallabel) +print(wet$vial_label) +m1 <- merge(dt, wet, by.x = 'viallabel', by.y = 'vial_label') stopifnot(nrow(m1) == nrow(dt)) -# merge with align stats -m2 = merge(m1, align_stat, by='viallabel') +# merge with align stats +m2 <- merge(m1, align_stat, by = 'viallabel') stopifnot(nrow(m2) == nrow(dt)) -# remove columns of all 0 or all 100 -check_col = function(x){ - if(is.numeric(x)){ - if(sum(as.numeric(x)) == 0 | all(x == 100)){ +# remove columns of all 0 or all 100 +check_col <- function(x) { + if (is.numeric(x)) { + if (sum(as.numeric(x)) == 0 | all(x == 100)) { return(x) } } return(NA) } -res = lapply(m2, check_col) -res = res[!is.na(res)] -m2[,names(res):=NULL] + +res <- lapply(m2, check_col) +res <- res[!is.na(res)] +m2[, names(res) := NULL] head(m2) -# write out merged QC +# write out merged QC outfile <- paste0(opt$outdir, '/', 'merged_atac_qc.tsv') -write.table(m2, file=outfile, sep='\t', col.names=T, row.names=F, quote=F) +write.table(m2, file = outfile, sep = '\t', col.names = T, row.names = F, quote = F) # write out workflow-level QC outfile_workflow <- paste0(opt$outdir, '/', 'encode_workflow_level_atac_qc.tsv') -write.table(workflow_qc, file=outfile_workflow, sep='\t', col.names=T, row.names=F, quote=F) +write.table(workflow_qc, file = outfile_workflow, sep = '\t', col.names = T, row.names = F, quote = F) diff --git a/src/merge_atac_qc_human.R b/src/merge_atac_qc_human.R new file mode 100644 index 0000000..03b330b --- /dev/null +++ b/src/merge_atac_qc_human.R @@ -0,0 +1,111 @@ +#!/bin/R +# Nicole Gay +# 15 May 2020 +# Fix and merge ATAC-seq QC + +#Usage : Rscript src/merge_atac_qc.R -w ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/sample_metadata_20200928.csv -q ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/qc/atac_qc.tsv -m ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/rep_to_sample_map.csv -a ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/merged_chr_info.csv -o ~/test_mnt/PASS/atac-seq/stanford/batch4_20200928/Output/final/ + +library(data.table) +library(optparse) +library(dplyr) +option_list <- list( + make_option(c("-w", "--sample_meta"), help = "Absolute path to wetlab sample metadata file, e.g. sample_metadata_YYYYMMDD.csv"), + make_option(c("-q", "--atac_qc"), help = "Absolute path to pipeline qc metrics file output of qc2tsv tool, e.g. atac_qc.tsv"), + make_option(c("-m", "--sample_mapping_file"), help="Absolute path to replicate to sample mapping file, e.g. rep_to_sample_map.csv"), + make_option(c("-a", "--align_stats"), help = "Absolute path to genome alignment stats file, e.g. 
merged_chr_info.csv"), + make_option(c("-o", "--outdir", help = "Absolute path to output directory for the merged qc reports")) + +) + +opt_parse_inst <- OptionParser(option_list = option_list) + +opt <- parse_args(opt_parse_inst) + +if (is.null(opt$sample_meta) | + is.null(opt$atac_qc) | + is.null(opt$sample_mapping_file) | + is.null(opt$align_stats) | + is.null(opt$outdir)) { + message("\033[31mERROR! Please provide all required arguments") + message("\033[34mExample: Rscript src/merge_atac_qc.R -w -q -m -a -o ") + print_help(opt_parse_inst) + quit("no") +} + +wet <- fread(opt$sample_meta, sep = ',', header = TRUE) +wet <- unique(wet) # remove duplicate rows +encode <- fread(opt$atac_qc, sep = '\t', header = T) +rep_to_sample_map = fread(opt$sample_mapping_file, sep=',', header=F) +align_stat <- fread(opt$align_stats, sep = ',', header = T) + +################################################################################### +## fix ENCODE QC +################################################################################### + +# fix other "general" cols +cols <- colnames(encode)[grepl('general', colnames(encode))] +cols <- cols[cols != 'general.description'] +for (col in cols) { + #print(col) + t1 <- encode[1, get(col)] + for (i in seq_len(nrow(encode))) { + if (as.character(encode[i, get(col)]) == '') { + encode[i, (col) := t1] + } else { + t1 <- encode[i, get(col)] + } + } +} +# separate workflow-level and sample-level QC +workflow_level <- colnames(encode)[unlist(encode[, lapply(.SD, function(x) any(is.na(x) | as.character(x) == ''))])] + +workflow_qc <- encode[replicate == 'rep1', c('general.title', workflow_level), with = F] +viallabel_qc <- encode[, colnames(encode)[!colnames(encode) %in% workflow_level], with = F] + +# match rep to viallabel +colnames(rep_to_sample_map) = c('general.description','replicate','viallabel') +dt = merge(viallabel_qc, rep_to_sample_map, by.x=c('general.title','replicate','general.description'),by.y=c('viallabel','replicate','general.description')) +stopifnot(nrow(dt)==nrow(viallabel_qc)) +print(head(dt)) +print(names(dt)) +print(names(wet)) +################################################################################### +## merge all sample-level QC +################################################################################### + +# merge with wet lab QC +m1 <- merge(dt, wet, by.x = 'general.title', by.y = 'vial_label') +stopifnot(nrow(m1) == nrow(dt)) +# merge with align stats +print(colnames(m1)) +print(colnames(align_stat)) +m2 <- merge(m1, align_stat, by.x = 'general.title', by.y = 'viallabel') +stopifnot(nrow(m2) == nrow(dt)) +# remove columns of all 0 or all 100 +check_col <- function(x) { + if (is.numeric(x)) { + if (sum(as.numeric(x)) == 0 | all(x == 100)) { + return(x) + } + } + return(NA) +} + +res <- lapply(m2, check_col) +res <- res[!is.na(res)] +m2[, names(res) := NULL] + +#head(m2) +#Adding in the viallabel column to make it more explicit +#m2 <- m2 %>% mutate(viallabel = general.title, .before=1) +m2$vial_label=m2$general.title +#m2 <- m2[, c("vial_label", names(m2)[!names(m2) %in% "vial_label"])] +new_m2 <- m2 %>% select(vial_label, everything()) +head(new_m2) +# write out merged QC +outfile <- paste0(trimws(opt$outdir, which = 'right', whitespace = '/'), '/', 'merged_atac_qc.tsv') +write.table(new_m2, file = outfile, sep = '\t', col.names = T, row.names = F, quote = F) + +# write out workflow-level QC +#outfile_workflow <- paste0(opt$outdir, '/', 'encode_workflow_level_atac_qc.tsv') +#write.table(workflow_qc, file = 
outfile_workflow, sep = '\t', col.names = T, row.names = F, quote = F) diff --git a/src/merge_jsons.py b/src/merge_jsons.py new file mode 100644 index 0000000..bd62832 --- /dev/null +++ b/src/merge_jsons.py @@ -0,0 +1,66 @@ +import argparse +import json +from pathlib import Path + + +def open_json(fp): + """ + Open a JSON file and return the object + :param fp: The path to the object + :return: The dictionary object + """ + with open(fp, mode="r", encoding="utf-8") as open_file: + obj = json.load(open_file) + + return obj + + +def main( + source_json_fp: str, target_json_fp: str, key: str = "label", output_fp: str = None +): + """ + Merge two JSON objects together. Used when you have failed tasks that you re-ran and + want to merge the failed tasks' output JSON with the old one. + + :param source_json_fp: The source JSON file + :param target_json_fp: The target JSON file (objects from this JSON array will overwrite source) + :param key: The key to merge on, defaults to "label" + :param output_fp: The output file path + :return: + """ + json_a = open_json(source_json_fp) + json_b = open_json(target_json_fp) + + merged_json_obj = [] + for orig_obj in json_a: + new_obj = ( + # find the object in json_b that has the same label as the original object + next((item for item in json_b if item[key] == orig_obj[key]), None) + # if it exists, use that object, otherwise use the original object + or orig_obj + ) + merged_json_obj.append(new_obj) + + # print the object to console (for jq) + print(json.dumps(merged_json_obj, indent=2)) + + + final_fp = output_fp or f"merged_{Path(source_json_fp).name}" + # write the object to a file + with open(final_fp, "w") as f: + f.write(json.dumps(merged_json_obj, indent=4)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + "Merge two JSON objects together. Used when you have failed tasks that you " + "re-ran and want to merge the failed tasks' output JSON with the old one." 
+ ) + parser.add_argument("source", help="The source JSON file") + parser.add_argument("target", help="The target JSON file (will be overwrite source)") + parser.add_argument("--key", help="The key to use for merging", default="label") + parser.add_argument("--output", help="The output file path", default=None) + + args = parser.parse_args() + + main(args.source, args.target, args.key) diff --git a/src/meta_pass_data_dict.txt b/src/meta_pass_data_dict.txt deleted file mode 100644 index 7c168d1..0000000 --- a/src/meta_pass_data_dict.txt +++ /dev/null @@ -1,68 +0,0 @@ -val key column -0h 1 sacrificeTime -0.5h 2 sacrificeTime -1h 3 sacrificeTime -4h 4 sacrificeTime -7h 5 sacrificeTime -24h 6 sacrificeTime -48h 7 sacrificeTime -1w 8 sacrificeTime -2w 9 sacrificeTime -4w 10 sacrificeTime -8w 11 sacrificeTime -female 1 sex -male 2 sex -Rat PaxGene RNA 30 sampleTypeCode -Rat Plasma 31 sampleTypeCode -Rat Packed Cells 32 sampleTypeCode -Rat Hippocampus 33 sampleTypeCode -Rat Cortex 34 sampleTypeCode -Rat Hypothalamus 35 sampleTypeCode -Rat Gastrocnemius 36 sampleTypeCode -Rat Vastus Lateralis 37 sampleTypeCode -Rat Tibia 38 sampleTypeCode -Rat Heart 39 sampleTypeCode -Rat Kidney 40 sampleTypeCode -Rat Adrenals 41 sampleTypeCode -Rat Colon 42 sampleTypeCode -Rat Spleen 43 sampleTypeCode -Rat Testes 44 sampleTypeCode -Rat Ovaries 45 sampleTypeCode -Rat Brown Adipose 46 sampleTypeCode -Rat White Adipose 47 sampleTypeCode -Rat Aorta 48 sampleTypeCode -Rat Lung 49 sampleTypeCode -Rat Small Intestine 50 sampleTypeCode -Rat Liver 51 sampleTypeCode -Rat Hippocampus Powder 52 sampleTypeCode -Rat Cortex Powder 53 sampleTypeCode -Rat Hypothalamus Powder 54 sampleTypeCode -Rat Gastrocnemius Powder 55 sampleTypeCode -Rat Vastus Lateralis Powder 56 sampleTypeCode -Rat Tibia Powder 57 sampleTypeCode -Rat Heart Powder 58 sampleTypeCode -Rat Kidney Powder 59 sampleTypeCode -Rat Adrenal Powder 60 sampleTypeCode -Rat Colon Powder 61 sampleTypeCode -Rat Spleen Powder 62 sampleTypeCode -Rat Testes Powder 63 sampleTypeCode -Rat Ovaries Powder 64 sampleTypeCode -Rat Aorta Powder 65 sampleTypeCode -Rat Lung Powder 66 sampleTypeCode -Rat Small Intestine Powder 67 sampleTypeCode -Rat Liver Powder 68 sampleTypeCode -Rat Brown Adipose Powder 69 sampleTypeCode -Rat White Adipose Powder 70 sampleTypeCode -phase1a 1 Protocol -phase1b 2 Protocol -pilot 3 Protocol -Joslin 910 siteName -Florida 930 siteName -Iowa 940 siteName -acute 1 intervention -training 2 intervention -control 3 intervention -Fischer 344 1 species -Fischer 344 X Brown Norway 2 species -6 months 1 ageGroup -18 months 2 ageGroup \ No newline at end of file diff --git a/src/pass_extract_atac_from_gcp.sh b/src/pass_extract_atac_from_gcp.sh deleted file mode 100644 index a62b9a4..0000000 --- a/src/pass_extract_atac_from_gcp.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# Nicole Gay, modified by Archana to work on copying outputs to GCP and locally -# 6 May 2020 -# -# desription: copy necessary outputs from GCP to final destination on GCP or local cluster -# -# Usage: bash pass_extract_atac_from_gcp.sh ${NUM_CORES} ${download_dir_without_trailing_slash} ${gcp_path_to_atac_croo_outputs_without_trailing_slash} ${copy_dest} -# -### Expected numbers of files: -## -## PASS1A: -# signal : 98 * {N_tissues} -# peak : 40 * {N_tissues} -# qc : 1 -# tagalign : 80 * {N_tissues} -## -## PASS1B: -# signal : 62 * {N_tissues} -# peak : 24 * {N_tissues} -# qc : 12 -# tagalign : 52 * {N_tissues} - -set -e -cores=$1 # number of processes to run in parallel -download_dir=$2 # gcp or 
local output directory path without trailing slash -gsurl=$3 # gcp path to croo outputs without trailing slash -copy_dest=$4 # mode of copy use "gcp" if the copy destination is on gcp or "local" - -if [[ "$copy_dest" == "gcp" ]]; then - - # individiual tagalign files - gsutil -m cp -n ${gsurl}/*/*/align/rep?/*tagAlign.gz ${download_dir}/tagalign/ - - # individual signal track (p-value) - gsutil -m cp -n ${gsurl}/*/*/signal/rep?/*pval.signal.bigwig ${download_dir}/signal/ - -elif [[ $copy_dest == "local" ]]; then - cd ${download_dir} - mkdir -p qc peak signal tagalign - - # rep-to-sample map - gsutil cp -n ${gsurl}/rep_to_sample_map.csv . - # merged QC - gsutil cp -n ${gsurl}/*qc* qc - - # individiual tagalign files - gsutil -m cp -n ${gsurl}/*/*/align/rep?/*tagAlign.gz tagalign - - # individual signal track (p-value) - gsutil -m cp -n ${gsurl}/*/*/signal/rep?/*pval.signal.bigwig signal - -#fail if the copy destination doesn't match "gcp" or "local" - -else - echo "Invalid value for \"copy_dest\", must be \"gcp\" or \"local\" only" - exit 1 -fi - -gscopy () { - local dir=$1 - local subdir=$(gsutil ls ${dir}) - local condition=$(basename $(echo $subdir | sed "s|/$||")) - - # merged peak file - gsutil cp -n ${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz peak/${condition}.overlap.optimal_peak.narrowPeak.gz - gsutil cp -n ${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz - - # pooled signal track - if [[ $condition != *"GET-STDRef-Set"* ]]; then - gsutil cp -n ${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig signal/${condition}.pooled.pval.signal.bigwig - fi - - # qc.html - gsutil cp -n ${subdir}qc/qc.html qc/${condition}.qc.html -} - -gscopygcp () { - local dir=$1 - local subdir=$(gsutil ls ${dir}|grep -v 'rep_to_sample_map.csv\|tagalign') - local condition=$(basename $(echo $subdir | sed "s|/$||")) - local outdir=`dirname ${dir}`"/final" - - - # merged peak file - gsutil -m cp -n ${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.gz ${outdir}/peak/${condition}.overlap.optimal_peak.narrowPeak.gz - gsutil -m cp -n ${subdir}peak/overlap_reproducibility/overlap.optimal_peak.narrowPeak.hammock.gz ${outdir}/peak/${condition}.overlap.optimal_peak.narrowPeak.hammock.gz - - # pooled signal track - if [[ $condition != *"GET-STDRef-Set"* ]]; then - gsutil -m cp -n ${subdir}signal/pooled-rep/basename_prefix.pooled.pval.signal.bigwig ${outdir}/signal/${condition}.pooled.pval.signal.bigwig - fi - - # qc.html - gsutil cp -n ${subdir}qc/qc.html ${outdir}/qc/${condition}.qc.html -} - -if [[ "$copy_dest" == "gcp" ]]; then - export -f gscopygcp - parallel --verbose --jobs ${cores} gscopygcp ::: $(gsutil ls ${gsurl} | grep -E "/$"|grep -v "final") - -elif [[ "$copy_dest" == "local" ]]; then - export -f gscopy - parallel --verbose --jobs ${cores} gscopy ::: $(gsutil ls ${gsurl} | grep -E "/$"|grep -v "final") - -fi - diff --git a/src/qc2tsv.sh b/src/qc2tsv.sh index d9e7db9..d09cbdd 100755 --- a/src/qc2tsv.sh +++ b/src/qc2tsv.sh @@ -1,15 +1,21 @@ -#!/bin/sh +#!/bin/bash set -Eeuxo pipefail -trap "echo ERR trap fired!" ERR +trap "echo ERR trap fired!" ERR #This script generates atac qc tsv report and write the output to a gcp bucket. 
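+# Example layout (paths illustrative): the per-workflow QC reports are expected at
+#   ${gcp_path}/<condition>/<workflow_id>/qc/qc.json
+# and the collapsed TSV report is moved back to ${gcp_path}/ when finished.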
#Usage : bash src/qc2tsv.sh gcp_path=$1 outfile_name=$2 -gsutil ls ${gcp_path}/*/*/qc/qc.json >file_list.txt +random_string=$(openssl rand -hex 8) +mkdir "$random_string" +cd "$random_string" || exit + +gsutil ls "$gcp_path"/*/*/qc/qc.json >file_list.txt echo "Done creating file list" -/home/araja7/miniconda3/bin/qc2tsv --file file_list.txt --collapse-header >${outfile_name} -gsutil mv ${outfile_name} ${gcp_path}/final/ -rm -rf file_list.txt +qc2tsv --file file_list.txt --collapse-header >"$outfile_name" +gsutil mv "$outfile_name" "$gcp_path" echo "Done creating atac-seq qc report" + +cd .. +rm -rf "$random_string" diff --git a/src/run_pipeline.sh b/src/run_pipeline.sh new file mode 100644 index 0000000..3641395 --- /dev/null +++ b/src/run_pipeline.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +CONTAINER_NAME=mysql_cromwell +FASTQ_RAW_DIR="gs://my_bucket/atac-seq/batch1_20220710/fastq_raw" +BASE_JSON="batch1_20220710_base.json" +# you need to have a service account with access to the life sciences API on the VM +REMOTE_KEY_FILE=~/.keys/service_account_key.json + +# constants +JSON_DIR=json/ +OUT_DIR=outputs/ + +#### Script #### +if [ ! "$(docker ps -q -f name="${CONTAINER_NAME}")" ]; then + echo "MySQL container does not exist, creating Docker container" + + echo "Creating MySQL database..." + DB_DIR=~/.caper/db + INIT_DB_DIR=~/.caper/init_db + sudo mkdir -p $DB_DIR $INIT_DB_DIR + + INIT_SQL=""" +CREATE USER 'cromwell'@'%' IDENTIFIED BY 'cromwell'; +GRANT ALL PRIVILEGES ON cromwell.* TO 'cromwell'@'%' WITH GRANT OPTION; +""" + echo "${INIT_SQL}" >$INIT_DB_DIR/init_cromwell_user.sql + + docker run -d --rm \ + --name mysql \ + -v "$DB_DIR":/var/lib/mysql \ + -v "$INIT_DB_DIR":/docker-entrypoint-initdb.d \ + -e MYSQL_ROOT_PASSWORD=cromwell \ + -e MYSQL_DATABASE=cromwell \ + -p 3306:3306 mysql:5.7 + + CONTAINER_DB_HOST='127.0.0.1' + CONTAINER_DB_PORT=3306 + is_mysql_alive() { + docker exec -it mysql \ + mysqladmin ping \ + --user=cromwell \ + --password=cromwell \ + --host=$CONTAINER_DB_HOST \ + --port=$CONTAINER_DB_PORT \ + >/dev/null + returned_value=$? + echo ${returned_value} + } + + until [ "$(is_mysql_alive)" -eq 0 ]; do + sleep 5 + echo "Waiting for MySQL container to be ready..." + done +fi + +gcloud auth activate-service-account --key-file="$REMOTE_KEY_FILE" +export GOOGLE_APPLICATION_CREDENTIALS="$REMOTE_KEY_FILE" + +echo "Running caper in a tmux session..." +cd /opt/caper/ || exit 1 +tmux new-session -d -s caper-server 'caper server > caper_server.log 2>&1' +cd - || exit 1 + +# create JSON files for each sample +echo "Creating JSON files..." +mkdir -p "$JSON_DIR" +bash src/make_json_singleton.sh "$BASE_JSON" "$FASTQ_RAW_DIR" "$JSON_DIR" + +echo "Submitting workflows to ENCODE ATAC-seq pipeline..." +mkdir -p "$OUT_DIR" +bash src/submit.sh atac-seq-pipeline/atac.wdl "$JSON_DIR" $OUT_DIR/"$BATCH_PREFIX"_submissions.json +echo "Done!" + +echo "Monitoring workflows..." 
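+# NOTE: $BATCH_PREFIX is assumed to be exported in the calling environment; it is
+# not set earlier in this script.
+# get_execution_status.py is given the submissions file written by submit.sh, which
+# is (roughly) a JSON array of entries like the following (IDs are illustrative):
+#   [ {"label": "SAMPLE1", "workflow_id": "0a1b2c3d-..."}, ... ]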
+python3 src/get_execution_status.py $OUT_DIR/"$BATCH_PREFIX"_submissions.json
diff --git a/src/server_setup.sh b/src/server_setup.sh
new file mode 100644
index 0000000..011a6ab
--- /dev/null
+++ b/src/server_setup.sh
@@ -0,0 +1,239 @@
+#!/bin/bash
+# Server setup script for the ENCODE ATAC-seq pipeline
+###
+# working directory needs to be base directory (not src/)
+# USAGE: bash src/server_setup.sh [GCP_REGION] [GCP_PROJECT] [CROMWELL_OUT_DIR] [CROMWELL_LOC_DIR] [REMOTE_KEY_FILE]
+###
+set -eu
+
+if [ $# -lt 5 ]; then
+  echo "Usage: ./server_setup.sh [GCP_REGION] [GCP_PROJECT] [CROMWELL_OUT_DIR] [CROMWELL_LOC_DIR] [REMOTE_KEY_FILE]"
+  echo
+  echo "Example: bash server_setup.sh us-central1 my-project gs://my-bucket/out gs://my-bucket/loc ~/.keys/service_account_key.json"
+  echo
+  echo "[GCP_REGION]: The GCP region to run the pipeline in, must be compatible with the Google Life Sciences API"
+  echo "[GCP_PROJECT]: The project you are running the pipeline in"
+  echo "[CROMWELL_OUT_DIR]: The GCS bucket directory to store the cromwell output files in (with a prefix - gs://)"
+  echo "[CROMWELL_LOC_DIR]: The GCS bucket directory to store the cromwell localization files in (with a prefix - gs://)"
+  echo "[REMOTE_KEY_FILE]: The path to the service account key file on the remote machine"
+  echo
+  exit 1
+fi
+
+### VARIABLES TO SET ###
+# set this to the region you want to run in (e.g. us-central1), the region should be compatible with life sciences API
+GCP_REGION=$1
+GCP_PROJECT=$2
+CROMWELL_OUT_DIR=$3
+CROMWELL_LOC_DIR=$4
+# you need to have a service account with access to the life sciences API on the VM
+REMOTE_KEY_FILE=$5
+###
+
+#### Script ####
+echo "Installing dependencies..."
+sudo apt-get update
+sudo apt-get install -y curl wget jq parallel git acl tmux make build-essential \
+  libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev llvm libncursesw5-dev \
+  xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
+  autoconf automake gcc perl libcurl4-gnutls-dev libncurses5-dev software-properties-common dirmngr
+
+sudo apt-add-repository -y ppa:fish-shell/release-3
+sudo add-apt-repository -y ppa:neovim-ppa/stable
+sudo apt update
+sudo apt install -y fish neovim
+
+if ! command -v pyenv &>/dev/null; then
+  curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash
+
+  echo """
+export PYENV_ROOT=\"\$HOME/.pyenv\"
+command -v pyenv >/dev/null || export PATH=\"\$PYENV_ROOT/bin:\$PATH\"
+eval \"\$(pyenv init --path)\"
+eval \"\$(pyenv init -)\"
+eval \"\$(pyenv virtualenv-init -)\"
+""" >>~/.bashrc
+
+  echo """
+export PYENV_ROOT=\"\$HOME/.pyenv\"
+command -v pyenv >/dev/null || export PATH=\"\$PYENV_ROOT/bin:\$PATH\"
+eval \"\$(pyenv init --path)\"
+eval \"\$(pyenv init -)\"
+eval \"\$(pyenv virtualenv-init -)\"
+""" >>~/.profile
+
+  export PYENV_ROOT="$HOME/.pyenv"
+  command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"
+  eval "$(pyenv init --path)"
+  eval "$(pyenv init -)"
+  eval "$(pyenv virtualenv-init -)"
+
+  fish -c "set -Ux PYENV_ROOT \$HOME/.pyenv"
+  fish -c "fish_add_path \$PYENV_ROOT/bin"
+  echo "pyenv init - | source" >> ~/.config/fish/config.fish
+
+  pyenv update
+fi
+
+ver=$(python3 -c "import sys;t='{v[0]}{v[1]}'.format(v=list(sys.version_info[:2]));sys.stdout.write(t)")
+
+if [[ "$ver" != "311" ]]; then
+  pyenv install 3.11.2
+fi
+
+pyenv global 3.11.2
+
+if ! command -v java &>/dev/null; then
+  sudo apt install -y gnupg ca-certificates curl
+  curl -s https://repos.azul.com/azul-repo.key | sudo gpg --dearmor -o /usr/share/keyrings/azul.gpg
+  echo "deb [signed-by=/usr/share/keyrings/azul.gpg] https://repos.azul.com/zulu/deb stable main" | sudo tee /etc/apt/sources.list.d/zulu.list
+  sudo apt update
+  sudo apt install -y zulu17-jdk
+fi
+
+if ! command -v Rscript &>/dev/null; then
+  # update indices
+  sudo apt update -qq
+  # install two helper packages we need
+  # add the signing key (by Michael Rutter) for these repos
+  # To verify key, run gpg --show-keys /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
+  # Fingerprint: E298A3A825C0D65DFD57CBB651716619E084DAB9
+  wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
+  # add the R 4.0 repo from CRAN -- adjust 'focal' to 'groovy' or 'bionic' as needed
+  sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
+  sudo apt install -y r-base
+  sudo chmod 777 /usr/local/lib/R/site-library
+fi
+
+echo "Installing Python requirements..."
+python3 -m pip install --upgrade pip setuptools wheel
+python3 -m pip install -r requirements.txt
+python3 -m pip install qc2tsv croo
+
+echo "Installing R requirements..."
+Rscript -e "install.packages(c('data.table', 'optparse'), repos = 'http://cran.us.r-project.org')"
+
+if ! command -v docker &>/dev/null; then
+  echo "Installing Docker"
+  curl -fsSL https://get.docker.com -o get-docker.sh
+  sudo sh get-docker.sh
+  sudo getent group docker || sudo groupadd docker
+  sudo usermod -aG docker "$USER"
+  newgrp docker
+fi
+
+if [ ! -d "atac-seq-pipeline" ]; then
+  echo "Cloning ENCODE ATAC-seq pipeline..."
+  git clone --single-branch --branch v1.7.0 https://github.com/ENCODE-DCC/atac-seq-pipeline.git
+  git apply patches/fix__add_missing_runtime_attributes_to_atac-seq_v1_7_0.patch
+fi
+
+echo "Configuring caper..."
+CAPER_CONF_DIR=/opt/caper
+sudo mkdir -p $CAPER_CONF_DIR $CAPER_CONF_DIR/local_out_dir $CAPER_CONF_DIR/local_loc_dir
+
+# set the permissions on the folder
+sudo chown -R "$USER" $CAPER_CONF_DIR
+sudo chmod 777 -R $CAPER_CONF_DIR
+sudo setfacl -R -d -m u::rwX $CAPER_CONF_DIR
+sudo setfacl -R -d -m g::rwX $CAPER_CONF_DIR
+sudo setfacl -R -d -m o::rwX $CAPER_CONF_DIR
+
+# create the config file
+cat <<EOF >"/opt/caper/default.conf"
+# caper
+backend=gcp
+no-server-heartbeat=True
+# cromwell
+max-concurrent-workflows=300
+max-concurrent-tasks=1000
+# local backend
+local-out-dir=$CAPER_CONF_DIR/local_out_dir
+local-loc-dir=$CAPER_CONF_DIR/local_loc_dir
+# gcp backend
+gcp-prj=$GCP_PROJECT
+gcp-region=$GCP_REGION
+gcp-out-dir=$CROMWELL_OUT_DIR
+gcp-loc-dir=$CROMWELL_LOC_DIR
+gcp-service-account-key-json=$REMOTE_KEY_FILE
+use-google-cloud-life-sciences=True
+# metadata DB
+db=mysql
+mysql-db-ip=localhost
+mysql-db-port=3306
+mysql-db-user=cromwell
+mysql-db-password=cromwell
+mysql-db-name=cromwell
+EOF
+sudo chmod +r "/opt/caper/default.conf"
+
+echo "Creating MySQL database..."
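+# The container below backs the Cromwell metadata DB referenced in
+# /opt/caper/default.conf (database, user, and password are all "cromwell",
+# exposed on port 3306). If a container named "mysql" is already running from a
+# previous setup, remove it first, e.g.:
+#   docker rm -f mysql   # illustrative; only if the old container is disposable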
+DB_DIR=$HOME/.caper/db
+INIT_DB_DIR=$HOME/.caper/init_db
+mkdir -p "$DB_DIR" "$INIT_DB_DIR"
+sudo chown -R "$USER" "$DB_DIR" "$INIT_DB_DIR"
+sudo chmod 777 -R "$DB_DIR" "$INIT_DB_DIR"
+
+INIT_SQL="""
+CREATE USER 'cromwell'@'%' IDENTIFIED BY 'cromwell';
+GRANT ALL PRIVILEGES ON cromwell.* TO 'cromwell'@'%' WITH GRANT OPTION;
+"""
+echo "${INIT_SQL}" >"$INIT_DB_DIR"/init_cromwell_user.sql
+
+docker run -d --rm \
+  --name mysql \
+  -v "$DB_DIR":/var/lib/mysql \
+  -v "$INIT_DB_DIR":/docker-entrypoint-initdb.d \
+  -e MYSQL_ROOT_PASSWORD=cromwell \
+  -e MYSQL_DATABASE=cromwell \
+  -p 3306:3306 mysql:5.7
+
+CONTAINER_DB_HOST='127.0.0.1'
+CONTAINER_DB_PORT=3306
+is_mysql_alive() {
+  docker exec -it mysql \
+    mysqladmin ping \
+    --user=cromwell \
+    --password=cromwell \
+    --host=$CONTAINER_DB_HOST \
+    --port=$CONTAINER_DB_PORT \
+    >/dev/null
+  returned_value=$?
+  echo ${returned_value}
+}
+
+until [ "$(is_mysql_alive)" -eq 0 ]; do
+  sleep 5
+  echo "Waiting for MySQL container to be ready..."
+done
+
+gcloud auth activate-service-account --key-file="$REMOTE_KEY_FILE"
+export GOOGLE_APPLICATION_CREDENTIALS="$REMOTE_KEY_FILE"
+
+echo "Downloading and setting up gcsfuse..."
+wget -q https://github.com/GoogleCloudPlatform/gcsfuse/releases/download/v0.41.7/gcsfuse_0.41.7_amd64.deb
+sudo dpkg -i gcsfuse_0.41.7_amd64.deb
+
+echo "Downloading and setting up bedtools and samtools..."
+wget -q https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary -O bedtools
+sudo mv bedtools /usr/local/bin
+sudo chmod a+x /usr/local/bin/bedtools
+
+wget -q https://github.com/samtools/samtools/releases/download/1.16.1/samtools-1.16.1.tar.bz2 -O samtools.tar.bz2
+
+tar -xjf samtools.tar.bz2
+cd samtools-1.16.1
+./configure
+make
+sudo make install
+cd ..
+rm -rf samtools-1.16.1 samtools.tar.bz2
+
+echo "Downloading and setting up bigWigToBedGraph..."
+wget -q https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph -O bigWigToBedGraph
+sudo mv bigWigToBedGraph /usr/local/bin
+sudo chmod a+x /usr/local/bin/bigWigToBedGraph
+
+echo "Finished"
+
+curl https://raw.githubusercontent.com/oh-my-fish/oh-my-fish/master/bin/install | fish
diff --git a/src/submit.sh b/src/submit.sh
new file mode 100644
index 0000000..0080187
--- /dev/null
+++ b/src/submit.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# -lt 4 ]; then
+  echo
+  echo "Usage: ./submit.sh [WDL_FILE] [JSON_DIR] [WF_ID_MAP] [NUM_CORES]"
+  echo
+  echo "Example: ./submit.sh atac.wdl json/ wfids.json 4"
+  echo "[WDL_FILE]: the WDL file to use as the workflow"
+  echo "[JSON_DIR]: the directory containing JSON files to use as the WDL inputs"
+  echo "[WF_ID_MAP]: a JSON file to which an array of {label, workflow ID} entries for the submitted jobs is written"
+  echo "[NUM_CORES]: The number of parallel submits to do at a time"
+  echo
+  exit 1
+fi
+
+WDL_FILE=$1
+JSON_DIR=${2/%\//}
+WF_ID_MAP=$3
+NUM_CORES=$4
+
+if ! [ -f "$WDL_FILE" ]; then
+  echo "$WDL_FILE does not exist" >&2
+  exit 1
+fi
+
+if !
python -c 'import caper' &>/dev/null; then + echo "caper not installed" >&2 + exit 1 +fi + +echo "[" >"$WF_ID_MAP" + +function submit_file() { + local input_json_file=$1 + echo + echo "Submitting $input_json_file" + local submission_output + local workflow_id + local json_str + + submission_output=$(python3 -m caper submit -i "$input_json_file" "$WDL_FILE" 2>&1) + parsed_output=$(echo "$submission_output" | tail -n1 | sed -E 's/(.*)(\{[^}]*\})/\2/g' | sed -E 's/'\''/\"/g') + echo "$parsed_output" + workflow_id=$(echo "$parsed_output" | jq -r '.id') + json_str="{\"label\": \"$(basename "$input_json_file" .json)\", \"workflow_id\": \"$workflow_id\"}," + echo "$json_str" >>"$WF_ID_MAP" + + echo "Submitted $input_json_file" +} + +export -f submit_file +export WDL_FILE +export WF_ID_MAP +echo "Submitting files in $JSON_DIR" +parallel --bar --progress --verbose --jobs "$NUM_CORES" submit_file ::: "$JSON_DIR"/*.json + +echo "]" >>"$WF_ID_MAP" + +echo "Submitted all jobs"
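For reference, the array that `submit.sh` writes to `[WF_ID_MAP]` is built by appending one `{"label": ..., "workflow_id": ...},` entry per sample between `[` and `]`, so the file ends up with a trailing comma that strict JSON parsers reject. The following is a minimal sketch, not part of this patch, of loading that file into a label-to-workflow-ID map; the helper name, the tolerant comma handling, and the example file path are assumptions.

```python
#!/usr/bin/env python3
"""Sketch: read the submissions file produced by src/submit.sh (hypothetical helper)."""
import json
import re
import sys


def load_submissions(path: str) -> dict:
    """Return a {label: workflow_id} dict from a submit.sh WF_ID_MAP file."""
    with open(path, encoding="utf-8") as fh:
        text = fh.read()
    # submit.sh leaves a trailing comma before the closing bracket; drop it so
    # the content parses as strict JSON
    text = re.sub(r",\s*\]", "]", text)
    entries = json.loads(text)
    return {entry["label"]: entry["workflow_id"] for entry in entries}


if __name__ == "__main__":
    # example (path is illustrative): python3 load_submissions.py outputs/batch_submissions.json
    mapping = load_submissions(sys.argv[1])
    for label, workflow_id in mapping.items():
        print(f"{label}\t{workflow_id}")
```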