Commit c09a1a7

Merge pull request #90 from ENCODE-DCC/dev
dev-v1.3.0 to master (v1.3.0)
2 parents: 1f54467 + d6edc87

330 files changed: +11070 −6352 lines


.circleci/config.yml

Lines changed: 80 additions & 6 deletions
@@ -51,11 +51,11 @@ jobs:
           name: build image
           command: |
             source ${BASH_ENV}
-            export DOCKER_CACHE_TAG=test-v1.2.1
+            export DOCKER_CACHE_TAG=new-test-v1.3.0
             echo "pulling ${DOCKER_CACHE_TAG}!"
             docker pull quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG}
             docker login -u=${QUAY_ROBOT_USER} -p=${QUAY_ROBOT_USER_TOKEN} quay.io
-            docker build --cache-from quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG} --build-arg GIT_COMMIT_HASH=${CIRCLE_SHA1} --build-arg BRANCH=${CIRCLE_BRANCH} --build-arg BUILD_TAG=${TAG} -t $TAG -f docker_image/Dockerfile .
+            docker build --cache-from quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG} --build-arg GIT_COMMIT_HASH=${CIRCLE_SHA1} --build-arg BRANCH=${CIRCLE_BRANCH} --build-arg BUILD_TAG=${TAG} -t $TAG -f dev/docker_image/Dockerfile .
             docker push ${TAG}
             # docker push quay.io/encode-dcc/chip-seq-pipeline:template
             docker logout
@@ -68,9 +68,10 @@ jobs:
           no_output_timeout: 300m
           command: |
             source ${BASH_ENV}
-            cd test/test_task/
+            cd dev/test/test_task/
             rm -rf chip-seq-pipeline-test-data
-            git clone https://github.com/ENCODE-DCC/chip-seq-pipeline-test-data
+            export BOTO_CONFIG=/dev/null
+            gsutil -m cp -r gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/chip-seq-pipeline-test-data .
             for wdl in test_*.wdl
             do
               json=${wdl%.*}.json
@@ -90,11 +91,26 @@ jobs:
           command: |
             source ${BASH_ENV}
             gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
-            cd test/test_workflow/
+            cd dev/test/test_workflow/
             echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
             ./test_chip.sh ENCSR000DYI_subsampled_chr19_only.json tmp_key.json ${TAG}
             python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only.metadata.json

+  test_workflow_unrep_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_unrep.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_unrep.metadata.json
+
   test_workflow_pe:
     <<: *machine_defaults
     steps:
@@ -105,11 +121,57 @@ jobs:
           command: |
             source ${BASH_ENV}
             gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
-            cd test/test_workflow/
+            cd dev/test/test_workflow/
             echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
             ./test_chip.sh ENCSR936XTK_subsampled_chr19_only.json tmp_key.json ${TAG}
             python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR936XTK_subsampled_chr19_only.metadata.json

+  test_workflow_hist_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_hist.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_hist.metadata.json
+
+  test_workflow_hist_unrep_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_hist_unrep.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_hist_unrep.metadata.json
+
+  test_workflow_hist_pe:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR936XTK_subsampled_chr19_only_hist.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR936XTK_subsampled_chr19_only_hist.metadata.json
+
+
 # Define workflow here
 workflows:
   version: 2
@@ -122,6 +184,18 @@ workflows:
       - test_workflow_se:
          requires:
            - build
+      - test_workflow_unrep_se:
+         requires:
+           - build
      - test_workflow_pe:
         requires:
           - build
+      - test_workflow_hist_se:
+         requires:
+           - build
+      - test_workflow_hist_unrep_se:
+         requires:
+           - build
+      - test_workflow_hist_pe:
+         requires:
+           - build
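Each of the test_workflow_* jobs above ends with the same inline Python check against Cromwell's metadata JSON for the test sample. As a reading aid only (the CI config uses the one-liner form shown in the diff), here is an expanded, commented equivalent of that check:

```python
# Expanded equivalent of the one-line QC check used by the test_workflow_* jobs.
# Reads Cromwell's metadata JSON from stdin and exits non-zero (failing the CI
# step) unless the workflow output `chip.qc_json_ref_match` is true, i.e. the
# pipeline's qc.json matches the reference qc.json for the test sample.
import json
import sys

metadata = json.loads(sys.stdin.read())
qc_matches_reference = bool(metadata["outputs"]["chip.qc_json_ref_match"])
sys.exit(0 if qc_matches_reference else 1)
```

In the jobs above this logic is fed the metadata file produced by test_chip.sh, e.g. redirected from ENCSR000DYI_subsampled_chr19_only.metadata.json.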

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -119,3 +119,8 @@ hg38
 
 metadata.json
 resume.*.json
+
+tmp_db*
+*.local.json
+temp_db*
+
README.md

Lines changed: 32 additions & 58 deletions
@@ -7,90 +7,64 @@ This ChIP-Seq pipeline is based off the ENCODE (phase-3) transcription factor an
 
 ### Features
 
-* **Portability**: Support for many cloud platforms (Google/DNAnexus) and cluster engines (SLURM/SGE/PBS).
-* **User-friendly HTML report**: tabulated quality metrics including alignment/peak statistics and FRiP along with many useful plots (IDR/cross-correlation measures).
-  - Examples: [HTML](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.html), [JSON](docs/example_output/v1.1.5/qc.json)
-* **Genomes**: Pre-built database for GRCh38, hg19, mm10, mm9 and additional support for custom genomes.
+* **Portability**: The pipeline run can be performed across different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS.
+* **User-friendly HTML report**: In addition to the standard outputs, the pipeline generates an HTML report that consists of a tabular representation of quality metrics including alignment/peak statistics and FRiP along with many useful plots (IDR/cross-correlation measures). An example of the [HTML report](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.html). The [json file](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.json) used in generating this report.
+* **Supported genomes**: Pipeline needs genome specific data such as aligner indices, chromosome sizes file and blacklist. We provide a genome database downloader/builder for hg38, hg19, mm10, mm9. You can also use this [builder](docs/build_genome_database.md) to build genome database from FASTA for your custom genome.
 
 ## Installation
 
-1) Install [Caper](https://github.com/ENCODE-DCC/caper#installation). Caper is a python wrapper for [Cromwell](https://github.com/broadinstitute/cromwell). Make sure that you have python3(> 3.4.1) installed on your system.
+1) [Install Caper](https://github.com/ENCODE-DCC/caper#installation). Caper is a python wrapper for [Cromwell](https://github.com/broadinstitute/cromwell).
 
-```bash
-$ pip install caper
-```
+> **IMPORTANT**: Make sure that you have python3(> 3.4.1) installed on your system.
 
-2) Read through [Caper's README](https://github.com/ENCODE-DCC/caper) carefully.
+```bash
+$ pip install caper  # use pip3 if it doesn't work
+```
 
-3) Run a pipeline with Caper.
+2) Follow [Caper's README](https://github.com/ENCODE-DCC/caper) carefully. Find an instruction for your platform.
+> **IMPORTANT**: Configure your Caper configuration file `~/.caper/default.conf` correctly for your platform.
 
-## Running pipelines without Caper
+3) Git clone this pipeline.
+> **IMPORTANT*: use `~/chip-seq-pipeline2/chip.wdl` as `[WDL]` in Caper's documentation.
 
-Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md).
+```bash
+$ cd
+$ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2
+```
 
-## DNAnexus
+4) Install pipeline's [Conda environment](docs/install_conda.md) if you want to use Conda instead of Docker/Singularity. Conda is recommneded on local computer and HPCs (e.g. Stanford Sherlock/SCG). Use
+> **IMPORTANT*: use `encode-chip-seq-pipeline` as `[PIPELINE_CONDA_ENV]` in Caper's documentation.
 
-You can also run our pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL.
+## Test input JSON file
 
-1) [dxWDL CLI](docs/tutorial_dx_cli.md)
-2) [DNAnexus Web UI](docs/tutorial_dx_web.md)
-
-## Conda
-
-We no longer recommend Conda for resolving dependencies and plan to phase out Conda support. Instead we recommend using Docker or Singularity. You can install Singularity and use it for our pipeline with Caper (by adding `--singularity` to command line arguments). Please see [this instruction](docs/install_conda.md).
+Use `https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI_subsampled_chr19_only_caper.json` as `[INPUT_JSON]` in Caper's documentation.
 
-## Tutorial
+## Input JSON file
 
-Make sure that you have configured Caper correctly.
-> **WARNING**: Do not run Caper on HPC login nodes. Your jobs can be killed.
+An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
 
-All files (HTTP URLs) in `examples/caper/ENCSR936XTK_subsampled_chr19_only.json` will be recursively copied into Caper's temporary folder (`--tmp-dir`).
-```bash
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --singularity
-```
+[Input JSON file specification](docs/input.md)
 
-If you use Docker then replace `--singularity` with `--docker`.
-```bash
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --docker
-```
+## Running a pipeline without Caper
 
-If you use Conda then remove `--singularity` from the command line and activate pipeline's Conda env before running a pipeline.
-```bash
-$ # source activate encode-chip-seq-pipeline # for Conda < 4.6
-$ conda activate encode-chip-seq-pipeline # for Conda >= 4.6
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json
-```
+> **WARNING**: This method has been deprecated. There are many unfixed known bugs. We no longer support it.
 
-To run it on an HPC (e.g. Stanford Sherlock and SCG). See details at [Caper's README](https://github.com/ENCODE-DCC/caper/blob/master/README.md#how-to-run-it-on-slurm-cluster).
+Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md).
 
-## Input JSON file
+## Running a pipeline on DNAnexus
 
-An input JSON file includes all genomic data files, input parameters and metadata for running pipelines. Always use absolute paths in an input JSON.
+You can also run this pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL.
 
-[Input JSON file specification](docs/input.md)
+1) [dxWDL CLI](docs/tutorial_dx_cli.md)
+2) [DNAnexus Web UI](docs/tutorial_dx_web.md)
 
 ## How to organize outputs
 
-Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have python3(> 3.4.1) installed on your system.
+Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have python3(> 3.4.1) installed on your system. Find a `metadata.json` on Caper's output directory.
 
 ```bash
 $ pip install croo
-```
-
-Find a `metadata.json` on Caper's output directory.
-
-```bash
 $ croo [METADATA_JSON_FILE]
 ```
 
-## Useful tools
-
-There are some useful tools to post-process outputs of the pipeline.
-
-### qc_jsons_to_tsv
-
-[This tool](utils/qc_jsons_to_tsv/README.md) recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments.
-
-### ENCODE downloader
-
-[This tool](https://github.com/kundajelab/ENCODE_downloader) downloads any type (FASTQ, BAM, PEAK, ...) of data from the ENCODE portal. It also generates a metadata JSON file per experiment which will be very useful to make an input JSON file for the pipeline.
+There is another [useful tool](utils/qc_jsons_to_tsv/README.md) to make a spreadsheet of QC metrics from multiple workflows. This tool recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments.
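The qc_jsons_to_tsv paragraph retained at the end of the README diff above describes a recursive find-parse-tabulate workflow. The following is a minimal illustrative sketch of that idea only, not the actual utility; the function names and command-line handling are invented for illustration, and the real tool's behavior (including its criteria-based quality estimate) is documented in utils/qc_jsons_to_tsv/README.md.

```python
# Illustrative sketch: walk a root directory, parse every qc.json found,
# flatten its nested metrics, and print one TSV row per qc.json.
import csv
import json
import sys
from pathlib import Path


def flatten(obj, prefix=""):
    """Flatten nested qc.json dicts into dotted-key/value pairs."""
    items = {}
    for key, value in obj.items():
        name = f"{prefix}{key}"
        if isinstance(value, dict):
            items.update(flatten(value, f"{name}."))
        else:
            items[name] = value
    return items


def main(root_dir):
    rows = [dict(flatten(json.loads(p.read_text())), path=str(p))
            for p in Path(root_dir).rglob("qc.json")]
    if not rows:
        return
    # Union of all metric names across workflows, so every row shares one header.
    columns = sorted({key for row in rows for key in row})
    writer = csv.DictWriter(sys.stdout, fieldnames=columns, delimiter="\t")
    writer.writeheader()
    writer.writerows(rows)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else ".")
```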
