Commit c09a1a7

Merge pull request #90 from ENCODE-DCC/dev
dev-v1.3.0 to master (v1.3.0)
2 parents: 1f54467 + d6edc87

330 files changed: +11070 −6352 lines


.circleci/config.yml

Lines changed: 80 additions & 6 deletions
@@ -51,11 +51,11 @@ jobs:
           name: build image
           command: |
             source ${BASH_ENV}
-            export DOCKER_CACHE_TAG=test-v1.2.1
+            export DOCKER_CACHE_TAG=new-test-v1.3.0
             echo "pulling ${DOCKER_CACHE_TAG}!"
             docker pull quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG}
             docker login -u=${QUAY_ROBOT_USER} -p=${QUAY_ROBOT_USER_TOKEN} quay.io
-            docker build --cache-from quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG} --build-arg GIT_COMMIT_HASH=${CIRCLE_SHA1} --build-arg BRANCH=${CIRCLE_BRANCH} --build-arg BUILD_TAG=${TAG} -t $TAG -f docker_image/Dockerfile .
+            docker build --cache-from quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG} --build-arg GIT_COMMIT_HASH=${CIRCLE_SHA1} --build-arg BRANCH=${CIRCLE_BRANCH} --build-arg BUILD_TAG=${TAG} -t $TAG -f dev/docker_image/Dockerfile .
             docker push ${TAG}
             # docker push quay.io/encode-dcc/chip-seq-pipeline:template
             docker logout
@@ -68,9 +68,10 @@ jobs:
           no_output_timeout: 300m
           command: |
             source ${BASH_ENV}
-            cd test/test_task/
+            cd dev/test/test_task/
             rm -rf chip-seq-pipeline-test-data
-            git clone https://github.com/ENCODE-DCC/chip-seq-pipeline-test-data
+            export BOTO_CONFIG=/dev/null
+            gsutil -m cp -r gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/chip-seq-pipeline-test-data .
             for wdl in test_*.wdl
             do
               json=${wdl%.*}.json
@@ -90,11 +91,26 @@ jobs:
           command: |
             source ${BASH_ENV}
             gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
-            cd test/test_workflow/
+            cd dev/test/test_workflow/
             echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
             ./test_chip.sh ENCSR000DYI_subsampled_chr19_only.json tmp_key.json ${TAG}
             python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only.metadata.json

+  test_workflow_unrep_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_unrep.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_unrep.metadata.json
+
   test_workflow_pe:
     <<: *machine_defaults
     steps:
@@ -105,11 +121,57 @@ jobs:
           command: |
             source ${BASH_ENV}
             gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
-            cd test/test_workflow/
+            cd dev/test/test_workflow/
             echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
             ./test_chip.sh ENCSR936XTK_subsampled_chr19_only.json tmp_key.json ${TAG}
             python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR936XTK_subsampled_chr19_only.metadata.json

+  test_workflow_hist_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_hist.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_hist.metadata.json
+
+  test_workflow_hist_unrep_se:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR000DYI_subsampled_chr19_only_hist_unrep.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR000DYI_subsampled_chr19_only_hist_unrep.metadata.json
+
+  test_workflow_hist_pe:
+    <<: *machine_defaults
+    steps:
+      - checkout
+      - run: *make_tag
+      - run:
+          no_output_timeout: 300m
+          command: |
+            source ${BASH_ENV}
+            gcloud --quiet config set project ${GOOGLE_PROJECT_ID}
+            cd dev/test/test_workflow/
+            echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json
+            ./test_chip.sh ENCSR936XTK_subsampled_chr19_only_hist.json tmp_key.json ${TAG}
+            python -c "import sys; import json; data=json.loads(sys.stdin.read()); sys.exit(int(not data[u'outputs'][u'chip.qc_json_ref_match']))" < ENCSR936XTK_subsampled_chr19_only_hist.metadata.json
+
+
 # Define workflow here
 workflows:
   version: 2
@@ -122,6 +184,18 @@ workflows:
       - test_workflow_se:
          requires:
            - build
+      - test_workflow_unrep_se:
+         requires:
+           - build
      - test_workflow_pe:
         requires:
           - build
+      - test_workflow_hist_se:
+         requires:
+           - build
+      - test_workflow_hist_unrep_se:
+         requires:
+           - build
+      - test_workflow_hist_pe:
+         requires:
+           - build
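Each of the test_workflow_* jobs above ends with the same inline Python check against Cromwell's metadata JSON for the test sample. As a reading aid only (the CI config uses the one-liner form shown in the diff), here is an expanded, commented equivalent of that check:

```python
# Expanded equivalent of the one-line QC check used by the test_workflow_* jobs.
# Reads Cromwell's metadata JSON from stdin and exits non-zero (failing the CI
# step) unless the workflow output `chip.qc_json_ref_match` is true, i.e. the
# pipeline's qc.json matches the reference qc.json for the test sample.
import json
import sys

metadata = json.loads(sys.stdin.read())
qc_matches_reference = bool(metadata["outputs"]["chip.qc_json_ref_match"])
sys.exit(0 if qc_matches_reference else 1)
```

In the jobs above this logic is fed the metadata file produced by test_chip.sh, e.g. redirected from ENCSR000DYI_subsampled_chr19_only.metadata.json.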

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -119,3 +119,8 @@ hg38
 
 metadata.json
 resume.*.json
+
+tmp_db*
+*.local.json
+temp_db*
+
README.md

Lines changed: 32 additions & 58 deletions
@@ -7,90 +7,64 @@ This ChIP-Seq pipeline is based off the ENCODE (phase-3) transcription factor an
 
 ### Features
 
-* **Portability**: Support for many cloud platforms (Google/DNAnexus) and cluster engines (SLURM/SGE/PBS).
-* **User-friendly HTML report**: tabulated quality metrics including alignment/peak statistics and FRiP along with many useful plots (IDR/cross-correlation measures).
-  - Examples: [HTML](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.html), [JSON](docs/example_output/v1.1.5/qc.json)
-* **Genomes**: Pre-built database for GRCh38, hg19, mm10, mm9 and additional support for custom genomes.
+* **Portability**: The pipeline run can be performed across different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS.
+* **User-friendly HTML report**: In addition to the standard outputs, the pipeline generates an HTML report that consists of a tabular representation of quality metrics including alignment/peak statistics and FRiP along with many useful plots (IDR/cross-correlation measures). An example of the [HTML report](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.html). The [json file](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.json) used in generating this report.
+* **Supported genomes**: Pipeline needs genome specific data such as aligner indices, chromosome sizes file and blacklist. We provide a genome database downloader/builder for hg38, hg19, mm10, mm9. You can also use this [builder](docs/build_genome_database.md) to build genome database from FASTA for your custom genome.
 
 ## Installation
 
-1) Install [Caper](https://github.com/ENCODE-DCC/caper#installation). Caper is a python wrapper for [Cromwell](https://github.com/broadinstitute/cromwell). Make sure that you have python3(> 3.4.1) installed on your system.
+1) [Install Caper](https://github.com/ENCODE-DCC/caper#installation). Caper is a python wrapper for [Cromwell](https://github.com/broadinstitute/cromwell).
 
-```bash
-$ pip install caper
-```
+> **IMPORTANT**: Make sure that you have python3(> 3.4.1) installed on your system.
 
-2) Read through [Caper's README](https://github.com/ENCODE-DCC/caper) carefully.
+```bash
+$ pip install caper  # use pip3 if it doesn't work
+```
 
-3) Run a pipeline with Caper.
+2) Follow [Caper's README](https://github.com/ENCODE-DCC/caper) carefully. Find an instruction for your platform.
+> **IMPORTANT**: Configure your Caper configuration file `~/.caper/default.conf` correctly for your platform.
 
-## Running pipelines without Caper
+3) Git clone this pipeline.
+> **IMPORTANT*: use `~/chip-seq-pipeline2/chip.wdl` as `[WDL]` in Caper's documentation.
 
-Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md).
+```bash
+$ cd
+$ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2
+```
 
-## DNAnexus
+4) Install pipeline's [Conda environment](docs/install_conda.md) if you want to use Conda instead of Docker/Singularity. Conda is recommneded on local computer and HPCs (e.g. Stanford Sherlock/SCG). Use
+> **IMPORTANT*: use `encode-chip-seq-pipeline` as `[PIPELINE_CONDA_ENV]` in Caper's documentation.
 
-You can also run our pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL.
+## Test input JSON file
 
-1) [dxWDL CLI](docs/tutorial_dx_cli.md)
-2) [DNAnexus Web UI](docs/tutorial_dx_web.md)
-
-## Conda
-
-We no longer recommend Conda for resolving dependencies and plan to phase out Conda support. Instead we recommend using Docker or Singularity. You can install Singularity and use it for our pipeline with Caper (by adding `--singularity` to command line arguments). Please see [this instruction](docs/install_conda.md).
+Use `https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI_subsampled_chr19_only_caper.json` as `[INPUT_JSON]` in Caper's documentation.
 
-## Tutorial
+## Input JSON file
 
-Make sure that you have configured Caper correctly.
-> **WARNING**: Do not run Caper on HPC login nodes. Your jobs can be killed.
+An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
 
-All files (HTTP URLs) in `examples/caper/ENCSR936XTK_subsampled_chr19_only.json` will be recursively copied into Caper's temporary folder (`--tmp-dir`).
-```bash
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --singularity
-```
+[Input JSON file specification](docs/input.md)
 
-If you use Docker then replace `--singularity` with `--docker`.
-```bash
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --docker
-```
+## Running a pipeline without Caper
 
-If you use Conda then remove `--singularity` from the command line and activate pipeline's Conda env before running a pipeline.
-```bash
-$ # source activate encode-chip-seq-pipeline # for Conda < 4.6
-$ conda activate encode-chip-seq-pipeline # for Conda >= 4.6
-$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json
-```
+> **WARNING**: This method has been deprecated. There are many unfixed known bugs. We no longer support it.
 
-To run it on an HPC (e.g. Stanford Sherlock and SCG). See details at [Caper's README](https://github.com/ENCODE-DCC/caper/blob/master/README.md#how-to-run-it-on-slurm-cluster).
+Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md).
 
-## Input JSON file
+## Running a pipeline on DNAnexus
 
-An input JSON file includes all genomic data files, input parameters and metadata for running pipelines. Always use absolute paths in an input JSON.
+You can also run this pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL.
 
-[Input JSON file specification](docs/input.md)
+1) [dxWDL CLI](docs/tutorial_dx_cli.md)
+2) [DNAnexus Web UI](docs/tutorial_dx_web.md)
 
 ## How to organize outputs
 
-Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have python3(> 3.4.1) installed on your system.
+Install [Croo](https://github.com/ENCODE-DCC/croo#installation). Make sure that you have python3(> 3.4.1) installed on your system. Find a `metadata.json` on Caper's output directory.
 
 ```bash
 $ pip install croo
-```
-
-Find a `metadata.json` on Caper's output directory.
-
-```bash
 $ croo [METADATA_JSON_FILE]
 ```
 
-## Useful tools
-
-There are some useful tools to post-process outputs of the pipeline.
-
-### qc_jsons_to_tsv
-
-[This tool](utils/qc_jsons_to_tsv/README.md) recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments.
-
-### ENCODE downloader
-
-[This tool](https://github.com/kundajelab/ENCODE_downloader) downloads any type (FASTQ, BAM, PEAK, ...) of data from the ENCODE portal. It also generates a metadata JSON file per experiment which will be very useful to make an input JSON file for the pipeline.
+There is another [useful tool](utils/qc_jsons_to_tsv/README.md) to make a spreadsheet of QC metrics from multiple workflows. This tool recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments.
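The qc_jsons_to_tsv paragraph retained at the end of the README diff above describes a recursive find-parse-tabulate workflow. The following is a minimal illustrative sketch of that idea only, not the actual utility; the function names and command-line handling are invented for illustration, and the real tool's behavior (including its criteria-based quality estimate) is documented in utils/qc_jsons_to_tsv/README.md.

```python
# Illustrative sketch: walk a root directory, parse every qc.json found,
# flatten its nested metrics, and print one TSV row per qc.json.
import csv
import json
import sys
from pathlib import Path


def flatten(obj, prefix=""):
    """Flatten nested qc.json dicts into dotted-key/value pairs."""
    items = {}
    for key, value in obj.items():
        name = f"{prefix}{key}"
        if isinstance(value, dict):
            items.update(flatten(value, f"{name}."))
        else:
            items[name] = value
    return items


def main(root_dir):
    rows = [dict(flatten(json.loads(p.read_text())), path=str(p))
            for p in Path(root_dir).rglob("qc.json")]
    if not rows:
        return
    # Union of all metric names across workflows, so every row shares one header.
    columns = sorted({key for row in rows for key in row})
    writer = csv.DictWriter(sys.stdout, fieldnames=columns, delimiter="\t")
    writer.writeheader()
    writer.writerows(rows)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else ".")
```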
