Merge pull request #108 from ENCODE-DCC/dev

leepc12 · web-flow · commit 61f77dd94b3a · 2019-11-15T18:04:30.000-08:00
v1.3.4
diff --git a/README.md b/README.md
@@ -35,7 +35,6 @@ This ChIP-Seq pipeline is based off the ENCODE (phase-3) transcription factor an
 4) Follow [Caper's README](https://github.com/ENCODE-DCC/caper) carefully. Find an instruction for your platform.
 	> **IMPORTANT**: Configure your Caper configuration file `~/.caper/default.conf` correctly for your platform.
 
-
 ## Test input JSON file
 
 Use `https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI_subsampled_chr19_only_caper.json` as `[INPUT_JSON]` in Caper's documentation.
@@ -64,3 +63,16 @@ Install [Croo](https://github.com/ENCODE-DCC/croo#installation). **You can skip
 $ pip install croo
 $ croo [METADATA_JSON_FILE]
 ```
+
+## How to make a spreadsheet of QC metrics
+
+Install [qc2tsv](https://github.com/ENCODE-DCC/qc2tsv#installation). Make sure that you have Python 3 (> 3.4.1) installed on your system.
+
+Once you have [organized output with Croo](#how-to-organize-outputs), you will be able to find the pipeline's final output file `qc/qc.json`, which contains all QC metrics. Simply feed `qc2tsv` with multiple `qc.json` files. It accepts various URIs such as local paths, `gs://` and `s3://`.
+
+```bash
+$ pip install qc2tsv
+$ qc2tsv /sample1/qc.json gs://sample2/qc.json s3://sample3/qc.json ... > spreadsheet.tsv
+```
+
+QC metrics for each experiment (`qc.json`) will be split into multiple rows (one for the overall experiment plus one for each biological replicate) in the spreadsheet.
diff --git a/chip.croo.json b/chip.croo.json
diff --git a/chip.wdl b/chip.wdl
@@ -1,12 +1,12 @@
 # ENCODE TF/Histone ChIP-Seq pipeline
 # Author: Jin Lee (leepc12@gmail.com)
 
-#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.3
-#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.3
+#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.4
+#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.4
 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.json
 
 workflow chip {
-	String pipeline_ver = 'v1.3.3'
+	String pipeline_ver = 'v1.3.4'
 	### sample name, description
 	String title = 'Untitled'
 	String description = 'No description'
@@ -120,7 +120,7 @@ workflow chip {
 
 	Int macs2_signal_track_mem_mb = 16000
 	Int macs2_signal_track_time_hr = 24
-	String macs2_signal_track_disks = 'local-disk 200 HDD'
+	String macs2_signal_track_disks = 'local-disk 400 HDD'
 
 	Int call_peak_cpu = 2
 	Int call_peak_mem_mb = 16000
@@ -1184,8 +1184,6 @@ task align {
 	Int? multimapping
 	File? custom_align_py	
 	File? idx_tar			# reference index tar
-	File? fastq_R1 			# [read_end_id]
-	File? fastq_R2
 	Boolean paired_end
 	Boolean use_bwa_mem_for_pe
 
@@ -1597,6 +1595,7 @@ task call_peak {
 		memory : '${mem_mb} MB'
 		time : time_hr
 		disks : disks
+		preemptible: 0		
 	}
 }
 
@@ -1629,6 +1628,7 @@ task macs2_signal_track {
 		memory : '${mem_mb} MB'
 		time : time_hr
 		disks : disks
+		preemptible: 0
 	}
 }
 
diff --git a/docs/input.md b/docs/input.md
@@ -4,6 +4,8 @@ An input JSON file includes all genomic data files, parameters and metadata for
 
 Please read through the following step-by-step instruction to compose a input JSON file.
 
+>**IMPORTANT**: ALWAYS USE ABSOLUTE PATHS.
+
 ## Pipeline metadata
 
 Parameter|Description
@@ -252,7 +254,7 @@ Parameter|Default
 ---------|-------
 `chip.macs2_signal_track_mem_mb` | 16000
 `chip.macs2_signal_track_time_hr` | 24
-`chip.macs2_signal_track_disks` | `local-disk 200 HDD`
+`chip.macs2_signal_track_disks` | `local-disk 400 HDD`
 
 > **IMPORTANT**: If you see Java memory errors, check the following resource parameters.
 
diff --git a/docs/input_short.md b/docs/input_short.md
@@ -1,6 +1,8 @@
 # Input JSON
 
-An input JSON file includes all genomic data files, parameters and metadata for running pipelines. Our pipeline will use default values if they are not defined in an input JSON file. We provide a set of template JSON files: [minimum](../example_input_json/template.json) and [full](../example_input_json/template.full.json). We recommend to use a minimum template instead of full one. A full template includes all parameters of the pipeline with default values defined.
+An input JSON file must include all the information needed to run this pipeline: the absolute paths to all control and experimental FASTQ files, the paths to all genomic data files needed by the pipeline, and the parameters and metadata for the run. If a parameter is not specified in the input JSON file, its default value will be used. We provide a set of template JSON files: [minimum](../example_input_json/template.json) and [full](../example_input_json/template.full.json). We recommend using the minimum template instead of the full one. The full template includes all parameters of the pipeline with default values defined.
+
+>**IMPORTANT**: ALWAYS USE ABSOLUTE PATHS.
 
 # Checklist
 
@@ -79,7 +81,7 @@ Pipeline can start from any of the following data types (FASTQ, BAM, NODUP_BAM a
     * Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness.
     * Example of 3 singled-ended replicates.
         ```javascript
-        {
+        {       
             "chip.paired_end" : false,
             "chip.bams" : ["rep1.bam", "rep2.bam", "rep3.bam"]
         }
@@ -230,7 +232,7 @@ Parameter|Default
 ---------|-------
 `chip.macs2_signal_track_mem_mb` | 16000
 `chip.macs2_signal_track_time_hr` | 24
-`chip.macs2_signal_track_disks` | `local-disk 200 HDD`
+`chip.macs2_signal_track_disks` | `local-disk 400 HDD`
 
 > **IMPORTANT**: If you see Java memory errors, check the following resource parameters.
 
diff --git a/docs/install_conda.md b/docs/install_conda.md
@@ -4,6 +4,11 @@
 
 > **WARNING**: DO NOT SKIP ANY OF THE FOLLOWING STEPS OR PIPELINE'S ENVIRONMENT WILL BE MESSED UP WITH YOUR LOCAL PYTHON/GLOBAL CONDA.
 
+0) macOS users: **MAKE SURE THAT YOU HAVE GNU `grep` INSTALLED ON YOUR SYSTEM**. Check whether your `grep` supports the `-P` option.
+  ```bash
+  $ grep --help  # check if a parameter "-P" exists
+  ```
+
 1) Download [Miniconda installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Use default answers to all questions except for the first and last.
   ```bash
   $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
diff --git a/example_input_json/template.full.json b/example_input_json/template.full.json
@@ -10,10 +10,7 @@
     "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv",
 
     "chip.paired_end" : true,
-    "chip.ctl_paired_end" : [true, true],
-
-    "chip.paired_ends" : true,
-    "chip.ctl_paired_ends" : [true, true],
+    "chip.ctl_paired_end" : true,
 
     "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ],
     "chip.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ],
@@ -40,15 +37,14 @@
     "chip.ctl_depth_ratio" : 1.2,
 
     "chip.peak_caller" : null,
-    "chip.macs2_cap_num_peak" : 500000,
+    "chip.cap_num_peak_macs2" : 500000,
     "chip.pval_thresh" : 0.01,
     "chip.idr_thresh" : 0.05,
-    "chip.spp_cap_num_peak" : 300000,
+    "chip.cap_num_peak_spp" : 300000,
 
     "chip.enable_jsd" : true,
     "chip.enable_gc_bias" : true,
     "chip.enable_count_signal_track" : false,
-    "chip.keep_irregular_chr_in_bfilt_peak" : false,
 
     "chip.filter_chrs" : [],
 
@@ -86,7 +82,7 @@
 
     "chip.macs2_signal_track_mem_mb" : 16000,
     "chip.macs2_signal_track_time_hr" : 24,
-    "chip.macs2_signal_track_disks" : "local-disk 200 HDD",
+    "chip.macs2_signal_track_disks" : "local-disk 400 HDD",
 
     "chip.filter_picard_java_heap" : "4G",
     "chip.gc_bias_picard_java_heap" : "6G"
diff --git a/src/encode_task_qc_report.py b/src/encode_task_qc_report.py
@@ -191,21 +191,11 @@ def parse_arguments():
         if isinstance(value, list):
             setattr(args, a, split_entries_and_extend(value))
 
-    if args.paired_ends is None:
-        if args.paired_end:
-            args.paired_ends = [True]*20
-        else:
-            args.paired_ends = [False]*20
-    else:
+    if args.paired_ends is not None:
         for i, _ in enumerate(args.paired_ends):
             args.paired_ends[i] = str2bool(args.paired_ends[i])
 
-    if args.ctl_paired_ends is None:
-        if args.ctl_paired_end:
-            args.ctl_paired_ends = [True]*20
-        else:
-            args.ctl_paired_ends = args.paired_ends
-    else:
+    if args.ctl_paired_ends is not None:
         for i, _ in enumerate(args.ctl_paired_ends):
             args.ctl_paired_ends[i] = str2bool(args.ctl_paired_ends[i])
 
@@ -250,8 +240,7 @@ def str_ctl(i):
     'aligner': 'Aligner',
     'peak_caller': 'Peak caller',
     'genome': 'Genome',
-    'paired_end': 'Paired-end per replicate',
-    'ctl_paired_end': 'Control paired-end per replicate',
+    'seq_endedness': 'Sequencing endedness'
 }
 
 
@@ -270,13 +259,16 @@ def make_cat_root(args):
         ('pipeline_ver', args.pipeline_ver),
         ('pipeline_type', args.pipeline_type),
         ('genome', args.genome),
-        ('paired_end', args.paired_ends),
         ('aligner', args.aligner),
+        ('seq_endedness', OrderedDict()),
         ('peak_caller', args.peak_caller),
     ])
-    if args.ctl_paired_ends \
-            and args.pipeline_type not in ('atac', 'dnase'):
-        d_general['ctl_paired_end'] = args.ctl_paired_ends
+    if args.paired_ends is not None:
+        for i, paired_end in enumerate(args.paired_ends):
+            d_general['seq_endedness']['rep{}'.format(i + 1)] = {'paired_end': paired_end}
+    if args.ctl_paired_ends is not None:
+        for i, paired_end in enumerate(args.ctl_paired_ends):
+            d_general['seq_endedness']['ctl{}'.format(i + 1)] = {'paired_end': paired_end}
     cat_root.add_log(d_general, key='general')
 
     return cat_root