From 2ec4e5743eeb72287144f76de8affc2a310d9d09 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Wed, 5 Jun 2019 08:21:00 -0700 Subject: [PATCH 01/14] increase disk for filter task from 200 to 400 HDD --- chip.wdl | 2 +- examples/template_pe.full.json | 4 ++-- examples/template_se.full.json | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/chip.wdl b/chip.wdl index 5ebc007c..1e792ba4 100644 --- a/chip.wdl +++ b/chip.wdl @@ -79,7 +79,7 @@ workflow chip { Int filter_cpu = 2 Int filter_mem_mb = 20000 Int filter_time_hr = 24 - String filter_disks = "local-disk 200 HDD" + String filter_disks = "local-disk 400 HDD" Int bam2ta_cpu = 2 Int bam2ta_mem_mb = 10000 diff --git a/examples/template_pe.full.json b/examples/template_pe.full.json index 7ada2c63..1a4c42a8 100644 --- a/examples/template_pe.full.json +++ b/examples/template_pe.full.json @@ -84,7 +84,7 @@ "chip.filter_cpu" : 2, "chip.filter_mem_mb" : 20000, "chip.filter_time_hr" : 24, - "chip.filter_disks" : "local-disk 200 HDD", + "chip.filter_disks" : "local-disk 400 HDD", "chip.bam2ta_cpu" : 2, "chip.bam2ta_mem_mb" : 10000, @@ -111,4 +111,4 @@ "chip.spp_mem_mb" : 16000, "chip.spp_time_hr" : 72, "chip.spp_disks" : "local-disk 200 HDD" -} \ No newline at end of file +} diff --git a/examples/template_se.full.json b/examples/template_se.full.json index a791600a..cfd2950d 100644 --- a/examples/template_se.full.json +++ b/examples/template_se.full.json @@ -79,7 +79,7 @@ "chip.filter_cpu" : 2, "chip.filter_mem_mb" : 20000, "chip.filter_time_hr" : 24, - "chip.filter_disks" : "local-disk 200 HDD", + "chip.filter_disks" : "local-disk 400 HDD", "chip.bam2ta_cpu" : 2, "chip.bam2ta_mem_mb" : 10000, @@ -106,4 +106,4 @@ "chip.spp_mem_mb" : 16000, "chip.spp_time_hr" : 72, "chip.spp_disks" : "local-disk 200 HDD" -} \ No newline at end of file +} From bf1d3c289f58a378c9ec2a8846b37639b40a806f Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Fri, 7 Jun 2019 09:15:24 -0700 Subject: [PATCH 02/14] fix typo in doc --- 
docs/input.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/input.md b/docs/input.md index 44a1276d..d6123328 100644 --- a/docs/input.md +++ b/docs/input.md @@ -80,7 +80,7 @@ Let us take a close look at a full template JSON. Comments are not allowed in a // If you start from FASTQs then define these, otherwise remove from this file. // You can define up to 6 replicates. - // FASTQs in an array will be merged after trimming adapters. + // FASTQs in an array will be merged. // For example, // "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz" and "rep1_R1_L3.fastq.gz" will be merged together. "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], From b328949452271c9fd6d0e22e4765354011acc34d Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 04:05:26 -0700 Subject: [PATCH 03/14] don't install caper, croo in conda env --- conda/config_conda_env_py3.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conda/config_conda_env_py3.sh b/conda/config_conda_env_py3.sh index 32c8279d..f9213af0 100755 --- a/conda/config_conda_env_py3.sh +++ b/conda/config_conda_env_py3.sh @@ -18,8 +18,8 @@ else exit 1 fi -echo "=== Installing additional packages for python3 env..." -source activate ${CONDA_ENV_PY3} - pip install caper croo +#echo "=== Installing additional packages for python3 env..." +#source activate ${CONDA_ENV_PY3} +# pip install caper croo echo "=== All done." 
From b82bdaefc6425ccbae5e1988358b7646b7e94647 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 04:08:11 -0700 Subject: [PATCH 04/14] complete refactoring of chip.wdl for 1) mixed endedness, 2) sync with atac pipeline (more robust code structure), 3) remove resumer support, 4) max num_rep 6 -> 10 --- chip.wdl | 1565 +++++++++++++++++--------------- examples/template_pe.full.json | 2 +- examples/template_se.full.json | 2 +- src/encode_merge_fastq.py | 11 +- 4 files changed, 858 insertions(+), 722 deletions(-) diff --git a/chip.wdl b/chip.wdl index 1e792ba4..7d847d43 100644 --- a/chip.wdl +++ b/chip.wdl @@ -11,16 +11,27 @@ workflow chip { String title = 'Untitled' String description = 'No description' + # endedness for input data + Boolean? paired_end # to define endedness for all replicates + # if defined, this will override individual endedness below + Array[Boolean] paired_ends = [] # to define endedness for individual replicate + Boolean? ctl_paired_end + Array[Boolean] ctl_paired_ends = [] + + ### mandatory genome param + File? genome_tsv # reference genome data TSV file including + # all genome-specific file paths and parameters + # individual genome parameters + File? ref_fa # reference fasta (*.fa.gz) + File? bwa_idx_tar # bwa index tar (uncompressed) + File? chrsz # 2-col chromosome sizes file + File? blacklist # blacklist BED (peaks overlapping will be filtered out) + String? gensz # genome sizes (hs for human, mm for mouse or sum of 2nd col in chrsz) + ### pipeline type String pipeline_type # tf or histone chip-eq String? peak_caller # default: (spp for tf) and (macs2 for histone) - ### mandatory genome param - File genome_tsv # reference genome data TSV file including - # all important genome specific data file paths and parameters - Boolean paired_end - Boolean? ctl_paired_end - ### optional but important Boolean align_only = false # disable all post-align analysis (peak-calling, overlap, idr, ...) 
Boolean true_rep_only = false # disable all analyses for pseudo replicates @@ -114,94 +125,92 @@ workflow chip { # [rep_id] is for each replicate ### fastqs - # define fastqs either with DNANexus style (1-dim array) or with default one (3-dim array) - # [merge_id] is for pooing fastqs - ## DNANexus UI style fastq definition - Array[File] fastqs_rep1_R1 = [] # [merge_id] - Array[File] fastqs_rep1_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] fastqs_rep2_R1 = [] # do not define if you have a single replicate - Array[File] fastqs_rep2_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] fastqs_rep3_R1 = [] # do not define if you have <=2 replicates - Array[File] fastqs_rep3_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] fastqs_rep4_R1 = [] # do not define if you have <=3 replicates - Array[File] fastqs_rep4_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] fastqs_rep5_R1 = [] # do not define if you have <=4 replicates - Array[File] fastqs_rep5_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] fastqs_rep6_R1 = [] # do not define if you have <=5 replicates - Array[File] fastqs_rep6_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] ctl_fastqs_rep1_R1 = [] # [merge_id] - Array[File] ctl_fastqs_rep1_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] ctl_fastqs_rep2_R1 = [] # do not define if you have a single control - Array[File] ctl_fastqs_rep2_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] ctl_fastqs_rep3_R1 = [] # do not define if you have <=2 controls - Array[File] ctl_fastqs_rep3_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] ctl_fastqs_rep4_R1 = [] # do not define if you have <=3 controls - Array[File] ctl_fastqs_rep4_R2 = [] # do not define _R2 array if your sample is not paired end 
- Array[File] ctl_fastqs_rep5_R1 = [] # do not define if you have <=4 controls - Array[File] ctl_fastqs_rep5_R2 = [] # do not define _R2 array if your sample is not paired end - Array[File] ctl_fastqs_rep6_R1 = [] # do not define if you have <=5 controls - Array[File] ctl_fastqs_rep6_R2 = [] # do not define _R2 array if your sample is not paired end - ## default style fastq definition - # [read_end_id] is for fastq R1 or fastq R2 - Array[Array[Array[File]]] fastqs = [] # [rep_id][merge_id][read_end_id] - Array[Array[Array[File]]] ctl_fastqs = [] # [rep_id][merge_id][read_end_id] + Array[File] fastqs_rep1_R1 = [] # FASTQs to be merged for rep1 R1 + Array[File] fastqs_rep1_R2 = [] # do not define if single-ended + Array[File] fastqs_rep2_R1 = [] # do not define if unreplicated + Array[File] fastqs_rep2_R2 = [] # ... + Array[File] fastqs_rep3_R1 = [] + Array[File] fastqs_rep3_R2 = [] + Array[File] fastqs_rep4_R1 = [] + Array[File] fastqs_rep4_R2 = [] + Array[File] fastqs_rep5_R1 = [] + Array[File] fastqs_rep5_R2 = [] + Array[File] fastqs_rep6_R1 = [] + Array[File] fastqs_rep6_R2 = [] + Array[File] fastqs_rep7_R1 = [] + Array[File] fastqs_rep7_R2 = [] + Array[File] fastqs_rep8_R1 = [] + Array[File] fastqs_rep8_R2 = [] + Array[File] fastqs_rep9_R1 = [] + Array[File] fastqs_rep9_R2 = [] + Array[File] fastqs_rep10_R1 = [] + Array[File] fastqs_rep10_R2 = [] + + Array[File] ctl_fastqs_rep1_R1 = [] # Control FASTQs to be merged for rep1 R1 + Array[File] ctl_fastqs_rep1_R2 = [] # do not define if single-ended + Array[File] ctl_fastqs_rep2_R1 = [] # do not define if unreplicated + Array[File] ctl_fastqs_rep2_R2 = [] # ... 
+ Array[File] ctl_fastqs_rep3_R1 = [] + Array[File] ctl_fastqs_rep3_R2 = [] + Array[File] ctl_fastqs_rep4_R1 = [] + Array[File] ctl_fastqs_rep4_R2 = [] + Array[File] ctl_fastqs_rep5_R1 = [] + Array[File] ctl_fastqs_rep5_R2 = [] + Array[File] ctl_fastqs_rep6_R1 = [] + Array[File] ctl_fastqs_rep6_R2 = [] + Array[File] ctl_fastqs_rep7_R1 = [] + Array[File] ctl_fastqs_rep7_R2 = [] + Array[File] ctl_fastqs_rep8_R1 = [] + Array[File] ctl_fastqs_rep8_R2 = [] + Array[File] ctl_fastqs_rep9_R1 = [] + Array[File] ctl_fastqs_rep9_R2 = [] + Array[File] ctl_fastqs_rep10_R1 = [] + Array[File] ctl_fastqs_rep10_R2 = [] ### other input types (bam, nodup_bam, ta) - Array[File] bams = [] # [rep_id] - Array[File] ctl_bams = [] # [rep_id] - Array[File] nodup_bams = [] # [rep_id] - Array[File] ctl_nodup_bams = [] # [rep_id] - Array[File] tas = [] # [rep_id] - Array[File] ctl_tas = [] # [rep_id] + Array[File?] merged_fastqs_R1 = [] + Array[File?] merged_fastqs_R2 = [] + Array[File?] ctl_merged_fastqs_R1 = [] + Array[File?] ctl_merged_fastqs_R2 = [] + Array[File?] bams = [] # [rep_id] + Array[File?] ctl_bams = [] # [rep_id] + Array[File?] nodup_bams = [] # [rep_id] + Array[File?] ctl_nodup_bams = [] # [rep_id] + Array[File?] tas = [] # [rep_id] + Array[File?] ctl_tas = [] # [rep_id] ### other input types (peak) - Array[Int] fraglen = [] # [rep_id]. fragment length if inputs are peaks - Array[File] peaks = [] # [PAIR(rep_id1,rep_id2)]. example for 3 reps: [rep1_rep2, rep1_rep3, rep2_rep3] - Array[File] peaks_pr1 = [] # [rep_id]. do not define if true_rep=true - Array[File] peaks_pr2 = [] # [rep_id]. do not define if true_rep=true + Array[Int?] fraglen = [] # [rep_id]. fragment length if inputs are peaks + Array[File?] peaks = [] # [PAIR(rep_id1,rep_id2)]. example for 3 reps: [rep1_rep2, rep1_rep3, rep2_rep3] + Array[File?] peaks_pr1 = [] # [rep_id]. do not define if true_rep=true + Array[File?] peaks_pr2 = [] # [rep_id]. do not define if true_rep=true File? 
peak_ppr1 # do not define if you have a single replicate or true_rep=true File? peak_ppr2 # do not define if you have a single replicate or true_rep=true File? peak_pooled # do not define if you have a single replicate or true_rep=true - ### other inputs used for resuming pipelines (QC/txt/log/png files, ...) - File? ta_pooled - File? ctl_ta_pooled - Array[File] flagstat_qcs = [] - Array[File] pbc_qcs = [] - Array[File] dup_qcs = [] - Array[File] nodup_flagstat_qcs = [] - Array[File] ctl_flagstat_qcs = [] - Array[File] ctl_pbc_qcs = [] - Array[File] ctl_dup_qcs = [] - Array[File] ctl_nodup_flagstat_qcs = [] - Array[File] pval_bws = [] - Array[File] xcor_plots = [] - Array[File] xcor_scores = [] - - Array[File] macs2_frip_qcs = [] - Array[File] macs2_pr1_frip_qcs = [] - Array[File] macs2_pr2_frip_qcs = [] - File? macs2_pooled_frip_qc_ - File? macs2_ppr1_frip_qc_ - File? macs2_ppr2_frip_qc_ - Array[File] spp_frip_qcs = [] - Array[File] spp_pr1_frip_qcs = [] - Array[File] spp_pr2_frip_qcs = [] - File? spp_pooled_frip_qc_ - File? spp_ppr1_frip_qc_ - File? spp_ppr2_frip_qc_ - - Array[File] jsd_qcs = [] - File? jsd_plot - - Array[File] count_signal_track_pos_bws = [] - Array[File] count_signal_track_neg_bws = [] - File? count_signal_track_pooled_pos_bw_ - File? count_signal_track_pooled_neg_bw_ + ####################### pipeline starts here ####################### + # DO NOT DEFINE ANY VARIABLES DECLARED BELOW IN AN INPUT JSON FILE # + # THEY ARE TEMPORARY/INTERMEDIATE SYSTEM VARIABLES # + ####################### pipeline starts here ####################### + + # read genome data and paths + if ( defined(genome_tsv) ) { + call read_genome_tsv { input: genome_tsv = genome_tsv } + } + File? ref_fa_ = if defined(ref_fa) then ref_fa + else read_genome_tsv.ref_fa + File? bwa_idx_tar_ = if defined(bwa_idx_tar) then bwa_idx_tar + else read_genome_tsv.bwa_idx_tar + File? chrsz_ = if defined(chrsz) then chrsz + else read_genome_tsv.chrsz + String? 
gensz_ = if defined(gensz) then gensz + else read_genome_tsv.gensz + File? blacklist_ = if defined(blacklist) then blacklist + else read_genome_tsv.blacklist ### temp vars (do not define these) - String peak_caller_ = if pipeline_type=='tf' then select_first([peak_caller,'spp']) - else select_first([peak_caller,'macs2']) + String peak_caller_ = if pipeline_type=='tf' then select_first([peak_caller, 'spp']) + else select_first([peak_caller, 'macs2']) String peak_type = if peak_caller_=='spp' then 'regionPeak' else if peak_caller_=='macs2' then 'narrowPeak' else 'narrowPeak' @@ -210,312 +219,411 @@ workflow chip { else if peak_caller_=='macs2' then 'p.value' else 'p.value' - ### read genome data and paths - call read_genome_tsv { input:genome_tsv = genome_tsv } - File bwa_idx_tar = read_genome_tsv.genome['bwa_idx_tar'] - File blacklist = read_genome_tsv.genome['blacklist'] - File chrsz = read_genome_tsv.genome['chrsz'] - String gensz = read_genome_tsv.genome['gensz'] - - ### pipeline starts here - # temporary 2-dim arrays for DNANexus style fastqs - Array[Array[File]] fastqs_rep1 = if length(fastqs_rep1_R2)>0 then transpose([fastqs_rep1_R1,fastqs_rep1_R2]) - else transpose([fastqs_rep1_R1]) - Array[Array[File]] fastqs_rep2 = if length(fastqs_rep2_R2)>0 then transpose([fastqs_rep2_R1,fastqs_rep2_R2]) - else transpose([fastqs_rep2_R1]) - Array[Array[File]] fastqs_rep3 = if length(fastqs_rep3_R2)>0 then transpose([fastqs_rep3_R1,fastqs_rep3_R2]) - else transpose([fastqs_rep3_R1]) - Array[Array[File]] fastqs_rep4 = if length(fastqs_rep4_R2)>0 then transpose([fastqs_rep4_R1,fastqs_rep4_R2]) - else transpose([fastqs_rep4_R1]) - Array[Array[File]] fastqs_rep5 = if length(fastqs_rep5_R2)>0 then transpose([fastqs_rep5_R1,fastqs_rep5_R2]) - else transpose([fastqs_rep5_R1]) - Array[Array[File]] fastqs_rep6 = if length(fastqs_rep6_R2)>0 then transpose([fastqs_rep6_R1,fastqs_rep6_R2]) - else transpose([fastqs_rep6_R1]) - Array[Array[Array[File]]] fastqs_ = if 
length(fastqs_rep1)<1 then fastqs - else if length(fastqs_rep2)<1 then [fastqs_rep1] - else if length(fastqs_rep3)<1 then [fastqs_rep1,fastqs_rep2] - else if length(fastqs_rep4)<1 then [fastqs_rep1,fastqs_rep2,fastqs_rep3] - else if length(fastqs_rep5)<1 then [fastqs_rep1,fastqs_rep2,fastqs_rep3,fastqs_rep4] - else if length(fastqs_rep6)<1 then [fastqs_rep1,fastqs_rep2,fastqs_rep3,fastqs_rep4,fastqs_rep5] - else [fastqs_rep1,fastqs_rep2,fastqs_rep3,fastqs_rep4,fastqs_rep5,fastqs_rep6] - - ## temp vars for resuming pipelines - Boolean need_to_process_ta = length(peaks_pr1)==0 && length(peaks)==0 - Boolean need_to_process_nodup_bam = need_to_process_ta && length(tas)==0 - Boolean need_to_process_bam = need_to_process_nodup_bam && length(nodup_bams)==0 - Boolean need_to_process_fastq = need_to_process_bam && length(bams)==0 - - scatter(fastq_set in if need_to_process_fastq then fastqs_ else []) { - # merge fastqs - call merge_fastq { input : - fastqs = fastq_set, - paired_end = paired_end, - } - # align merged fastqs with bwa - call bwa { input : - idx_tar = bwa_idx_tar, - fastqs = merge_fastq.merged_fastqs, #[R1,R2] - paired_end = paired_end, - use_bwa_mem_for_pe = use_bwa_mem_for_pe, - cpu = bwa_cpu, - mem_mb = bwa_mem_mb, - time_hr = bwa_time_hr, - disks = bwa_disks, - } - } - - # special treatment for xcor for paired end samples only - Array[Array[File]] fastqs_xcor = if !paired_end then [] else merge_fastq.merged_fastqs - scatter(fastq_set in fastqs_xcor) { - # for paired end dataset, map R1 only as SE for xcor analysis - call trim_fastq { input : - fastq = fastq_set[0], - trim_bp = xcor_pe_trim_bp, - } - } - Array[Array[File]] trimmed_fastqs_R1 = if length(trim_fastq.trimmed_fastq)<1 then [] - else transpose([trim_fastq.trimmed_fastq]) - scatter(fastq_set in trimmed_fastqs_R1) { - call bwa as bwa_R1 { input : - idx_tar = bwa_idx_tar, - fastqs = fastq_set, - paired_end = false, - use_bwa_mem_for_pe = use_bwa_mem_for_pe, - cpu = bwa_cpu, - mem_mb = bwa_mem_mb, - 
time_hr = bwa_time_hr, - disks = bwa_disks, - } - # no bam filtering for xcor - call bam2ta as bam2ta_no_filt_R1 { input : - bam = bwa_R1.bam, - paired_end = false, - subsample = 0, - regex_grep_v_ta = regex_filter_reads, - mito_chr_name = mito_chr_name, - - cpu = bam2ta_cpu, - mem_mb = bam2ta_mem_mb, - time_hr = bam2ta_time_hr, - disks = bam2ta_disks, + # temporary 2-dim fastqs array [rep_id][merge_id] + Array[Array[File]] fastqs_R1 = + if length(fastqs_rep10_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1, + fastqs_rep6_R1, fastqs_rep7_R1, fastqs_rep8_R1, fastqs_rep9_R1, fastqs_rep10_R1] + else if length(fastqs_rep9_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1, + fastqs_rep6_R1, fastqs_rep7_R1, fastqs_rep8_R1, fastqs_rep9_R1] + else if length(fastqs_rep8_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1, + fastqs_rep6_R1, fastqs_rep7_R1, fastqs_rep8_R1] + else if length(fastqs_rep7_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1, + fastqs_rep6_R1, fastqs_rep7_R1] + else if length(fastqs_rep6_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1, + fastqs_rep6_R1] + else if length(fastqs_rep5_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1, fastqs_rep5_R1] + else if length(fastqs_rep4_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1, fastqs_rep4_R1] + else if length(fastqs_rep3_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1, fastqs_rep3_R1] + else if length(fastqs_rep2_R1)>0 then + [fastqs_rep1_R1, fastqs_rep2_R1] + else if length(fastqs_rep1_R1)>0 then + [fastqs_rep1_R1] + else [] + # no need to do that for R2 (R1 array will be used to determine presence of fastq for each rep) + Array[Array[File]] fastqs_R2 = + [fastqs_rep1_R2, fastqs_rep2_R2, fastqs_rep3_R2, fastqs_rep4_R2, fastqs_rep5_R2, + fastqs_rep6_R2, 
fastqs_rep7_R2, fastqs_rep8_R2, fastqs_rep9_R2, fastqs_rep10_R2] + + # temporary 2-dim ctl fastqs array [rep_id][merge_id] + Array[Array[File]] ctl_fastqs_R1 = + if length(ctl_fastqs_rep10_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1, + ctl_fastqs_rep6_R1, ctl_fastqs_rep7_R1, ctl_fastqs_rep8_R1, ctl_fastqs_rep9_R1, ctl_fastqs_rep10_R1] + else if length(ctl_fastqs_rep9_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1, + ctl_fastqs_rep6_R1, ctl_fastqs_rep7_R1, ctl_fastqs_rep8_R1, ctl_fastqs_rep9_R1] + else if length(ctl_fastqs_rep8_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1, + ctl_fastqs_rep6_R1, ctl_fastqs_rep7_R1, ctl_fastqs_rep8_R1] + else if length(ctl_fastqs_rep7_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1, + ctl_fastqs_rep6_R1, ctl_fastqs_rep7_R1] + else if length(ctl_fastqs_rep6_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1, + ctl_fastqs_rep6_R1] + else if length(ctl_fastqs_rep5_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1, ctl_fastqs_rep5_R1] + else if length(ctl_fastqs_rep4_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1, ctl_fastqs_rep4_R1] + else if length(ctl_fastqs_rep3_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1, ctl_fastqs_rep3_R1] + else if length(ctl_fastqs_rep2_R1)>0 then + [ctl_fastqs_rep1_R1, ctl_fastqs_rep2_R1] + else if length(ctl_fastqs_rep1_R1)>0 then + [ctl_fastqs_rep1_R1] + else [] + # no need to do that for R2 (R1 array will be used to determine presence of fastq for each rep) + Array[Array[File]] ctl_fastqs_R2 = + [ctl_fastqs_rep1_R2, ctl_fastqs_rep2_R2, ctl_fastqs_rep3_R2, ctl_fastqs_rep4_R2, ctl_fastqs_rep5_R2, + 
ctl_fastqs_rep6_R2, ctl_fastqs_rep7_R2, ctl_fastqs_rep8_R2, ctl_fastqs_rep9_R2, ctl_fastqs_rep10_R2] + + # temporary variables to get number of replicates + # WDLic implementation of max(A,B,C,...) + Int num_rep_fastq = length(fastqs_R1) + Int num_rep_merged_fastq = if length(merged_fastqs_R1)0 + Boolean has_output_of_merge_fastq = i1 ) { - # pool tagaligns from true replicates - call pool_ta { input : - tas = tas__, + File? ta_ = if has_output_of_bam2ta then tas[i] else bam2ta.ta + + # convert unfiltered BAM to a special TAG-ALIGN for xcor + Boolean has_input_of_bam2ta_no_filt = has_output_of_bwa || defined(bwa.bam) + if ( has_input_of_bam2ta_no_filt ) { + call bam2ta as bam2ta_no_filt { input : + bam = bam_, + paired_end = paired_end_, + subsample = 0, + regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, + + cpu = bam2ta_cpu, + mem_mb = bam2ta_mem_mb, + time_hr = bam2ta_time_hr, + disks = bam2ta_disks, + } } - } - if ( !true_rep_only ) { - scatter( ta in tas__ ) { - # make two self pseudo replicates per true replicate + Boolean has_input_of_spr = has_output_of_bam2ta || defined(bam2ta.ta) + if ( has_input_of_spr && !align_only && !true_rep_only ) { call spr { input : - ta = ta, - paired_end = paired_end, + ta = ta_, + paired_end = paired_end_, mem_mb = spr_mem_mb, } } - } - if ( !true_rep_only && length(tas__)>1 ) { - # pool tagaligns from pseudo replicates - call pool_ta as pool_ta_pr1 { input : - tas = spr.ta_pr1, + + Boolean has_input_of_count_signal_track = has_output_of_bam2ta || defined(bam2ta.ta) + if ( has_input_of_count_signal_track && enable_count_signal_track ) { + # generate count signal track + call count_signal_track { input : + ta = ta_, + chrsz = chrsz_, + } } - call pool_ta as pool_ta_pr2 { input : - tas = spr.ta_pr2, + + Boolean has_input_of_trim_fastq = has_output_of_merge_fastq || defined(merge_fastq.merged_fastq_R1) + if ( has_input_of_trim_fastq && paired_end_ ) { + # special trimming for paired end samples (for 
cross-corr analysis) + call trim_fastq { input : + fastq = merged_fastq_R1_, + trim_bp = xcor_pe_trim_bp, + } + call bwa as bwa_R1 { input : + bwa_idx_tar = bwa_idx_tar_, + fastq_R1 = trim_fastq.trimmed_fastq, + paired_end = false, + use_bwa_mem_for_pe = use_bwa_mem_for_pe, + cpu = bwa_cpu, + mem_mb = bwa_mem_mb, + time_hr = bwa_time_hr, + disks = bwa_disks, + } + # no bam filtering for xcor + call bam2ta as bam2ta_no_filt_R1 { input : + bam = bwa_R1.bam, + paired_end = false, + subsample = 0, + regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, + + cpu = bam2ta_cpu, + mem_mb = bam2ta_mem_mb, + time_hr = bam2ta_time_hr, + disks = bam2ta_disks, + } } - } - Array[File] tas_xcor = if length(fraglen)>0 then [] - else if length(xcor_scores)>0 then [] - else if length(bam2ta_no_filt_R1.ta)>0 then bam2ta_no_filt_R1.ta - else if length(bam2ta_no_filt.ta)>0 then bam2ta_no_filt.ta - else tas__ - Boolean paired_end_xcor = paired_end && length(bam2ta_no_filt_R1.ta)<1 - scatter(ta in tas_xcor) { # use trimmed/unfilitered R1 tagAlign for paired end dataset # if not starting from fastqs, keep using old method # (mapping with both ends for tag-aligns to be used for xcor) # subsample tagalign (non-mito) and cross-correlation analysis - call xcor { input : - ta = ta, - paired_end = paired_end_xcor, - subsample = xcor_subsample_reads, - mito_chr_name = mito_chr_name, - chip_seq_type = pipeline_type, - exclusion_range_min = xcor_exclusion_range_min, - exclusion_range_max = xcor_exclusion_range_max, - cpu = xcor_cpu, - mem_mb = xcor_mem_mb, - time_hr = xcor_time_hr, - disks = xcor_disks, + File? ta_xcor = if defined(bam2ta_no_filt_R1.ta) then bam2ta_no_filt_R1.ta + else if defined(bam2ta_no_filt.ta) then bam2ta_no_filt.ta + else ta_ + Boolean? 
paired_end_xcor = if defined(bam2ta_no_filt_R1.ta) then false + else paired_end_ + + Boolean has_input_of_xcor = defined(ta_xcor) + if ( has_input_of_xcor ) { + call xcor { input : + ta = ta_xcor, + paired_end = paired_end_xcor, + subsample = xcor_subsample_reads, + mito_chr_name = mito_chr_name, + chip_seq_type = pipeline_type, + exclusion_range_min = xcor_exclusion_range_min, + exclusion_range_max = xcor_exclusion_range_max, + cpu = xcor_cpu, + mem_mb = xcor_mem_mb, + time_hr = xcor_time_hr, + disks = xcor_disks, + } } - } - # generate count signal track - Array[File] tas_count_signal_track = if length(count_signal_track_pos_bws)>0 then [] - else if enable_count_signal_track then tas_ - else [] - scatter(i in range(length(tas_count_signal_track))) { - call count_signal_track { input : - ta = tas_count_signal_track[i], - chrsz = chrsz, + # before peak calling, get fragment length from xcor analysis or given input + # if fraglen [] is defined in the input JSON, fraglen from xcor will be ignored + Int? fraglen_ = if length(fraglen)>0 then fraglen[i] + else xcor.fraglen + } + + # align each control + scatter(i in range(num_ctl)) { + # to override endedness definition for individual control + # ctl_paired_end will override ctl_paired_ends[i] + Boolean? ctl_paired_end_ = if !defined(ctl_paired_end) && i0 + Boolean has_output_of_merge_fastq_ctl = i0 ) { - call count_signal_track as count_signal_track_pooled { input : - ta = select_first([pool_ta.ta_pooled, ta_pooled]), - chrsz = chrsz, + File? ctl_merged_fastq_R1_ = if has_output_of_merge_fastq_ctl then ctl_merged_fastqs_R1[i] + else merge_fastq_ctl.merged_fastq_R1 + File? 
ctl_merged_fastq_R2_ = if i0 then transpose([ctl_fastqs_rep1_R1,ctl_fastqs_rep1_R2]) - else transpose([ctl_fastqs_rep1_R1]) - Array[Array[File]] ctl_fastqs_rep2 = if length(ctl_fastqs_rep2_R2)>0 then transpose([ctl_fastqs_rep2_R1,ctl_fastqs_rep2_R2]) - else transpose([ctl_fastqs_rep2_R1]) - Array[Array[File]] ctl_fastqs_rep3 = if length(ctl_fastqs_rep3_R2)>0 then transpose([ctl_fastqs_rep3_R1,ctl_fastqs_rep3_R2]) - else transpose([ctl_fastqs_rep3_R1]) - Array[Array[File]] ctl_fastqs_rep4 = if length(ctl_fastqs_rep4_R2)>0 then transpose([ctl_fastqs_rep4_R1,ctl_fastqs_rep4_R2]) - else transpose([ctl_fastqs_rep4_R1]) - Array[Array[File]] ctl_fastqs_rep5 = if length(ctl_fastqs_rep5_R2)>0 then transpose([ctl_fastqs_rep5_R1,ctl_fastqs_rep5_R2]) - else transpose([ctl_fastqs_rep5_R1]) - Array[Array[File]] ctl_fastqs_rep6 = if length(ctl_fastqs_rep6_R2)>0 then transpose([ctl_fastqs_rep6_R1,ctl_fastqs_rep6_R2]) - else transpose([ctl_fastqs_rep6_R1]) - Array[Array[Array[File]]] ctl_fastqs_ = if length(ctl_fastqs_rep1)<1 then ctl_fastqs - else if length(ctl_fastqs_rep2)<1 then [ctl_fastqs_rep1] - else if length(ctl_fastqs_rep3)<1 then [ctl_fastqs_rep1,ctl_fastqs_rep2] - else if length(ctl_fastqs_rep4)<1 then [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3] - else if length(ctl_fastqs_rep5)<1 then [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3,ctl_fastqs_rep4] - else if length(ctl_fastqs_rep6)<1 then [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3,ctl_fastqs_rep4,ctl_fastqs_rep5] - else [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3,ctl_fastqs_rep4,ctl_fastqs_rep5,ctl_fastqs_rep6] - - ## temp vars for resuming pipelines - Boolean need_to_process_ctl_nodup_bam = length(ctl_tas)==0 - Boolean need_to_process_ctl_bam = need_to_process_ctl_nodup_bam && length(ctl_nodup_bams)==0 - Boolean need_to_process_ctl_fastq = need_to_process_ctl_bam && length(ctl_bams)==0 - - scatter(fastq_set in if need_to_process_ctl_fastq then ctl_fastqs_ else []) { - # merge fastqs - call merge_fastq 
as merge_fastq_ctl { input : - fastqs = fastq_set, - paired_end = if defined(ctl_paired_end) then ctl_paired_end else paired_end, - } - # align merged fastqs with bwa - call bwa as bwa_ctl { input : - idx_tar = bwa_idx_tar, - fastqs = merge_fastq_ctl.merged_fastqs, #[R1,R2] - paired_end = paired_end, - use_bwa_mem_for_pe = use_bwa_mem_for_pe, - cpu = bwa_cpu, - mem_mb = bwa_mem_mb, - time_hr = bwa_time_hr, - disks = bwa_disks, + # if there are TAs for ALL replicates then pool them + Boolean has_all_inputs_of_pool_ta = length(select_all(ta_))==num_rep + if ( has_all_inputs_of_pool_ta && num_rep>1 ) { + # pool tagaligns from true replicates + call pool_ta { input : + tas = ta_, } } - Array[File] ctl_bams_ = flatten([bwa_ctl.bam, ctl_bams]) - scatter(bam in if need_to_process_ctl_bam then ctl_bams_ else []) { - # filter/dedup bam - call filter as filter_ctl { input : - bam = bam, - paired_end = if defined(ctl_paired_end) then ctl_paired_end else paired_end, - dup_marker = dup_marker, - mapq_thresh = mapq_thresh, - no_dup_removal = no_dup_removal, - mito_chr_name = mito_chr_name, - - cpu = filter_cpu, - mem_mb = filter_mem_mb, - time_hr = filter_time_hr, - disks = filter_disks, + # if there are pr1 TAs for ALL replicates then pool them + Boolean has_all_inputs_of_pool_ta_pr1 = length(select_all(spr.ta_pr1))==num_rep + if ( has_all_inputs_of_pool_ta_pr1 && num_rep>1 && !align_only && !true_rep_only ) { + # pool tagaligns from pseudo replicate 1 + call pool_ta as pool_ta_pr1 { input : + tas = spr.ta_pr1, } } - Array[File] ctl_nodup_bams_ = flatten([filter_ctl.nodup_bam, ctl_nodup_bams]) - scatter(bam in if need_to_process_ctl_nodup_bam then ctl_nodup_bams_ else []) { - # convert bam to tagalign and subsample it if necessary - call bam2ta as bam2ta_ctl { input : - bam = bam, - paired_end = if defined(ctl_paired_end) then ctl_paired_end else paired_end, - subsample = ctl_subsample_reads, - regex_grep_v_ta = regex_filter_reads, - mito_chr_name = mito_chr_name, - - cpu = 
bam2ta_cpu, - mem_mb = bam2ta_mem_mb, - time_hr = bam2ta_time_hr, - disks = bam2ta_disks, + # if there are pr2 TAs for ALL replicates then pool them + Boolean has_all_inputs_of_pool_ta_pr2 = length(select_all(spr.ta_pr2))==num_rep + if ( has_all_inputs_of_pool_ta_pr2 && num_rep>1 && !align_only && !true_rep_only ) { + # pool tagaligns from pseudo replicate 2 + call pool_ta as pool_ta_pr2 { input : + tas = spr.ta_pr2, } } - Array[String] ctl_tas_ = if align_only then [] else flatten([bam2ta_ctl.ta, ctl_tas]) - if ( length(ctl_tas_)>0 && !defined(ctl_ta_pooled) ) { + # if there are CTL TAs for ALL replicates then pool them + Boolean has_all_inputs_of_pool_ta_ctl = length(select_all(ctl_ta_))==num_ctl + if ( has_all_inputs_of_pool_ta_ctl && num_ctl>1 ) { # pool tagaligns from true replicates call pool_ta as pool_ta_ctl { input : - tas = ctl_tas_, + tas = ctl_ta_, + } + } + + Boolean has_input_of_count_signal_track_pooled = defined(pool_ta.ta_pooled) + if ( has_input_of_count_signal_track_pooled && enable_count_signal_track && num_rep>1 ) { + call count_signal_track as count_signal_track_pooled { input : + ta = pool_ta.ta_pooled, + chrsz = chrsz_, } } - if ( !disable_fingerprint && length(nodup_bams_)>0 && length(ctl_nodup_bams_)>0 && basename(blacklist)!='null' && length(jsd_qcs)<1 ) { + Boolean has_input_of_fingerprint = defined(blacklist_) && #basename(blacklist_) != 'null' && + length(select_all(nodup_bam_))==num_rep && + num_ctl>0 && defined(ctl_nodup_bam_[0]) + if ( has_input_of_fingerprint && !disable_fingerprint ) { # fingerprint and JS-distance plot call fingerprint { input : - nodup_bams = nodup_bams_, - ctl_bam = ctl_nodup_bams_[0], # use first control only - blacklist = blacklist, + nodup_bams = nodup_bam_, + ctl_bam = ctl_nodup_bam_[0], # use first control only + blacklist = blacklist_, cpu = fingerprint_cpu, mem_mb = fingerprint_mem_mb, @@ -524,71 +632,41 @@ workflow chip { } } - if ( length(tas__)>0 && length(ctl_tas_)>0 ) { + Boolean 
has_all_input_of_choose_ctl = length(select_all(ta_))==num_rep + && length(select_all(ctl_ta_))==num_ctl && num_ctl > 0 + if ( has_all_input_of_choose_ctl ) { # choose appropriate control for each exp IP replicate # outputs: # choose_ctl.idx : control replicate index for each exp replicate # -1 means pooled ctl replicate call choose_ctl { input: - tas = tas__, - ctl_tas = ctl_tas_, + tas = ta_, + ctl_tas = ctl_ta_, ta_pooled = pool_ta.ta_pooled, - ctl_ta_pooled = if !defined(ctl_ta_pooled) then pool_ta_ctl.ta_pooled else ctl_ta_pooled, + ctl_ta_pooled = pool_ta_ctl.ta_pooled, always_use_pooled_ctl = always_use_pooled_ctl, ctl_depth_ratio = ctl_depth_ratio, } } - # before peak calling, get fragment length from xcor analysis or given input - # if fraglen [] is defined in the input JSON, fraglen from xcor will be ignored - Array[Int] fraglen_ = if align_only then [] - else if length(fraglen)>0 then fraglen - else xcor.fraglen # make control ta array [[1,2,3,4]] -> [[1],[2],[3],[4]], will be zipped with exp ta array latter - Array[Array[File]] chosen_ctl_tas = if length(tas__)<1 || length(ctl_tas_)<1 then [[],[],[],[],[],[]] - else transpose(select_all([choose_ctl.chosen_ctl_tas])) + Array[Array[File]] chosen_ctl_tas = + if has_all_input_of_choose_ctl then transpose(select_all([choose_ctl.chosen_ctl_tas])) + else [[],[],[],[],[],[],[],[],[],[]] # we have all tas and ctl_tas (optional for histone chipseq) ready, let's call peaks - scatter(i in range(length(tas__))) { - # always call MACS2 peaks for true replicates to get signal tracks - # call peaks on tagalign - call macs2 { input : - tas = flatten([[tas__[i]], chosen_ctl_tas[i]]), - gensz = gensz, - chrsz = chrsz, - cap_num_peak = macs2_cap_num_peak, - pval_thresh = pval_thresh, - fraglen = fraglen_[i], - blacklist = blacklist, - keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - - mem_mb = macs2_mem_mb, - disks = macs2_disks, - time_hr = macs2_time_hr, - } - call macs2_signal_track { input : - tas 
= flatten([[tas__[i]], chosen_ctl_tas[i]]), - gensz = gensz, - chrsz = chrsz, - pval_thresh = pval_thresh, - fraglen = fraglen_[i], - - mem_mb = macs2_mem_mb, - disks = macs2_disks, - time_hr = macs2_time_hr, - } - } - - # SPP cannot call peaks without controls - if ( peak_caller_=='spp' ) { - scatter(i in range(length(tas__))) { - # call peaks on tagalign + #scatter(i in range(length(tas__))) { + scatter(i in range(num_rep)) { + Boolean has_input_of_peak_call = defined(ta_[i]) + Boolean has_output_of_peak_call = i 1 ) { + # rounded mean of fragment length, which will be used for + # 1) calling peaks for pooled true/pseudo replicates + # 2) calculating FRiP + call rounded_mean as fraglen_mean { input : + ints = fraglen_, + } } # actually not an array - Array[File] chosen_ctl_ta_pooled = if length(tas__)<2 || length(ctl_tas_)<1 then [] - else if length(ctl_tas_)<2 then [ctl_tas_[0]] # choose first (only) control - else if defined(ctl_ta_pooled) then select_all([ctl_ta_pooled]) # choose pooled control + Array[File?] 
chosen_ctl_ta_pooled = if !has_all_input_of_choose_ctl then [] + else if num_ctl < 2 then [ctl_ta_[0]] # choose first (only) control else select_all([pool_ta_ctl.ta_pooled]) # choose pooled control - if ( length(tas__)>1 ) { + Boolean has_input_of_peak_caller_pooled = defined(pool_ta.ta_pooled) + Boolean has_output_of_peak_caller_pooled = defined(peak_pooled) + if ( has_input_of_peak_caller_pooled && !has_output_of_peak_caller_pooled && + peak_caller_=='macs2' && !align_only && num_rep>1 ) { # call peaks on pooled replicate # always call MACS2 peaks for pooled replicate to get signal tracks call macs2 as macs2_pooled { input : tas = flatten([select_all([pool_ta.ta_pooled]), chosen_ctl_ta_pooled]), - gensz = gensz, - chrsz = chrsz, + gensz = gensz_, + chrsz = chrsz_, cap_num_peak = macs2_cap_num_peak, pval_thresh = pval_thresh, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, + blacklist = blacklist_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - mem_mb = macs2_mem_mb, - disks = macs2_disks, - time_hr = macs2_time_hr, - } - call macs2_signal_track as macs2_signal_track_pooled { input : - tas = flatten([select_all([pool_ta.ta_pooled]), chosen_ctl_ta_pooled]), - gensz = gensz, - chrsz = chrsz, - pval_thresh = pval_thresh, - fraglen = fraglen_mean.rounded_mean, - mem_mb = macs2_mem_mb, disks = macs2_disks, time_hr = macs2_time_hr, } } - if ( length(tas__)>1 && peak_caller_=='spp' ) { + if ( has_input_of_peak_caller_pooled && !has_output_of_peak_caller_pooled && + peak_caller_=='spp' && !align_only && num_rep>1 ) { # call peaks on pooled replicate call spp as spp_pooled { input : tas = flatten([select_all([pool_ta.ta_pooled]), chosen_ctl_ta_pooled]), - chrsz = chrsz, + chrsz = chrsz_, cap_num_peak = spp_cap_num_peak, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, + blacklist = blacklist_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, cpu = spp_cpu, @@ -724,34 +846,38 @@ workflow chip { time_hr = 
spp_time_hr, } } + File? peak_pooled_ = if has_output_of_peak_caller_pooled then peak_pooled + else if peak_caller_=='spp' then spp_pooled.rpeak + else macs2_pooled.npeak - if ( !true_rep_only && length(tas__)>1 && peak_caller_=='macs2' ) { - # call peaks on 1st pooled pseudo replicates - call macs2 as macs2_ppr1 { input : - tas = flatten([select_all([pool_ta_pr1.ta_pooled]), chosen_ctl_ta_pooled]), - gensz = gensz, - chrsz = chrsz, - cap_num_peak = macs2_cap_num_peak, + # macs2 signal track for pooled rep + if ( has_input_of_peak_caller_pooled && !align_only && num_rep>1 ) { + call macs2_signal_track as macs2_signal_track_pooled { input : + tas = flatten([select_all([pool_ta.ta_pooled]), chosen_ctl_ta_pooled]), + gensz = gensz_, + chrsz = chrsz_, pval_thresh = pval_thresh, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, - keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, mem_mb = macs2_mem_mb, disks = macs2_disks, time_hr = macs2_time_hr, } } - if ( !true_rep_only && length(tas__)>1 && peak_caller_=='macs2' ) { - # call peaks on 2nd pooled pseudo replicates - call macs2 as macs2_ppr2 { input : - tas = flatten([select_all([pool_ta_pr2.ta_pooled]), chosen_ctl_ta_pooled]), - gensz = gensz, - chrsz = chrsz, + + Boolean has_input_of_peak_caller_ppr1 = defined(pool_ta_pr1.ta_pooled) + Boolean has_output_of_peak_caller_ppr1 = defined(peak_ppr1) + if ( has_input_of_peak_caller_ppr1 && !has_output_of_peak_caller_ppr1 && + peak_caller_=='macs2' && !align_only && !true_rep_only && num_rep>1 ) { + # call peaks on 1st pooled pseudo replicates + call macs2 as macs2_ppr1 { input : + tas = flatten([select_all([pool_ta_pr1.ta_pooled]), chosen_ctl_ta_pooled]), + gensz = gensz_, + chrsz = chrsz_, cap_num_peak = macs2_cap_num_peak, pval_thresh = pval_thresh, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, + blacklist = blacklist_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, mem_mb = macs2_mem_mb, @@ -759,14 
+885,15 @@ workflow chip { time_hr = macs2_time_hr, } } - if ( !true_rep_only && length(tas__)>1 && peak_caller_=='spp' ) { + if ( has_input_of_peak_caller_ppr1 && !has_output_of_peak_caller_ppr1 && + peak_caller_=='spp' && !align_only && !true_rep_only && num_rep>1 ) { # call peaks on 1st pooled pseudo replicates call spp as spp_ppr1 { input : tas = flatten([select_all([pool_ta_pr1.ta_pooled]), chosen_ctl_ta_pooled]), - chrsz = chrsz, + chrsz = chrsz_, cap_num_peak = spp_cap_num_peak, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, + blacklist = blacklist_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, cpu = spp_cpu, @@ -775,14 +902,39 @@ workflow chip { time_hr = spp_time_hr, } } - if ( !true_rep_only && length(tas__)>1 && peak_caller_=='spp' ) { + File? peak_ppr1_ = if has_output_of_peak_caller_ppr1 then peak_ppr1 + else if peak_caller_=='spp' then spp_ppr1.rpeak + else macs2_ppr1.npeak + + Boolean has_input_of_peak_caller_ppr2 = defined(pool_ta_pr2.ta_pooled) + Boolean has_output_of_peak_caller_ppr2 = defined(peak_ppr2) + if ( has_input_of_peak_caller_ppr2 && !has_output_of_peak_caller_ppr2 && + peak_caller_=='macs2' && !align_only && !true_rep_only && num_rep>1 ) { + # call peaks on 2nd pooled pseudo replicates + call macs2 as macs2_ppr2 { input : + tas = flatten([select_all([pool_ta_pr2.ta_pooled]), chosen_ctl_ta_pooled]), + gensz = gensz_, + chrsz = chrsz_, + cap_num_peak = macs2_cap_num_peak, + pval_thresh = pval_thresh, + fraglen = fraglen_mean.rounded_mean, + blacklist = blacklist_, + keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, + + mem_mb = macs2_mem_mb, + disks = macs2_disks, + time_hr = macs2_time_hr, + } + } + if ( has_input_of_peak_caller_ppr2 && !has_output_of_peak_caller_ppr2 && + peak_caller_=='spp' && !align_only && !true_rep_only && num_rep>1 ) { # call peaks on 2nd pooled pseudo replicates call spp as spp_ppr2 { input : tas = flatten([select_all([pool_ta_pr2.ta_pooled]), 
chosen_ctl_ta_pooled]), - chrsz = chrsz, + chrsz = chrsz_, cap_num_peak = spp_cap_num_peak, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, + blacklist = blacklist_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, cpu = spp_cpu, @@ -791,152 +943,134 @@ workflow chip { time_hr = spp_time_hr, } } + File? peak_ppr2_ = if has_output_of_peak_caller_ppr2 then peak_ppr2 + else if peak_caller_=='spp' then spp_ppr2.rpeak + else macs2_ppr2.npeak - # make peak arrays - Array[File] peaks_ = if align_only then [] - else if peak_caller_=='spp' then flatten(select_all([spp.rpeak, peaks])) - else if peak_caller_=='macs2' then flatten([macs2.npeak, peaks]) - else [] + # do IDR/overlap on all pairs of two replicates (i,j) + # where i and j are zero-based indices and 0 <= i < j < num_rep + Array[Pair[Int, Int]] pairs_ = cross(range(num_rep),range(num_rep)) + scatter( pair in pairs_ ) { + Pair[Int, Int]? null_pair + Pair[Int, Int]? pairs__ = if pair.left0 ) { - scatter( pair in peak_pairs ) { + if ( !align_only ) { + scatter( pair in pairs ) { + # pair.left = 0-based index of 1st replicate + # pair.right = 0-based index of 2nd replicate # Naive overlap on every pair of true replicates call overlap { input : - prefix = pair.left, - peak1 = pair.right[0], - peak2 = pair.right[1], - peak_pooled = select_first([spp_pooled.rpeak, macs2_pooled.npeak, peak_pooled]), + prefix = 'rep'+(pair.left+1)+"_rep"+(pair.right+1), + peak1 = peak_[pair.left], + peak2 = peak_[pair.right], + peak_pooled = peak_pooled_, fraglen = fraglen_mean.rounded_mean, peak_type = peak_type, - blacklist = blacklist, - chrsz = chrsz, + blacklist = blacklist_, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + ta = pool_ta.ta_pooled, } } } - if ( length(peaks_)>0 && enable_idr ) { - scatter( pair in peak_pairs ) { + + if ( enable_idr && !align_only ) { + scatter( pair in pairs ) { + 
# pair.left = 0-based index of 1st replicate + # pair.right = 0-based index of 2nd replicate # IDR on every pair of true replicates - call idr { input : - prefix = pair.left, - peak1 = pair.right[0], - peak2 = pair.right[1], - peak_pooled = select_first([spp_pooled.rpeak, macs2_pooled.npeak, peak_pooled]), + call idr { input : + prefix = 'rep'+(pair.left+1)+"_rep"+(pair.right+1), + peak1 = peak_[pair.left], + peak2 = peak_[pair.right], + peak_pooled = peak_pooled_, + fraglen = fraglen_mean.rounded_mean, idr_thresh = idr_thresh, peak_type = peak_type, - fraglen = fraglen_mean.rounded_mean, rank = idr_rank, - blacklist = blacklist, - chrsz = chrsz, + blacklist = blacklist_, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + ta = pool_ta.ta_pooled, } } } - Array[File] peaks_pr1_ = flatten(select_all([spp_pr1.rpeak, macs2_pr1.npeak, peaks_pr1])) - Array[File] peaks_pr2_ = flatten(select_all([spp_pr2.rpeak, macs2_pr2.npeak, peaks_pr2])) - - #Array[File] peaks_pr1_ = if align_only then [] - # else if peak_caller=='spp' then flatten(select_all([spp_pr1.rpeak, peaks_pr1])) - # else if peak_caller=='macs2' then flatten(select_all([macs2_pr1.npeak, peaks_pr1])) - # else [] - #Array[File] peaks_pr2_ = if align_only then [] - # else if peak_caller=='spp' then flatten(select_all([spp_pr2.rpeak, peaks_pr2])) - # else if peak_caller=='macs2' then flatten(select_all([macs2_pr2.npeak, peaks_pr2])) - # else [] - - scatter( i in range(length(peaks_pr1_)) ) { - # Naive overlap on pseduo replicates - call overlap as overlap_pr { input : - prefix = "rep"+(i+1)+"-pr", - peak1 = peaks_pr1_[i], - peak2 = peaks_pr2_[i], - peak_pooled = peaks_[i], - fraglen = fraglen_[i], - peak_type = peak_type, - blacklist = blacklist, - chrsz = chrsz, - keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if length(tas_)>0 then tas_[i] else if defined(ta_pooled) then ta_pooled 
else pool_ta.ta_pooled, + # overlap on pseudo-replicates (pr1, pr2) for each true replicate + scatter( i in range(num_rep) ) { + if ( !align_only && !true_rep_only ) { + call overlap as overlap_pr { input : + prefix = "rep"+(i+1)+"-pr", + peak1 = peak_pr1_[i], + peak2 = peak_pr2_[i], + peak_pooled = peak_[i], + fraglen = fraglen_[i], + peak_type = peak_type, + blacklist = blacklist_, + chrsz = chrsz_, + keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, + ta = ta_[i], + } } } - if ( enable_idr ) { - scatter( i in range(length(peaks_pr1_)) ) { + + scatter( i in range(num_rep) ) { + if ( !align_only && !true_rep_only && enable_idr ) { # IDR on pseduo replicates - call idr as idr_pr { input : + call idr as idr_pr { input : prefix = "rep"+(i+1)+"-pr", - peak1 = peaks_pr1_[i], - peak2 = peaks_pr2_[i], - peak_pooled = peaks_[i], + peak1 = peak_pr1_[i], + peak2 = peak_pr2_[i], + peak_pooled = peak_[i], fraglen = fraglen_[i], idr_thresh = idr_thresh, peak_type = peak_type, rank = idr_rank, - blacklist = blacklist, - chrsz = chrsz, + blacklist = blacklist_, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if length(tas_)>0 then tas_[i] else if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + ta = ta_[i], } } } - if ( length(peaks_pr1_)>1 ) { + + if ( !align_only && !true_rep_only && num_rep>1 ) { # Naive overlap on pooled pseudo replicates - call overlap as overlap_ppr { input : + call overlap as overlap_ppr { input : prefix = "ppr", - peak1 = select_first([spp_ppr1.rpeak, macs2_ppr1.npeak, peak_ppr1]), #peak_ppr1_[0], - peak2 = select_first([spp_ppr2.rpeak, macs2_ppr2.npeak, peak_ppr2]), #peak_ppr2_[0], - peak_pooled = select_first([spp_pooled.rpeak, macs2_pooled.npeak, peak_pooled]), + peak1 = peak_ppr1_, + peak2 = peak_ppr2_, + peak_pooled = peak_pooled_, peak_type = peak_type, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, - chrsz = chrsz, + blacklist = blacklist_, + chrsz = chrsz_, 
keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + ta = pool_ta.ta_pooled, } } - if ( enable_idr && length(peaks_pr1_)>1 ) { + + if ( !align_only && !true_rep_only && num_rep>1 ) { # IDR on pooled pseduo replicates - call idr as idr_ppr { input : + call idr as idr_ppr { input : prefix = "ppr", - peak1 = select_first([spp_ppr1.rpeak, macs2_ppr1.npeak, peak_ppr1]), #peak_ppr1_[0], - peak2 = select_first([spp_ppr2.rpeak, macs2_ppr2.npeak, peak_ppr2]), #peak_ppr2_[0], - peak_pooled = select_first([spp_pooled.rpeak, macs2_pooled.npeak, peak_pooled]), + peak1 = peak_ppr1_, + peak2 = peak_ppr2_, + peak_pooled = peak_pooled_, idr_thresh = idr_thresh, peak_type = peak_type, - rank = idr_rank, fraglen = fraglen_mean.rounded_mean, - blacklist = blacklist, - chrsz = chrsz, + rank = idr_rank, + blacklist = blacklist_, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + ta = pool_ta.ta_pooled, } } + # reproducibility QC for overlap/IDR peaks if ( !align_only && !true_rep_only ) { # reproducibility QC for overlapping peaks call reproducibility as reproducibility_overlap { input : @@ -945,10 +1079,11 @@ workflow chip { peaks_pr = overlap_pr.bfilt_overlap_peak, peak_ppr = overlap_ppr.bfilt_overlap_peak, peak_type = peak_type, - chrsz = chrsz, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, } } + if ( !align_only && !true_rep_only && enable_idr ) { # reproducibility QC for IDR peaks call reproducibility as reproducibility_idr { input : @@ -957,72 +1092,53 @@ workflow chip { peaks_pr = idr_pr.bfilt_idr_peak, peak_ppr = idr_ppr.bfilt_idr_peak, peak_type = peak_type, - chrsz = chrsz, + chrsz = chrsz_, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, } } - Array[File] flagstat_qcs_ = flatten([flagstat_qcs, bwa.flagstat_qc]) - Array[File] 
pbc_qcs_ = flatten([pbc_qcs, filter.pbc_qc]) - Array[File] dup_qcs_ = flatten([dup_qcs, filter.dup_qc]) - Array[File] nodup_flagstat_qcs_ = flatten([nodup_flagstat_qcs, filter.flagstat_qc]) - - Array[File] ctl_flagstat_qcs_ = flatten([ctl_flagstat_qcs, bwa_ctl.flagstat_qc]) - Array[File] ctl_pbc_qcs_ = flatten([ctl_pbc_qcs, filter_ctl.pbc_qc]) - Array[File] ctl_dup_qcs_ = flatten([ctl_dup_qcs, filter_ctl.dup_qc]) - Array[File] ctl_nodup_flagstat_qcs_ = flatten([ctl_nodup_flagstat_qcs, filter_ctl.flagstat_qc]) - - Array[File] xcor_plots_ = flatten([xcor_plots, xcor.plot_png]) - Array[File] xcor_scores_ = flatten([xcor_scores, xcor.score]) - - Array[File] macs2_frip_qcs_ = flatten([macs2_frip_qcs, macs2.frip_qc]) - Array[File] macs2_pr1_frip_qcs_ = flatten(select_all([macs2_pr1_frip_qcs, macs2_pr1.frip_qc])) - Array[File] macs2_pr2_frip_qcs_ = flatten(select_all([macs2_pr2_frip_qcs, macs2_pr2.frip_qc])) - Array[File] spp_frip_qcs_ = flatten(select_all([spp_frip_qcs, spp.frip_qc])) - Array[File] spp_pr1_frip_qcs_ = flatten(select_all([spp_pr1_frip_qcs, spp_pr1.frip_qc])) - Array[File] spp_pr2_frip_qcs_ = flatten(select_all([spp_pr2_frip_qcs, spp_pr2.frip_qc])) - # Generate final QC report and JSON call qc_report { input : pipeline_ver = pipeline_ver, title = title, description = description, - genome = basename(genome_tsv), - paired_end = paired_end, - ctl_paired_end = if defined(ctl_paired_end) then ctl_paired_end else paired_end, + genome = basename(select_first([genome_tsv, ref_fa_, chrsz_, 'None'])), + paired_ends = paired_end_, + ctl_paired_ends = ctl_paired_end_, pipeline_type = pipeline_type, peak_caller = peak_caller_, macs2_cap_num_peak = macs2_cap_num_peak, spp_cap_num_peak = spp_cap_num_peak, idr_thresh = idr_thresh, - flagstat_qcs = flagstat_qcs_, - nodup_flagstat_qcs = nodup_flagstat_qcs_, - dup_qcs = dup_qcs_, - pbc_qcs = pbc_qcs_, - ctl_flagstat_qcs = ctl_flagstat_qcs_, - ctl_nodup_flagstat_qcs = ctl_nodup_flagstat_qcs_, - ctl_dup_qcs = ctl_dup_qcs_, - 
ctl_pbc_qcs = ctl_pbc_qcs_, - xcor_plots = xcor_plots_, - xcor_scores = xcor_scores_, - - jsd_plot = if length(jsd_qcs)>0 then jsd_plot else fingerprint.plot, - jsd_qcs = if length(jsd_qcs)>0 then jsd_qcs else select_first([fingerprint.jsd_qcs,[]]), - - frip_macs2_qcs = macs2_frip_qcs_, - frip_macs2_qcs_pr1 = macs2_pr1_frip_qcs_, - frip_macs2_qcs_pr2 = macs2_pr2_frip_qcs_, - frip_macs2_qc_pooled = if defined(macs2_pooled_frip_qc_) then macs2_pooled_frip_qc_ else macs2_pooled.frip_qc, - frip_macs2_qc_ppr1 = if defined(macs2_ppr1_frip_qc_) then macs2_ppr1_frip_qc_ else macs2_ppr1.frip_qc, - frip_macs2_qc_ppr2 = if defined(macs2_ppr2_frip_qc_) then macs2_ppr2_frip_qc_ else macs2_ppr2.frip_qc, - - frip_spp_qcs = spp_frip_qcs_, - frip_spp_qcs_pr1 = spp_pr1_frip_qcs_, - frip_spp_qcs_pr2 = spp_pr2_frip_qcs_, - frip_spp_qc_pooled = if defined(spp_pooled_frip_qc_) then spp_pooled_frip_qc_ else spp_pooled.frip_qc, - frip_spp_qc_ppr1 = if defined(spp_ppr1_frip_qc_) then spp_ppr1_frip_qc_ else spp_ppr1.frip_qc, - frip_spp_qc_ppr2 = if defined(spp_ppr2_frip_qc_) then spp_ppr2_frip_qc_ else spp_ppr2.frip_qc, + flagstat_qcs = bwa.flagstat_qc, + nodup_flagstat_qcs = filter.flagstat_qc, + dup_qcs = filter.dup_qc, + pbc_qcs = filter.pbc_qc, + xcor_plots = xcor.plot_png, + xcor_scores = xcor.score, + + ctl_flagstat_qcs = bwa_ctl.flagstat_qc, + ctl_nodup_flagstat_qcs = filter_ctl.flagstat_qc, + ctl_dup_qcs = filter_ctl.dup_qc, + ctl_pbc_qcs = filter_ctl.pbc_qc, + + jsd_plot = fingerprint.plot, + jsd_qcs = fingerprint.jsd_qcs, + + frip_macs2_qcs = macs2.frip_qc, + frip_macs2_qcs_pr1 = macs2_pr1.frip_qc, + frip_macs2_qcs_pr2 = macs2_pr2.frip_qc, + frip_macs2_qc_pooled = macs2_pooled.frip_qc, + frip_macs2_qc_ppr1 = macs2_ppr1.frip_qc, + frip_macs2_qc_ppr2 = macs2_ppr2.frip_qc, + + frip_spp_qcs = spp.frip_qc, + frip_spp_qcs_pr1 = spp_pr1.frip_qc, + frip_spp_qcs_pr2 = spp_pr2.frip_qc, + frip_spp_qc_pooled = spp_pooled.frip_qc, + frip_spp_qc_ppr1 = spp_ppr1.frip_qc, + frip_spp_qc_ppr2 = 
spp_ppr2.frip_qc, idr_plots = idr.idr_plot, idr_plots_pr = idr_pr.idr_plot, @@ -1041,27 +1157,26 @@ workflow chip { File report = qc_report.report File qc_json = qc_report.qc_json Boolean qc_json_ref_match = qc_report.qc_json_ref_match - } + } } task merge_fastq { # merge trimmed fastqs - Array[Array[File]] fastqs # [merge_id][read_end_id] + Array[File] fastqs_R1 # [merge_id] + Array[File] fastqs_R2 Boolean paired_end + File? null_f + Array[Array[File]] tmp_fastqs = if paired_end then transpose([fastqs_R1, fastqs_R2]) + else transpose([fastqs_R1]) command { python $(which encode_merge_fastq.py) \ - ${write_tsv(fastqs)} \ + ${write_tsv(tmp_fastqs)} \ ${if paired_end then "--paired-end" else ""} \ ${"--nth " + 1} } output { - # WDL glob() globs in an alphabetical order - # so R1 and R2 can be switched, which results in an - # unexpected behavior of a workflow - # so we prepend merge_fastqs_'end'_ (R1 or R2) - # to the basename of original filename - # this prefix will be later stripped in bwa task - Array[File] merged_fastqs = glob("merge_fastqs_R?_*.fastq.gz") + File merged_fastq_R1 = glob("R1/*.fastq.gz")[0] + File? merged_fastq_R2 = if paired_end then glob("R2/*.fastq.gz")[0] else null_f } runtime { cpu : 1 @@ -1092,8 +1207,9 @@ task trim_fastq { # trim fastq (for PE R1 only) } task bwa { - File idx_tar # reference bwa index tar - Array[File] fastqs # [read_end_id] + File bwa_idx_tar # reference bwa index tar + File? fastq_R1 # [read_end_id] + File? fastq_R2 Boolean paired_end Boolean use_bwa_mem_for_pe @@ -1104,8 +1220,8 @@ task bwa { command { python $(which encode_bwa.py) \ - ${idx_tar} \ - ${sep=' ' fastqs} \ + ${bwa_idx_tar} \ + ${fastq_R1} ${fastq_R2} \ ${if paired_end then "--paired-end" else ""} \ ${if use_bwa_mem_for_pe then "--use-bwa-mem-for-pe" else ""} \ ${"--nth " + cpu} @@ -1224,7 +1340,7 @@ task spr { # make two self pseudo replicates } task pool_ta { - Array[File] tas + Array[File?] 
tas command { python $(which encode_pool_ta.py) \ @@ -1284,7 +1400,7 @@ task xcor { } task fingerprint { - Array[File] nodup_bams + Array[File?] nodup_bams File ctl_bam # one control bam is required File blacklist @@ -1313,8 +1429,8 @@ task fingerprint { } task choose_ctl { - Array[File] tas - Array[File] ctl_tas + Array[File?] tas + Array[File?] ctl_tas File? ta_pooled File? ctl_ta_pooled Boolean always_use_pooled_ctl # always use pooled control for all exp rep. @@ -1362,7 +1478,7 @@ task count_signal_track { } task macs2 { - Array[File] tas # [ta, control_ta]. control_ta is optional + Array[File?] tas # [ta, control_ta]. control_ta is optional Int fraglen # fragment length from xcor String gensz # Genome size (sum of entries in 2nd column of # chr. sizes file, or hs for human, ms for mouse) @@ -1391,7 +1507,8 @@ task macs2 { File npeak = glob("*[!.][!b][!f][!i][!l][!t].narrowPeak.gz")[0] File bfilt_npeak = glob("*.bfilt.narrowPeak.gz")[0] File bfilt_npeak_bb = glob("*.bfilt.narrowPeak.bb")[0] - Array[File] bfilt_npeak_hammock = glob("*.bfilt.narrowPeak.hammock.gz*") + File bfilt_npeak_hammock = glob("*.bfilt.narrowPeak.hammock.gz*")[0] + File bfilt_npeak_hammock_tbi = glob("*.bfilt.narrowPeak.hammock.gz*")[1] File frip_qc = glob("*.frip.qc")[0] } runtime { @@ -1403,7 +1520,7 @@ task macs2 { } task macs2_signal_track { - Array[File] tas # [ta, control_ta]. control_ta is optional + Array[File?] tas # [ta, control_ta]. control_ta is optional Int fraglen # fragment length from xcor String gensz # Genome size (sum of entries in 2nd column of # chr. sizes file, or hs for human, ms for mouse) @@ -1435,7 +1552,7 @@ task macs2_signal_track { } task spp { - Array[File] tas # [ta, control_ta]. control_ta is always required + Array[File?] tas # [ta, control_ta]. 
control_ta is always required Int fraglen # fragment length from xcor File chrsz # 2-col chromosome sizes file Int cap_num_peak # cap number of raw peaks called from MACS2 @@ -1461,7 +1578,8 @@ task spp { File rpeak = glob("*[!.][!b][!f][!i][!l][!t].regionPeak.gz")[0] File bfilt_rpeak = glob("*.bfilt.regionPeak.gz")[0] File bfilt_rpeak_bb = glob("*.bfilt.regionPeak.bb")[0] - Array[File] bfilt_rpeak_hammock = glob("*.bfilt.regionPeak.hammock.gz*") + File bfilt_rpeak_hammock = glob("*.bfilt.regionPeak.hammock.gz*")[0] + File bfilt_rpeak_hammock_tbi = glob("*.bfilt.regionPeak.hammock.gz*")[1] File frip_qc = glob("*.frip.qc")[0] } runtime { @@ -1489,7 +1607,7 @@ task idr { String rank command { - ${if defined(ta) then "" else "touch null.frip.qc"} + ${if defined(ta) then "" else "touch null.frip.qc"} touch null python $(which encode_idr.py) \ ${peak1} ${peak2} ${peak_pooled} \ @@ -1507,7 +1625,8 @@ task idr { File idr_peak = glob("*[!.][!b][!f][!i][!l][!t]."+peak_type+".gz")[0] File bfilt_idr_peak = glob("*.bfilt."+peak_type+".gz")[0] File bfilt_idr_peak_bb = glob("*.bfilt."+peak_type+".bb")[0] - Array[File] bfilt_idr_peak_hammock = glob("*.bfilt."+peak_type+".hammock.gz*") + File bfilt_idr_peak_hammock = glob("*.bfilt."+peak_type+".hammock.gz*")[0] + File bfilt_idr_peak_hammock_tbi = glob("*.bfilt."+peak_type+".hammock.gz*")[1] File idr_plot = glob("*.txt.png")[0] File idr_unthresholded_peak = glob("*.txt.gz")[0] File idr_log = glob("*.idr*.log")[0] @@ -1535,7 +1654,7 @@ task overlap { String peak_type command { - ${if defined(ta) then "" else "touch null.frip.qc"} + ${if defined(ta) then "" else "touch null.frip.qc"} touch null python $(which encode_naive_overlap.py) \ ${peak1} ${peak2} ${peak_pooled} \ @@ -1552,7 +1671,8 @@ task overlap { File overlap_peak = glob("*[!.][!b][!f][!i][!l][!t]."+peak_type+".gz")[0] File bfilt_overlap_peak = glob("*.bfilt."+peak_type+".gz")[0] File bfilt_overlap_peak_bb = glob("*.bfilt."+peak_type+".bb")[0] - Array[File] 
bfilt_overlap_peak_hammock = glob("*.bfilt."+peak_type+".hammock.gz*") + File bfilt_overlap_peak_hammock = glob("*.bfilt."+peak_type+".hammock.gz*")[0] + File bfilt_overlap_peak_hammock_tbi = glob("*.bfilt."+peak_type+".hammock.gz*")[1] File frip_qc = if defined(ta) then glob("*.frip.qc")[0] else glob("null")[0] } runtime { @@ -1569,7 +1689,7 @@ task reproducibility { # in a sorted order. for example of 4 replicates, # 1,2 1,3 1,4 2,3 2,4 3,4. # x,y means peak file from rep-x vs rep-y - Array[File] peaks_pr # peak files from pseudo replicates + Array[File?] peaks_pr # peak files from pseudo replicates File? peak_ppr # Peak file from pooled pseudo replicate. String peak_type File chrsz # 2-col chromosome sizes file @@ -1590,8 +1710,10 @@ task reproducibility { File conservative_peak = glob("conservative_peak.*.gz")[0] File optimal_peak_bb = glob("optimal_peak.*.bb")[0] File conservative_peak_bb = glob("conservative_peak.*.bb")[0] - Array[File] optimal_peak_hammock = glob("optimal_peak.*.hammock.gz*") - Array[File] conservative_peak_hammock = glob("conservative_peak.*.hammock.gz*") + File optimal_peak_hammock = glob("optimal_peak.*.hammock.gz*")[0] + File optimal_peak_hammock_tbi = glob("optimal_peak.*.hammock.gz*")[1] + File conservative_peak_hammock = glob("conservative_peak.*.hammock.gz*")[0] + File conservative_peak_hammock_tbi = glob("conservative_peak.*.hammock.gz*")[1] File reproducibility_qc = glob("*reproducibility.qc")[0] } runtime { @@ -1613,46 +1735,46 @@ task qc_report { String? genome #String? encode_accession_id # ENCODE accession ID of sample # workflow params - Boolean paired_end - Boolean ctl_paired_end + Array[Boolean?] paired_ends + Array[Boolean?] ctl_paired_ends String pipeline_type String peak_caller Int? macs2_cap_num_peak Int? spp_cap_num_peak Float idr_thresh # QCs - Array[File]? flagstat_qcs - Array[File]? nodup_flagstat_qcs - Array[File]? dup_qcs - Array[File]? pbc_qcs - Array[File]? ctl_flagstat_qcs - Array[File]? 
ctl_nodup_flagstat_qcs - Array[File]? ctl_dup_qcs - Array[File]? ctl_pbc_qcs - Array[File]? xcor_plots - Array[File]? xcor_scores + Array[File?] flagstat_qcs + Array[File?] nodup_flagstat_qcs + Array[File?] dup_qcs + Array[File?] pbc_qcs + Array[File?] ctl_flagstat_qcs + Array[File?] ctl_nodup_flagstat_qcs + Array[File?] ctl_dup_qcs + Array[File?] ctl_pbc_qcs + Array[File?] xcor_plots + Array[File?] xcor_scores File? jsd_plot - Array[File]? jsd_qcs - Array[File]? idr_plots - Array[File]? idr_plots_pr + Array[File?] jsd_qcs + Array[File?] idr_plots + Array[File?] idr_plots_pr File? idr_plot_ppr - Array[File]? frip_macs2_qcs - Array[File]? frip_macs2_qcs_pr1 - Array[File]? frip_macs2_qcs_pr2 + Array[File?] frip_macs2_qcs + Array[File?] frip_macs2_qcs_pr1 + Array[File?] frip_macs2_qcs_pr2 File? frip_macs2_qc_pooled File? frip_macs2_qc_ppr1 File? frip_macs2_qc_ppr2 - Array[File]? frip_spp_qcs - Array[File]? frip_spp_qcs_pr1 - Array[File]? frip_spp_qcs_pr2 + Array[File?] frip_spp_qcs + Array[File?] frip_spp_qcs_pr1 + Array[File?] frip_spp_qcs_pr2 File? frip_spp_qc_pooled File? frip_spp_qc_ppr1 File? frip_spp_qc_ppr2 - Array[File]? frip_idr_qcs - Array[File]? frip_idr_qcs_pr + Array[File?] frip_idr_qcs + Array[File?] frip_idr_qcs_pr File? frip_idr_qc_ppr - Array[File]? frip_overlap_qcs - Array[File]? frip_overlap_qcs_pr + Array[File?] frip_overlap_qcs + Array[File?] frip_overlap_qcs_pr File? frip_overlap_qc_ppr File? idr_reproducibility_qc File? 
overlap_reproducibility_qc @@ -1666,45 +1788,44 @@ task qc_report { ${"--desc '" + sub(description,"'","_") + "'"} \ ${"--genome " + genome} \ ${"--multimapping " + 0} \ - ${if paired_end then "--paired-end" else ""} \ - ${if ctl_paired_end then "--ctl-paired-end" else ""} \ + --paired-ends ${sep=" " paired_ends} \ --pipeline-type ${pipeline_type} \ --peak-caller ${peak_caller} \ ${"--macs2-cap-num-peak " + macs2_cap_num_peak} \ ${"--spp-cap-num-peak " + spp_cap_num_peak} \ --idr-thresh ${idr_thresh} \ - --flagstat-qcs ${sep=' ' flagstat_qcs} \ - --nodup-flagstat-qcs ${sep=' ' nodup_flagstat_qcs} \ - --dup-qcs ${sep=' ' dup_qcs} \ - --pbc-qcs ${sep=' ' pbc_qcs} \ - --ctl-flagstat-qcs ${sep=' ' ctl_flagstat_qcs} \ - --ctl-nodup-flagstat-qcs ${sep=' ' ctl_nodup_flagstat_qcs} \ - --ctl-dup-qcs ${sep=' ' ctl_dup_qcs} \ - --ctl-pbc-qcs ${sep=' ' ctl_pbc_qcs} \ - --xcor-plots ${sep=' ' xcor_plots} \ - --xcor-scores ${sep=' ' xcor_scores} \ + --flagstat-qcs ${sep="_:_" flagstat_qcs} \ + --nodup-flagstat-qcs ${sep="_:_" nodup_flagstat_qcs} \ + --dup-qcs ${sep="_:_" dup_qcs} \ + --pbc-qcs ${sep="_:_" pbc_qcs} \ + --xcor-plots ${sep="_:_" xcor_plots} \ + --xcor-scores ${sep="_:_" xcor_scores} \ + --idr-plots ${sep="_:_" idr_plots} \ + --idr-plots-pr ${sep="_:_" idr_plots_pr} \ + --ctl-flagstat-qcs ${sep='_:_' ctl_flagstat_qcs} \ + --ctl-nodup-flagstat-qcs ${sep='_:_' ctl_nodup_flagstat_qcs} \ + --ctl-dup-qcs ${sep='_:_' ctl_dup_qcs} \ + --ctl-pbc-qcs ${sep='_:_' ctl_pbc_qcs} \ ${"--jsd-plot " + jsd_plot} \ - --jsd-qcs ${sep=' ' jsd_qcs} \ - --idr-plots ${sep=' ' idr_plots} \ - --idr-plots-pr ${sep=' ' idr_plots_pr} \ + --jsd-qcs ${sep='_:_' jsd_qcs} \ + --frip-spp-qcs ${sep='_:_' frip_spp_qcs} \ + --frip-spp-qcs-pr1 ${sep='_:_' frip_spp_qcs_pr1} \ + --frip-spp-qcs-pr2 ${sep='_:_' frip_spp_qcs_pr2} \ + ${"--frip-spp-qc-pooled " + frip_spp_qc_pooled} \ + ${"--frip-spp-qc-ppr1 " + frip_spp_qc_ppr1} \ + ${"--frip-spp-qc-ppr2 " + frip_spp_qc_ppr2} \ ${"--idr-plot-ppr " + 
idr_plot_ppr} \ - --frip-macs2-qcs ${sep=' ' frip_macs2_qcs} \ - --frip-macs2-qcs-pr1 ${sep=' ' frip_macs2_qcs_pr1} \ - --frip-macs2-qcs-pr2 ${sep=' ' frip_macs2_qcs_pr2} \ + --frip-macs2-qcs ${sep="_:_" frip_macs2_qcs} \ + --frip-macs2-qcs-pr1 ${sep="_:_" frip_macs2_qcs_pr1} \ + --frip-macs2-qcs-pr2 ${sep="_:_" frip_macs2_qcs_pr2} \ ${"--frip-macs2-qc-pooled " + frip_macs2_qc_pooled} \ ${"--frip-macs2-qc-ppr1 " + frip_macs2_qc_ppr1} \ ${"--frip-macs2-qc-ppr2 " + frip_macs2_qc_ppr2} \ - --frip-spp-qcs ${sep=' ' frip_spp_qcs} \ - --frip-spp-qcs-pr1 ${sep=' ' frip_spp_qcs_pr1} \ - --frip-spp-qcs-pr2 ${sep=' ' frip_spp_qcs_pr2} \ - ${"--frip-spp-qc-pooled " + frip_spp_qc_pooled} \ - ${"--frip-spp-qc-ppr1 " + frip_spp_qc_ppr1} \ - ${"--frip-spp-qc-ppr2 " + frip_spp_qc_ppr2} \ - --frip-idr-qcs ${sep=' ' frip_idr_qcs} \ - --frip-idr-qcs-pr ${sep=' ' frip_idr_qcs_pr} \ + --frip-idr-qcs ${sep="_:_" frip_idr_qcs} \ + --frip-idr-qcs-pr ${sep="_:_" frip_idr_qcs_pr} \ ${"--frip-idr-qc-ppr " + frip_idr_qc_ppr} \ - --frip-overlap-qcs ${sep=' ' frip_overlap_qcs} \ - --frip-overlap-qcs-pr ${sep=' ' frip_overlap_qcs_pr} \ + --frip-overlap-qcs ${sep="_:_" frip_overlap_qcs} \ + --frip-overlap-qcs-pr ${sep="_:_" frip_overlap_qcs_pr} \ ${"--frip-overlap-qc-ppr " + frip_overlap_qc_ppr} \ ${"--idr-reproducibility-qc " + idr_reproducibility_qc} \ ${"--overlap-reproducibility-qc " + overlap_reproducibility_qc} \ @@ -1729,11 +1850,29 @@ task qc_report { task read_genome_tsv { File genome_tsv - command { - cat ${genome_tsv} > 'tmp.tsv' - } + + String? null_s + command <<< + # create empty files for all entries + touch ref_fa bowtie2_idx_tar chrsz gensz blacklist + + python <>> output { - Map[String,String] genome = read_map('tmp.tsv') + String? ref_fa = if size('ref_fa')==0 then null_s else read_string('ref_fa') + String? bwa_idx_tar = if size('bwa_idx_tar')==0 then null_s else read_string('bwa_idx_tar') + String? chrsz = if size('chrsz')==0 then null_s else read_string('chrsz') + String? 
gensz = if size('gensz')==0 then null_s else read_string('gensz') + String? blacklist = if size('blacklist')==0 then null_s else read_string('blacklist') } runtime { cpu : 1 @@ -1744,7 +1883,7 @@ task read_genome_tsv { } task rounded_mean { - Array[Int] ints + Array[Int?] ints command <<< python <1: From 04fc34dd86a02b71e55ba9c264481cd4faee14ca Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 04:45:53 -0700 Subject: [PATCH 05/14] fix for task-level tasks after refactoring chip.wdl --- test/test_task/test_bwa.wdl | 9 +++++---- test/test_task/test_merge_fastq.json | 25 +++++++++---------------- test/test_task/test_merge_fastq.wdl | 27 +++++++++++++++------------ 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/test/test_task/test_bwa.wdl b/test/test_task/test_bwa.wdl index 82f17bba..74bd10ba 100644 --- a/test/test_task/test_bwa.wdl +++ b/test/test_task/test_bwa.wdl @@ -20,8 +20,9 @@ workflow test_bwa { String bwa_disks = "local-disk 100 HDD" call chip.bwa as pe_bwa { input : - idx_tar = pe_bwa_idx_tar, - fastqs = pe_fastqs, + bwa_idx_tar = pe_bwa_idx_tar, + fastq_R1 = pe_fastqs[0], + fastq_R2 = pe_fastqs[1], paired_end = true, use_bwa_mem_for_pe = false, @@ -31,8 +32,8 @@ workflow test_bwa { disks = bwa_disks, } call chip.bwa as se_bwa { input : - idx_tar = se_bwa_idx_tar, - fastqs = se_fastqs, + bwa_idx_tar = se_bwa_idx_tar, + fastq_R1 = se_fastqs[0], paired_end = false, use_bwa_mem_for_pe = false, diff --git a/test/test_task/test_merge_fastq.json b/test/test_task/test_merge_fastq.json index a90e7689..0859620e 100644 --- a/test/test_task/test_merge_fastq.json +++ b/test/test_task/test_merge_fastq.json @@ -1,23 +1,16 @@ { - "test_merge_fastq.pe_fastqs" : [ - [ - "chip-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/rep1-R1.subsampled.67.fastq.gz", - "chip-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/rep1-R2.subsampled.67.fastq.gz" - ], - [ - "chip-seq-pipeline-test-data/input/pe/fastqs/rep2/pair1/rep2-R1.subsampled.67.fastq.gz", - 
"chip-seq-pipeline-test-data/input/pe/fastqs/rep2/pair2/rep2-R2.subsampled.67.fastq.gz" - ] + "test_merge_fastq.pe_fastqs_R1" : [ + "chip-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/rep1-R1.subsampled.67.fastq.gz", + "chip-seq-pipeline-test-data/input/pe/fastqs/rep2/pair1/rep2-R1.subsampled.67.fastq.gz" + ], + "test_merge_fastq.pe_fastqs_R2" : [ + "chip-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/rep1-R2.subsampled.67.fastq.gz", + "chip-seq-pipeline-test-data/input/pe/fastqs/rep2/pair2/rep2-R2.subsampled.67.fastq.gz" ], "test_merge_fastq.se_fastqs" : [ - [ - "chip-seq-pipeline-test-data/input/se/fastqs/rep1/rep1.subsampled.25.fastq.gz" - ], - [ - "chip-seq-pipeline-test-data/input/se/fastqs/rep2/rep2.subsampled.20.fastq.gz" - ] + "chip-seq-pipeline-test-data/input/se/fastqs/rep1/rep1.subsampled.25.fastq.gz", + "chip-seq-pipeline-test-data/input/se/fastqs/rep2/rep2.subsampled.20.fastq.gz" ], - "test_merge_fastq.ref_pe_merged_fastq_R1" : "chip-seq-pipeline-test-data/ref_output/test_merge_fastq/pe_R1_merged.fastq.gz", "test_merge_fastq.ref_pe_merged_fastq_R2" : "chip-seq-pipeline-test-data/ref_output/test_merge_fastq/pe_R2_merged.fastq.gz", "test_merge_fastq.ref_se_merged_fastq" : "chip-seq-pipeline-test-data/ref_output/test_merge_fastq/se_merged.fastq.gz" diff --git a/test/test_task/test_merge_fastq.wdl b/test/test_task/test_merge_fastq.wdl index d50cbea5..2a7ca206 100644 --- a/test/test_task/test_merge_fastq.wdl +++ b/test/test_task/test_merge_fastq.wdl @@ -4,19 +4,22 @@ import "../../chip.wdl" as chip workflow test_merge_fastq { # test merging rep1 and rep2 - Array[Array[String]] pe_fastqs - Array[Array[String]] se_fastqs + Array[File] pe_fastqs_R1 + Array[File] pe_fastqs_R2 + Array[File] se_fastqs - String ref_pe_merged_fastq_R1 - String ref_pe_merged_fastq_R2 - String ref_se_merged_fastq + File ref_pe_merged_fastq_R1 + File ref_pe_merged_fastq_R2 + File ref_se_merged_fastq call chip.merge_fastq as pe_merge_fastq { input : - fastqs = pe_fastqs, + 
fastqs_R1 = pe_fastqs_R1, + fastqs_R2 = pe_fastqs_R2, paired_end = true, } call chip.merge_fastq as se_merge_fastq { input : - fastqs = se_fastqs, + fastqs_R1 = se_fastqs, + fastqs_R2 = [], paired_end = false, } @@ -26,11 +29,11 @@ workflow test_merge_fastq { 'pe_merge_fastq_R2', 'se_merge_fastq', ], - files = [ - pe_merge_fastq.merged_fastqs[0], - pe_merge_fastq.merged_fastqs[1], - se_merge_fastq.merged_fastqs[0], - ], + files = select_all([ + pe_merge_fastq.merged_fastq_R1, + pe_merge_fastq.merged_fastq_R2, + se_merge_fastq.merged_fastq_R1, + ]), ref_files = [ ref_pe_merged_fastq_R1, ref_pe_merged_fastq_R2, From cd411314b052e4a4d18559f2128c4409993da98f Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 07:44:08 -0700 Subject: [PATCH 06/14] update refout after refactoring (macs2 peak removed from qc.json due to separation of macs2/macs2_signal_track) --- .../ENCSR000DYI_subsampled_chr19_only.json | 2 +- .../ENCSR936XTK_subsampled_chr19_only.json | 2 +- .../ENCSR000DYI_subsampled_chr19_only/qc.json | 428 +++++++++++++++++ .../ENCSR936XTK_subsampled_chr19_only/qc.json | 435 ++++++++++++++++++ 4 files changed, 865 insertions(+), 2 deletions(-) create mode 100644 test/test_workflow/ref_output/v1.2.2/ENCSR000DYI_subsampled_chr19_only/qc.json create mode 100644 test/test_workflow/ref_output/v1.2.2/ENCSR936XTK_subsampled_chr19_only/qc.json diff --git a/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json b/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json index 47126777..1f5decfe 100644 --- a/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json +++ b/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI_subsampled_chr19_only/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.2.2/ENCSR000DYI_subsampled_chr19_only/qc.json", 
"chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_chr19_chrM_google.tsv", "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" diff --git a/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json b/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json index bb30a8fe..2ddf10e3 100644 --- a/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json +++ b/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK_subsampled_chr19_only/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.2.2/ENCSR936XTK_subsampled_chr19_only/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_chr19_chrM_google.tsv", "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" diff --git a/test/test_workflow/ref_output/v1.2.2/ENCSR000DYI_subsampled_chr19_only/qc.json b/test/test_workflow/ref_output/v1.2.2/ENCSR000DYI_subsampled_chr19_only/qc.json new file mode 100644 index 00000000..95c9b0ee --- /dev/null +++ b/test/test_workflow/ref_output/v1.2.2/ENCSR000DYI_subsampled_chr19_only/qc.json @@ -0,0 +1,428 @@ +{ + "general": { + "date": "2019-06-08 12:35:01", + "pipeline_ver": "v1.2.1", + "peak_caller": "spp", + "genome": "hg38_chr19_chrM_google.tsv", + "description": "CEBPB ChIP-seq on human A549 produced by the Snyder lab", + "title": "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", + "paired_end": [] + }, + "flagstat_qc": { + "rep1": { + "total": 1221783, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 212025, + "mapped_qc_failed": 0, + "mapped_pct": 17.35, + "paired": 0, + 
"paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 1185718, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 216337, + "mapped_qc_failed": 0, + "mapped_pct": 18.25, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_flagstat_qc": { + "rep1": { + "total": 964739, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 171493, + "mapped_qc_failed": 0, + "mapped_pct": 17.78, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 1149924, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 204057, + "mapped_qc_failed": 0, + "mapped_pct": 17.75, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + 
"diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "dup_qc": { + "rep1": { + "unpaired_reads": 37840, + "paired_reads": 0, + "unmapped_reads": 0, + "unpaired_dupes": 1296, + "paired_dupes": 0, + "paired_opt_dupes": 0, + "dupes_pct": 0.034249 + }, + "rep2": { + "unpaired_reads": 39670, + "paired_reads": 0, + "unmapped_reads": 0, + "unpaired_dupes": 2279, + "paired_dupes": 0, + "paired_opt_dupes": 0, + "dupes_pct": 0.057449 + } + }, + "ctl_dup_qc": { + "rep1": { + "unpaired_reads": 30589, + "paired_reads": 0, + "unmapped_reads": 0, + "unpaired_dupes": 1475, + "paired_dupes": 0, + "paired_opt_dupes": 0, + "dupes_pct": 0.04822 + }, + "rep2": { + "unpaired_reads": 36613, + "paired_reads": 0, + "unmapped_reads": 0, + "unpaired_dupes": 2136, + "paired_dupes": 0, + "paired_opt_dupes": 0, + "dupes_pct": 0.05834 + } + }, + "pbc_qc": { + "rep1": { + "total_read_pairs": 31607, + "distinct_read_pairs": 31232, + "one_read_pair": 31086, + "two_read_pair": 133, + "NRF": 0.988136, + "PBC1": 0.995325, + "PBC2": 233.729323 + }, + "rep2": { + "total_read_pairs": 31207, + "distinct_read_pairs": 30837, + "one_read_pair": 30731, + "two_read_pair": 93, + "NRF": 0.988144, + "PBC1": 0.996563, + "PBC2": 330.44086 + } + }, + "ctl_pbc_qc": { + "rep1": { + "total_read_pairs": 23641, + "distinct_read_pairs": 23298, + "one_read_pair": 23135, + "two_read_pair": 154, + "NRF": 0.985491, + "PBC1": 0.993004, + "PBC2": 150.227273 + }, + "rep2": { + "total_read_pairs": 26532, + "distinct_read_pairs": 26105, + "one_read_pair": 26012, + "two_read_pair": 82, + "NRF": 0.983906, + "PBC1": 0.996437, + "PBC2": 317.219512 + } + }, + "nodup_flagstat_qc": { + "rep1": { + "total": 36544, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 36544, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + 
"paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 37391, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 37391, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_nodup_flagstat_qc": { + "rep1": { + "total": 29114, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 29114, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 34477, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 34477, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 0, + "paired_qc_failed": 0, + "read1": 0, + "read1_qc_failed": 0, + "read2": 0, + "read2_qc_failed": 0, + "paired_properly": 0, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 0.0, + "with_itself": 0, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "overlap_reproducibility_qc": { + "Nt": 143, + "N1": 38, + "N2": 52, + "Np": 150, + "N_opt": 150, + 
"N_consv": 143, + "opt_set": "ppr", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.04895104895, + "self_consistency_ratio": 1.36842105263, + "reproducibility": "pass" + }, + "idr_reproducibility_qc": { + "Nt": 122, + "N1": 28, + "N2": 54, + "Np": 158, + "N_opt": 158, + "N_consv": 122, + "opt_set": "ppr", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.29508196721, + "self_consistency_ratio": 1.92857142857, + "reproducibility": "pass" + }, + "xcor_score": { + "rep1": { + "num_reads": 205768, + "est_frag_len": 90, + "corr_est_frag_len": 0.15249697026649, + "phantom_peak": 35, + "corr_phantom_peak": 0.2425034, + "argmin_corr": 1500, + "min_corr": 0.1258825, + "NSC": 1.211423, + "RSC": 0.2282137 + }, + "rep2": { + "num_reads": 207841, + "est_frag_len": 90, + "corr_est_frag_len": 0.153057806873569, + "phantom_peak": 35, + "corr_phantom_peak": 0.2379796, + "argmin_corr": 1500, + "min_corr": 0.1224517, + "NSC": 1.249944, + "RSC": 0.264924 + } + }, + "frip_spp_qc": { + "rep1": { + "FRiP": 0.0598898283372 + }, + "rep2": { + "FRiP": 0.0907793597769 + }, + "rep1-pr1": { + "FRiP": 0.0629003330771 + }, + "rep2-pr1": { + "FRiP": 0.0550690795875 + }, + "rep1-pr2": { + "FRiP": 0.0662951575711 + }, + "rep2-pr2": { + "FRiP": 0.0566943435392 + }, + "pooled": { + "FRiP": 0.106337721772 + }, + "ppr1": { + "FRiP": 0.0733185084921 + }, + "ppr2": { + "FRiP": 0.0755124403764 + } + }, + "overlap_frip_qc": { + "rep1-rep2": { + "FRiP": 0.0351612227468 + }, + "rep1-pr": { + "FRiP": 0.0115936459134 + }, + "rep2-pr": { + "FRiP": 0.0227678137061 + }, + "ppr": { + "FRiP": 0.0364020175 + } + }, + "idr_frip_qc": { + "rep1-rep2": { + "FRiP": 0.0324701484119 + }, + "rep1-pr": { + "FRiP": 0.0107929797592 + }, + "rep2-pr": { + "FRiP": 0.0229948431875 + }, + "ppr": { + "FRiP": 0.0374977842951 + } + }, + "jsd_qc": { + "rep1": { + "pct_gen_enrich": 0.0695441906003, + "auc": 0.45680677407, + "ch_div": 0.781401966321, + "elbow_pt": 0.000242846499835, + "jsd": 0.792171433475, + "syn_auc": 0.485139181255, + 
"syn_elbow_pt": 0.115763759662, + "syn_jsd": 0.294012596339 + }, + "rep2": { + "pct_gen_enrich": 0.0624918395731, + "auc": 0.457389321455, + "ch_div": 0.790096160945, + "elbow_pt": 0.000185561909039, + "jsd": 0.80149818713, + "syn_auc": 0.569239520313, + "syn_elbow_pt": 0.116341276517, + "syn_jsd": 0.330229240524 + } + } +} diff --git a/test/test_workflow/ref_output/v1.2.2/ENCSR936XTK_subsampled_chr19_only/qc.json b/test/test_workflow/ref_output/v1.2.2/ENCSR936XTK_subsampled_chr19_only/qc.json new file mode 100644 index 00000000..ff21595a --- /dev/null +++ b/test/test_workflow/ref_output/v1.2.2/ENCSR936XTK_subsampled_chr19_only/qc.json @@ -0,0 +1,435 @@ +{ + "general": { + "date": "2019-06-08 12:34:04", + "pipeline_ver": "v1.2.1", + "peak_caller": "spp", + "genome": "hg38_chr19_chrM_google.tsv", + "description": "ZNF143 ChIP-seq on human GM12878", + "title": "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", + "paired_end": [ + true, + true + ], + "ctl_paired_end": [ + true, + true + ] + }, + "flagstat_qc": { + "rep1": { + "total": 1037412, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 85367, + "mapped_qc_failed": 0, + "mapped_pct": 8.23, + "paired": 1037412, + "paired_qc_failed": 0, + "read1": 518706, + "read1_qc_failed": 0, + "read2": 518706, + "read2_qc_failed": 0, + "paired_properly": 69328, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 6.68, + "with_itself": 73921, + "with_itself_qc_failed": 0, + "singletons": 11446, + "singletons_qc_failed": 0, + "singletons_pct": 1.1, + "diff_chroms": 3, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 1299678, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 97977, + "mapped_qc_failed": 0, + "mapped_pct": 7.54, + "paired": 1299678, + "paired_qc_failed": 0, + "read1": 649839, + "read1_qc_failed": 0, + "read2": 649839, + "read2_qc_failed": 0, + "paired_properly": 76347, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 
5.87, + "with_itself": 83141, + "with_itself_qc_failed": 0, + "singletons": 14836, + "singletons_qc_failed": 0, + "singletons_pct": 1.14, + "diff_chroms": 1, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_flagstat_qc": { + "rep1": { + "total": 1135028, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 81021, + "mapped_qc_failed": 0, + "mapped_pct": 7.14, + "paired": 1135028, + "paired_qc_failed": 0, + "read1": 567514, + "read1_qc_failed": 0, + "read2": 567514, + "read2_qc_failed": 0, + "paired_properly": 59460, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 5.24, + "with_itself": 64102, + "with_itself_qc_failed": 0, + "singletons": 16919, + "singletons_qc_failed": 0, + "singletons_pct": 1.49, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 1085984, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 81878, + "mapped_qc_failed": 0, + "mapped_pct": 7.54, + "paired": 1085984, + "paired_qc_failed": 0, + "read1": 542992, + "read1_qc_failed": 0, + "read2": 542992, + "read2_qc_failed": 0, + "paired_properly": 60740, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 5.59, + "with_itself": 65497, + "with_itself_qc_failed": 0, + "singletons": 16381, + "singletons_qc_failed": 0, + "singletons_pct": 1.51, + "diff_chroms": 7, + "diff_chroms_qc_failed": 0 + } + }, + "dup_qc": { + "rep1": { + "unpaired_reads": 0, + "paired_reads": 21408, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 57, + "paired_opt_dupes": 0, + "dupes_pct": 0.002663 + }, + "rep2": { + "unpaired_reads": 0, + "paired_reads": 20395, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 39, + "paired_opt_dupes": 0, + "dupes_pct": 0.001912 + } + }, + "ctl_dup_qc": { + "rep1": { + "unpaired_reads": 0, + "paired_reads": 15345, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 31, + "paired_opt_dupes": 0, + "dupes_pct": 0.00202 + }, + "rep2": { + "unpaired_reads": 
0, + "paired_reads": 16451, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 44, + "paired_opt_dupes": 0, + "dupes_pct": 0.002675 + } + }, + "pbc_qc": { + "rep1": { + "total_read_pairs": 18123, + "distinct_read_pairs": 18072, + "one_read_pair": 18021, + "two_read_pair": 51, + "NRF": 0.997186, + "PBC1": 0.997178, + "PBC2": 353.352941 + }, + "rep2": { + "total_read_pairs": 18300, + "distinct_read_pairs": 18265, + "one_read_pair": 18230, + "two_read_pair": 35, + "NRF": 0.998087, + "PBC1": 0.998084, + "PBC2": 520.857143 + } + }, + "ctl_pbc_qc": { + "rep1": { + "total_read_pairs": 8674, + "distinct_read_pairs": 8662, + "one_read_pair": 8650, + "two_read_pair": 12, + "NRF": 0.998617, + "PBC1": 0.998615, + "PBC2": 720.833333 + }, + "rep2": { + "total_read_pairs": 9003, + "distinct_read_pairs": 8989, + "one_read_pair": 8976, + "two_read_pair": 12, + "NRF": 0.998445, + "PBC1": 0.998554, + "PBC2": 748.0 + } + }, + "nodup_flagstat_qc": { + "rep1": { + "total": 42702, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 42702, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 42702, + "paired_qc_failed": 0, + "read1": 21351, + "read1_qc_failed": 0, + "read2": 21351, + "read2_qc_failed": 0, + "paired_properly": 42702, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 42702, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 40712, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 40712, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 40712, + "paired_qc_failed": 0, + "read1": 20356, + "read1_qc_failed": 0, + "read2": 20356, + "read2_qc_failed": 0, + "paired_properly": 40712, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 40712, + "with_itself_qc_failed": 0, + "singletons": 0, + 
"singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_nodup_flagstat_qc": { + "rep1": { + "total": 30628, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 30628, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 30628, + "paired_qc_failed": 0, + "read1": 15314, + "read1_qc_failed": 0, + "read2": 15314, + "read2_qc_failed": 0, + "paired_properly": 30628, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 30628, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 32814, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 32814, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 32814, + "paired_qc_failed": 0, + "read1": 16407, + "read1_qc_failed": 0, + "read2": 16407, + "read2_qc_failed": 0, + "paired_properly": 32814, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 32814, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "overlap_reproducibility_qc": { + "Nt": 385, + "N1": 282, + "N2": 231, + "Np": 379, + "N_opt": 385, + "N_consv": 385, + "opt_set": "rep1-rep2", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.01583113456, + "self_consistency_ratio": 1.22077922078, + "reproducibility": "pass" + }, + "idr_reproducibility_qc": { + "Nt": 251, + "N1": 133, + "N2": 86, + "Np": 246, + "N_opt": 251, + "N_consv": 251, + "opt_set": "rep1-rep2", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.02032520325, + "self_consistency_ratio": 1.54651162791, + "reproducibility": "pass" + }, + "xcor_score": { + "rep1": { + "num_reads": 64872, + "est_frag_len": 225, + "corr_est_frag_len": 0.0913620885188839, + "phantom_peak": 50, + 
"corr_phantom_peak": 0.1247327, + "argmin_corr": 1500, + "min_corr": 0.04619374, + "NSC": 1.977802, + "RSC": 0.5751073 + }, + "rep2": { + "num_reads": 80996, + "est_frag_len": 225, + "corr_est_frag_len": 0.0875861093995843, + "phantom_peak": 50, + "corr_phantom_peak": 0.1560125, + "argmin_corr": 1500, + "min_corr": 0.06123838, + "NSC": 1.430249, + "RSC": 0.2780056 + } + }, + "frip_spp_qc": { + "rep1": { + "FRiP": 0.448788180611 + }, + "rep2": { + "FRiP": 0.347057213249 + }, + "rep1-pr1": { + "FRiP": 0.418880035414 + }, + "rep2-pr1": { + "FRiP": 0.298368553597 + }, + "rep1-pr2": { + "FRiP": 0.405931828243 + }, + "rep2-pr2": { + "FRiP": 0.298948751643 + }, + "pooled": { + "FRiP": 0.400872389025 + }, + "ppr1": { + "FRiP": 0.393857669657 + }, + "ppr2": { + "FRiP": 0.393906869221 + } + }, + "overlap_frip_qc": { + "rep1-rep2": { + "FRiP": 0.304730715249 + }, + "rep1-pr": { + "FRiP": 0.345313191678 + }, + "rep2-pr": { + "FRiP": 0.224089789214 + }, + "ppr": { + "FRiP": 0.303258386768 + } + }, + "idr_frip_qc": { + "rep1-rep2": { + "FRiP": 0.27875168561 + }, + "rep1-pr": { + "FRiP": 0.263446215139 + }, + "rep2-pr": { + "FRiP": 0.150862304955 + }, + "ppr": { + "FRiP": 0.275834548807 + } + }, + "jsd_qc": { + "rep1": { + "pct_gen_enrich": 0.0231332109413, + "auc": 0.476593096418, + "ch_div": 0.879908847099, + "elbow_pt": 5.60264293001e-15, + "jsd": 0.890440918234, + "syn_auc": 0.515124919669, + "syn_elbow_pt": 0.343103570969, + "syn_jsd": 0.489524234445 + }, + "rep2": { + "pct_gen_enrich": 0.0347787001915, + "auc": 0.47600880084, + "ch_div": 0.855610053957, + "elbow_pt": 3.06625328431e-14, + "jsd": 0.867736502666, + "syn_auc": 0.498414537885, + "syn_elbow_pt": 0.333963215383, + "syn_jsd": 0.405191834328 + } + } +} From 50c806066f9a930bfc1fe95e54b4149f87055d0b Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 21:43:12 -0700 Subject: [PATCH 07/14] update doc (better input json definition) --- README.md | 49 +- docs/build_genome_database.md | 10 +- docs/input.md | 427 
++++++++---------- docs/install_conda.md | 37 ++ ...mplate_pe.full.json => template.full.json} | 43 +- examples/{template_pe.json => template.json} | 27 +- examples/template_se.full.json | 109 ----- examples/template_se.json | 50 -- 8 files changed, 290 insertions(+), 462 deletions(-) create mode 100644 docs/install_conda.md rename examples/{template_pe.full.json => template.full.json} (96%) rename examples/{template_pe.json => template.json} (85%) delete mode 100644 examples/template_se.full.json delete mode 100644 examples/template_se.json diff --git a/README.md b/README.md index 6c883b22..a55571c8 100644 --- a/README.md +++ b/README.md @@ -37,47 +37,24 @@ You can also run our pipeline on DNAnexus without using Caper or Cromwell. There ## Conda -We no longer recommend Conda for resolving dependencies and plan to phase out Conda support. Instead we recommend using Docker or Singularity. You can install Singularity and use it for our pipeline with Caper (by adding `--use-singularity` to command line arguments). - -1) Install [Conda](https://docs.conda.io/en/latest/miniconda.html). - -2) Install Conda environment for pipeline. - - ```bash - $ conda/install_dependencies.sh - ``` - -3) Initialize Conda and re-login. - - ```bash - $ conda init bash - $ exit - ``` - -4) Configure pipeline's python2 and python3 environments. - - ```bash - $ conda/config_conda_env.sh - $ conda/config_conda_env_py3.sh - ``` - -5) Update pipeline's Conda environment with pipeline's python source code. You need to run this step everytime you update (`git pull`) this pipeline. - - ```bash - $ conda/update_conda_env.sh - ``` +We no longer recommend Conda for resolving dependencies and plan to phase out Conda support. Instead we recommend using Docker or Singularity. You can install Singularity and use it for our pipeline with Caper (by adding `--use-singularity` to command line arguments). Please see [this instruction](docs/install_conda.md). 
## Tutorial Make sure that you have configured Caper correctly. -> **WARNING**: DO NOT RUN THIS ON HPC LOGIN NODES. YOUR JOBS WILL BE KILLED. +> **WARNING**: Do not run Caper on HPC login nodes. Your jobs can be killed. -Run it. Due to `--deepcopy` all files in `examples/caper/ENCSR936XTK_subsampled_chr19_only.json` will be recursively copied into Caper's temporary folder (`--tmp-dir`). +Run it. Due to `--deepcopy` all files (HTTP URLs) in `examples/caper/ENCSR936XTK_subsampled_chr19_only.json` will be recursively copied into Caper's temporary folder (`--tmp-dir`). ```bash $ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --deepcopy --use-singularity ``` -If you use Conda or Docker (on cloud platforms) then remove `--use-singularity` from the command line and activate it before running a pipeline. +If you use Docker then replace `--use-singularity` with `--use-docker`. +```bash +$ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --deepcopy --use-docker +``` + +If you use Conda then remove `--use-singularity` from the command line and activate pipeline's Conda env before running a pipeline. ```bash $ conda activate encode-chip-seq-pipeline $ caper run chip.wdl -i examples/caper/ENCSR936XTK_subsampled_chr19_only.json --deepcopy @@ -87,7 +64,7 @@ To run it on an HPC (e.g. Stanford Sherlock and SCG). See details at [Caper's RE ## Input JSON file -Always use absolute paths in an input JSON. +An input JSON file includes all genomic data files, input parameters and metadata for running pipelines. Always use absolute paths in an input JSON. [Input JSON file specification](docs/input.md) @@ -105,12 +82,6 @@ Find a `metadata.json` on Caper's output directory. $ croo [METADATA_JSON_FILE] ``` -## How to build/download genome database - -You need to specify a genome data TSV file in your input JSON. Such TSV can be generated/downloaded with actual genome database files. 
- -Use genome database [downloader](genome/download_genome_data.sh) or [builder](docs/build_genome_database.md) for your own genome. - ## Useful tools There are some useful tools to post-process outputs of the pipeline. diff --git a/docs/build_genome_database.md b/docs/build_genome_database.md index 83187886..45b8fe8b 100644 --- a/docs/build_genome_database.md +++ b/docs/build_genome_database.md @@ -1,3 +1,11 @@ +## How to download genome database + +1. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. + ```bash + $ bash genome/download_genome_data.sh [GENOME] [DESTINATION_DIR] + ``` +2. Find a TSV file on the destination directory and use it for `"chip.genome_tsv"` in your input JSON. + # How to build genome database 1. [Install Conda](https://conda.io/miniconda.html). Skip this if you already have equivalent Conda alternatives (Anaconda Python). Download and run the [installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Agree to the license term by typing `yes`. It will ask you about the installation location. On Stanford clusters (Sherlock and SCG4), we recommend to install it outside of your `$HOME` directory since its filesystem is slow and has very limited space. At the end of the installation, choose `yes` to add Miniconda's binary to `$PATH` in your BASH startup script. @@ -12,7 +20,7 @@ $ bash conda/install_dependencies.sh ``` -3. Choose `[GENOME]` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. This will take several hours. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. +3. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. This will take several hours. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. 
```bash $ bash conda/build_genome_data.sh [GENOME] [DESTINATION_DIR] ``` diff --git a/docs/input.md b/docs/input.md index d6123328..2194d424 100644 --- a/docs/input.md +++ b/docs/input.md @@ -1,260 +1,221 @@ # Input JSON -An input JSON file includes all input parameters and metadata for running pipelines. Items 1) and 2) are mandatory. Items 3) and 4) are optional so that our pipeline will use default values if they are not defined. However, +An input JSON file includes all genomic data files, parameters and metadata for running pipelines. Our pipeline will use default values if they are not defined in an input JSON file. We provide a set of template JSON files: [minimum](../examples/template.json) and [full](../examples/template.full.json). We recommend to use a minimum template instead of full one. A full template includes all parameters of the pipeline with default values defined. -* Mandatory +Please read through the following step-by-step instruction to compose a input JSON file. -1. Reference genome. -2. Input data file paths/URIs. +## Pipeline metadata -* Optional +Parameter|Description +---------|----------- +`chip.title`| Title for experiment which will be shown in a final HTML report +`chip.description`| Description for experiment which will be shown in a final HTML report -3. Pipeline parameters. -4. Resource settings for jobs. +## Pipeline parameters -## Reference genome +Parameter|Default|Description +---------|-------|----------- +`chip.pipeline_type`| `tf` | `tf` for TF ChIP-seq or `histone` for Histone ChIP-seq. +`chip.align_only`| false | Peak calling and its downstream analyses will be disabled. Useful if you just want to map your FASTQs into filtered BAMs/TAG-ALIGNs and don't want to call peaks on them. +`chip.true_rep_only` | false | Disable pseudo replicate generation and all related analyses -We currently support 4 genomes. You can also [build a genome database for your own genome](build_genome_database.md). 
+## Reference genome -|genome|source|built from| +All reference genome specific reference files/parameters can be defined in a single TSV file `chip.genome_tsv`. However, you can also individually define each file/parameter instead of a TSV file. If both a TSV file and individual parameters are defined, then individual parameters will override those defined in a TSV file. For example, if you define both `chip.genome_tsv` and `chip.blacklist`, then `chip.blacklist` will override the one defined in `chip.genome_tsv`. This is useful when you want to use your own for a specific parameter while keeping all the other parameters same as original. + +Parameter|Type|Description +---------|-------|----------- +`chip.genome_tsv`| File | Choose one of the TSV files listed below or build your own +`chip.ref_fa`| File | Reference FASTA file +`chip.bwa_idx_tar`| File | BWA index TAR file (uncompressed) built from FASTA file with `bwa index` +`chip.chrsz`| File | 2-col chromosome sizes file built from FASTA file with `faidx` +`chip.blacklist`| File | 3-col BED file. Peaks overlapping these regions will be filtered out +`chip.gensz`| String | MACS2's genome sizes (hs for human, mm for mouse or sum of 2nd col in chrsz) + +We currently provide TSV files for 4 genomes as shown in the below table. `GENOME` should be `hg38`, `mm10`, `hg19` or `mm9`. You can [download/build](build_genome_database.md) it on your local computer. You can also [build a genome database for your own genome](build_genome_database.md). + +Platform|Path/URI +-|- +Google Cloud Platform|`gs://encode-pipeline-genome-data/[GENOME]_google.tsv` +Stanford Sherlock|`/home/groups/cherry/encode/pipeline_genome_data/[GENOME]_sherlock.tsv` +Stanford SCG|`/reference/ENCODE/pipeline_genome_data/[GENOME]_scg.tsv` +Local/SLURM/SGE|You need to [build](build_genome_database.md) or [download](build_genome_database.md) a genome database. 
+DNAnexus (CLI)|`dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/[GENOME]_dx.tsv` +DNAnexus (CLI, Azure)|`dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/[GENOME]_dx_azure.tsv` +DNAnexus (Web)|Choose `[GENOME]_dx.tsv` from [here](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/pipeline-genome-data) +DNAnexus (Web, Azure)|Choose `[GENOME]_dx.tsv` from [here](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/pipeline-genome-data) + +Additional information about each genome: + +|Genome|Source|built from| |-|-|-| |hg38|ENCODE|[GRCh38_no_alt_analysis_set_GCA_000001405](https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz)| |mm10|ENCODE|[mm10_no_alt_analysis_set_ENCODE](https://www.encodeproject.org/files/mm10_no_alt_analysis_set_ENCODE/@@download/mm10_no_alt_analysis_set_ENCODE.fasta.gz)| |hg19|UCSC|[GRCh37/hg19](http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/referenceSequences/male.hg19.fa.gz)| |mm9|UCSC|[mm9, NCBI Build 37]()| -Choose one TSV file for `"chip.genome_tsv"` in your input JSON. `[GENOME]` should be `hg38`, `mm10`, `hg19` or `mm9`. +## Input genomic data + +Choose endedness of your dataset first. 
-|platform|path/URI| -|-|-| -|Google Cloud Platform|`gs://encode-pipeline-genome-data/[GENOME]_google.tsv`| -|DNAnexus (CLI)|`dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/[GENOME]_dx.tsv`| -|DNAnexus (CLI, Azure)|`dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/[GENOME]_dx_azure.tsv`| -|DNAnexus (Web)|Choose `[GENOME]_dx.tsv` from [here](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/pipeline-genome-data)| -|DNAnexus (Web, Azure)|Choose `[GENOME]_dx.tsv` from [here](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/pipeline-genome-data)| -|Stanford Sherlock|`/home/groups/cherry/encode/pipeline_genome_data/[GENOME]_sherlock.tsv`| -|Stanford SCG|`/reference/ENCODE/pipeline_genome_data/[GENOME]_scg.tsv`| -|Local/SLURM/SGE|You need to [build a genome database](build_genome_database.md). | +Parameter|Description +---------|----------- +`chip.paired_end`| Boolean to define endedness for ALL IP replicates. This will override per-replicate definition in `chip.paired_ends` +`chip.ctl_paired_end`| Boolean to define endedness for ALL control replicates. This will override per-replicate definition in `chip.ctl_paired_ends` +`chip.paired_ends`| Array of Boolean to define endedness for each replicate +`chip.ctl_paired_ends`| Array of Boolean to define endedness for each control replicate -## Templates +Define `chip.paired_end` and `chip.ctl_paired_end` if all replicates (or control replicates) in your dataset has the same endedness. You can also individually define endedness for each replicate and control replicate. For example, rep1, rep2 are PE and rep3 is SE. control rep1 is SE and control rep2 is PE. + +```javascript +{ + "chip.paired_ends" : [true, true, false], + "chip.ctl_paired_ends" : [false, true] +} +``` -We provide two sets of template JSON files (minimum and full), and each set for both single ended and paired-end samples. 
We recommend to use one of minimum templates instead of that used in the tutorial section. A full template includes all parameters of the pipeline with default values defined. +Pipeline can start from any of the following data type (FASTQ, BAM, NODUP_BAM and TAG-ALIGN). -1) Minimum template: +Parameter|Description +---------|----------- +`chip.fastqs_repX_R1`| Array of R1 FASTQ files for replicate X. These files will be merged into one FASTQ file for rep X. +`chip.fastqs_repX_R2`| Array of R2 FASTQ files for replicate X. These files will be merged into one FASTQ file for rep X. Do not define for single ended dataset. +`chip.bams`| Array of BAM file for each replicate. (e.g. `["rep1.bam", "rep2.bam", ...]`) +`chip.nodup_bams`| Array of filtered/deduped BAM file for each replicate. +`chip.tas`| Array of TAG-ALIGN file for each replicate. - * [mini template](../examples/template_se.json) for single ended sample - * [mini template](../examples/template_pe.json) for paired-end sample +For controls: -2) Full template: +Parameter|Description +---------|----------- +`chip.ctl_fastqs_repX_R1`| Array of R1 FASTQ files for control replicate X. These files will be merged into one FASTQ file for rep X. +`chip.ctl_fastqs_repX_R2`| Array of R2 FASTQ files for control replicate X. These files will be merged into one FASTQ file for rep X. Do not define for single ended dataset. +`chip.ctl_bams`| Array of BAM file for each control replicate. (e.g. `["ctl_rep1.bam", "ctl_rep2.bam", ...]`) +`chip.ctl_nodup_bams`| Array of filtered/deduped BAM file for each control replicate. +`chip.ctl_tas`| Array of TAG-ALIGN file for each control replicate. - * [full template](../examples/template_se.full.json) for single ended sample - * [full template](../examples/template_pe.full.json) for paired-end sample +You can mix up different data types for individual replicate/control replicate. 
For example, pipeline can start from FASTQs for rep1 and rep3, BAMs for rep2, NODUP_BAMs for rep4 and TAG-ALIGNs for rep5. You can define similarly for control replicates. -Let us take a close look at a full template JSON. Comments are not allowed in a JSON file but we added some comments to help you understand each parameter. ```javascript { - ////////// 1) Reference genome ////////// - // Stanford servers: [GENOME]=hg38,hg19,mm10,mm9 - // Sherlock: /home/groups/cherry/encode/pipeline_genome_data/[GENOME]_sherlock.tsv - // SCG4: /reference/ENCODE/pipeline_genome_data/[GENOME]_scg.tsv - - // Cloud platforms (Google Cloud, DNAnexus): [GENOME]=hg38,hg19,mm10,mm9 - // Google Cloud: gs://encode-pipeline-genome-data/[GENOME]_google.tsv - // DNAnexus: dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/[GENOME]_dx.tsv - // DNAnexus(Azure): dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/[GENOME]_dx_azure.tsv - - // On other computers download or build reference genome database and pick a TSV from [DEST_DIR]. - // Downloader: ./genome/download_genome_data.sh [GENOME] [DEST_DIR] - // Builder (Conda required): ./conda/build_genome_data.sh [GENOME] [DEST_DIR] - - "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", - - ////////// 2) Input data files paths/URIs ////////// - - // Read endedness - "chip.paired_end" : true, - - // Read endedness for controls - // If not defined, "chip.paired_end" will be used for controls too - "chip.ctl_paired_end" : true, - - // If you start from FASTQs then define these, otherwise remove from this file. - // You can define up to 6 replicates. - // FASTQs in an array will be merged. - // For example, - // "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz" and "rep1_R1_L3.fastq.gz" will be merged together. 
- "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], - "chip.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], - "chip.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], - "chip.fastqs_rep2_R2" : [ "rep2_R2_L1.fastq.gz", "rep2_R2_L2.fastq.gz" ], - - // Define if you have control FASTQs otherwise remove from this file. - "chip.ctl_fastqs_rep1_R1" : [ "ctl1_R1.fastq.gz" ], - "chip.ctl_fastqs_rep1_R2" : [ "ctl1_R2.fastq.gz" ], - "chip.ctl_fastqs_rep2_R1" : [ "ctl2_R1.fastq.gz" ], - "chip.ctl_fastqs_rep2_R2" : [ "ctl2_R2.fastq.gz" ], - - // If you start from BAMs then define these, otherwise remove from this file. - // You can define up to 6 replicates. The following example array has two replicates. - "chip.bams" : [ - "raw_rep1.bam", - "raw_rep2.bam" - ], - // Define if you have control BAMs otherwise remove from this file. - "chip.ctl_bams" : [ - "raw_ctl1.bam", - "raw_ctl2.bam" - ], - - // If you start from filtered/deduped BAMs then define these, otherwise remove from this file. - // You can define up to 6 replicates. The following example array has two replicates. - "chip.nodup_bams" : [ - "nodup_rep1.bam", - "nodup_rep2.bam" - ], - // Define if you have control filtered/deduped BAMs otherwise remove from this file. - "chip.ctl_nodup_bams" : [ - "nodup_ctl1.bam", - "nodup_ctl2.bam" - ], - - // If you start from TAG-ALIGNs then define these, otherwise remove from this file. - // You can define up to 6 replicates. The following example array has two replicates. - "chip.tas" : [ - "rep1.tagAlign.gz", - "rep2.tagAlign.gz" - ], - // Define if you have control TAG-ALIGNs otherwise remove from this file. 
- "chip.ctl_tas" : [ - "ctl1.tagAlign.gz", - "ctl2.tagAlign.gz" - ], - - ////////// 3) Pipeline parameters ////////// - - // Pipeline title and description - "chip.title" : "Example (single-ended)", - "chip.description" : "This is an template input JSON for single-ended sample.", - - // Pipeline type (tf or histone). - // default peak_caller: spp for tf, macs2 for histone - "chip.pipeline_type" : "tf", - // You can also manually specify a peak_caller - "chip.peak_caller" : "spp", - - // Pipeline will not proceed to post alignment steps (peak-calling, ...). - // You will get QC report for alignment only. - "chip.align_only" : false, - "chip.true_rep_only" : false, - - // Disable deeptools fingerprint (JS distance) - "chip.disable_fingerprint" : false, - - // Enable count signal track generation - "chip.enable_count_signal_track" : false, - - // Trim R1 of paired ended fastqs for cross-correlation analysis only - // Trimmed fastqs will not be used for any other analyses - "chip.xcor_pe_trim_bp" : 50, - - // Use filtered PE BAM/TAG-ALIGN for cross-correlation analysis ignoring the above trimmed R1 fastq - "chip.use_filt_pe_ta_for_xcor" : false, - - // Choose a dup marker between picard and sambamba - // picard is recommended, use sambamba only when picard fails. - "chip.dup_marker" : "picard", - - // Threshold for mapped reads quality (samtools view -q) - "chip.mapq_thresh" : 30, - - // Skip dup removal in a BAM filtering stage. - "chip.no_dup_removal" : false, - - // Name of mito chromosome. THIS IS NOT A REG-EX! you can define only one chromosome name for mito. 
- "chip.mito_chr_name" : "chrM", - - // Regular expression to filter out reads with given chromosome name (1st column of BED/TAG-ALIGN) - // Any read with chr name that matches with this reg-ex pattern will be removed from outputs - // If your have changed the above parameter "chip.mito_chr_name" and still want to filter out mito reads, - // then make sure that "chip.mito_chr_name" and "chip.regex_filter_reads" are the same. - "chip.regex_filter_reads" : "chrM", - - // Subsample reads (0: no subsampling) - // Subsampled reads will be used for all downsteam analyses including peak-calling - "chip.subsample_reads" : 0, - "chip.ctl_subsample_reads" : 0, - - // Cross-correlation analysis - // Subsample reads for cross-corr. analysis only (0: no subsampling) - // Subsampled reads will be used for cross-corr. analysis only - "chip.xcor_subsample_reads" : 15000000, - - // Keep irregular chromosome names - // Use this for custom genomes without canonical chromosome names (chr1, chrX, ...) - "chip.keep_irregular_chr_in_bfilt_peak" : false, - - // Choosing an appropriate control for each replicate - // Always use a pooled control to compare with each replicate. - // If a single control is given then use it. - "chip.always_use_pooled_ctl" : false, - // If ratio of depth between controls is higher than this - // then always use a pooled control for all replicates. - "chip.ctl_depth_ratio" : 1.2, - - // Cap number of peaks called from a peak-caller (MACS2) - "chip.macs2_cap_num_peak" : 500000, - // P-value threshold for MACS2 (macs2 callpeak -p) - "chip.pval_thresh" : 0.01, - - // IDR (irreproducible discovery rate) - // Threshold for IDR - "chip.idr_thresh" : 0.05, - - // Cap number of peaks called from a peak-caller (SPP) - "chip.spp_cap_num_peak" : 300000, - - ////////// 5) Resource settings ////////// - - // Resources defined here are PER REPLICATE. 
- // Therefore, total number of cores will be MAX(["chip.bwa_cpu"] x [NUMBER_OF_REPLICATES], "chip.spp_cpu" x 2 x [NUMBER_OF_REPLICATES]) - // because bwa and spp are bottlenecking tasks of the pipeline. - // Use this total number of cores if you manually qsub or sbatch your job (using local mode of our pipeline). - // "disks" is used for Google Cloud and DNAnexus only. - - "chip.bwa_cpu" : 4, - "chip.bwa_mem_mb" : 20000, - "chip.bwa_time_hr" : 48, - "chip.bwa_disks" : "local-disk 100 HDD", - - "chip.filter_cpu" : 2, - "chip.filter_mem_mb" : 20000, - "chip.filter_time_hr" : 24, - "chip.filter_disks" : "local-disk 100 HDD", - - "chip.bam2ta_cpu" : 2, - "chip.bam2ta_mem_mb" : 10000, - "chip.bam2ta_time_hr" : 6, - "chip.bam2ta_disks" : "local-disk 100 HDD", - - "chip.spr_mem_mb" : 16000, - - "chip.fingerprint_cpu" : 2, - "chip.fingerprint_mem_mb" : 12000, - "chip.fingerprint_time_hr" : 6, - "chip.fingerprint_disks" : "local-disk 100 HDD", - - "chip.xcor_cpu" : 2, - "chip.xcor_mem_mb" : 16000, - "chip.xcor_time_hr" : 24, - "chip.xcor_disks" : "local-disk 100 HDD", - - "chip.macs2_mem_mb" : 16000, - "chip.macs2_time_hr" : 24, - "chip.macs2_disks" : "local-disk 100 HDD", - - "chip.spp_cpu" : 2, - "chip.spp_mem_mb" : 16000, - "chip.spp_time_hr" : 72, - "chip.spp_disks" : "local-disk 100 HDD", + "chip.fastqs_rep1_R1" : ["rep1.fastq.gz"], + "chip.fastqs_rep3_R1" : ["rep3.fastq.gz"], + "chip.bams" : [null, "rep2.bam", null, null, null], + "chip.nodup_bams" : [null, null, null, "rep4.nodup.bam", null], + "chip.tas" : [null, null, null, null, "rep5.tagAlign.gz"] } ``` + +## Optional filtering parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.mapq_thresh` | 30 | Threshold for mapped reads quality (samtools view -q) +`chip.dup_marker` | `picard` | Choose a dup marker between `picard` and `sambamba`. `picard` is recommended, use `sambamba` only when picard fails. +`chip.no_dup_removal` | false | Skip dup removal in a BAM filtering stage. 
+ +## Optional subsampling parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.subsample_reads` | 0 | Subsample reads (0: no subsampling). Subsampled reads will be used for all downstream analyses including peak-calling +`chip.ctl_subsample_reads` | 0 | Subsample control reads. +`chip.xcor_subsample_reads` | 15000000 | Subsample reads for cross-corr. analysis only (0: no subsampling). Subsampled reads will be used for cross-corr. analysis only + +## Optional cross-correlation analysis parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.xcor_pe_trim_bp` | 50 | Trim R1 of paired ended fastqs for cross-correlation analysis only. Trimmed fastqs will not be used for any other analyses +`chip.use_filt_pe_ta_for_xcor` | false | Use filtered PE BAM/TAG-ALIGN for cross-correlation analysis ignoring the above trimmed R1 fastq + +## Optional control parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.always_use_pooled_ctl` | false | Choosing an appropriate control for each replicate. Always use a pooled control to compare with each replicate. If a single control is given then use it. +`chip.ctl_depth_ratio` | 1.2 | If ratio of depth between controls is higher than this, then always use a pooled control for all replicates. + +## Optional peak-calling parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.peak_caller`| `spp` for `tf` type
`macs2` for `histone` type| `spp` or `macs2`. `spp` requires control
`macs2` can work without controls +`chip.macs2_cap_num_peak` | 500000 | Cap number of peaks called from a peak-caller (MACS2) +`chip.pval_thresh` | 0.01 | P-value threshold for MACS2 (macs2 callpeak -p) +`chip.idr_thresh` | 0.05 | Threshold for IDR (irreproducible discovery rate) +`chip.spp_cap_num_peak` | 300000 | Cap number of peaks called from a peak-caller (SPP) + +## Optional pipeline flags + +Parameter|Default|Description +---------|-------|----------- +`chip.disable_fingerprint` | false | Disable deeptools fingerprint (JS distance) +`chip.enable_count_signal_track` | false | Enable count signal track generation +`chip.keep_irregular_chr_in_bfilt_peak` | false | Keep irregular chromosome names. Use this for custom genomes without canonical chromosome names (chr1, chrX, ...) + +## Other optional parameters + +Parameter|Default|Description +---------|-------|----------- +`chip.mito_chr_name` | `chrM` | Name of mito chromosome. THIS IS NOT A REG-EX! you can define only one chromosome name for mito. +`chip.regex_filter_reads` | `chrM` | Regular expression to filter out reads with given chromosome name (1st column of BED/TAG-ALIGN). Any read with chr name that matches with this reg-ex pattern will be removed from outputs. If you have changed the above parameter `chip.mito_chr_name` and still want to filter out mito reads, then make sure that `chip.mito_chr_name` and `chip.regex_filter_reads` are the same. + +## Resource parameters + +> **WARNING*: It is recommened not to change the following parameters unless you get resource-related errors for a certain task and you want to increase resources for such task. The following parameters are provided for users who want to run our pipeline with Caper's `local` on HPCs and 2). + +Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(`chip.bwa_cpu` x `NUMBER_OF_REPLICATES`, `chip.spp_cpu` x 2 x `NUMBER_OF_REPLICATES`) because bwa and spp are bottlenecking tasks of the pipeline. 
Use this total number of cores if you manually `qsub` or `sbatch` your job (using local mode of Caper). `disks` is used for Google Cloud and DNAnexus only. + +Parameter|Default +---------|------- +`chip.bwa_cpu` | 4 +`chip.bwa_mem_mb` | 20000 +`chip.bwa_time_hr` | 48 +`chip.bwa_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.filter_cpu` | 2 +`chip.filter_mem_mb` | 20000 +`chip.filter_time_hr` | 24 +`chip.filter_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.bam2ta_cpu` | 2 +`chip.bam2ta_mem_mb` | 10000 +`chip.bam2ta_time_hr` | 6 +`chip.bam2ta_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.spr_mem_mb` | 16000 + +Parameter|Default +---------|------- +`chip.fingerprint_cpu` | 2 +`chip.fingerprint_mem_mb` | 12000 +`chip.fingerprint_time_hr` | 6 +`chip.fingerprint_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.xcor_cpu` | 2 +`chip.xcor_mem_mb` | 16000 +`chip.xcor_time_hr` | 24 +`chip.xcor_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.macs2_mem_mb` | 16000 +`chip.macs2_time_hr` | 24 +`chip.macs2_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.spp_cpu` | 2 +`chip.spp_mem_mb` | 16000 +`chip.spp_time_hr` | 72 +`chip.spp_disks` | `local-disk 100 HDD` +``` diff --git a/docs/install_conda.md b/docs/install_conda.md new file mode 100644 index 00000000..f0016bbc --- /dev/null +++ b/docs/install_conda.md @@ -0,0 +1,37 @@ +## How to download genome database + +1. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. + ```bash + $ bash genome/download_genome_data.sh [GENOME] [DESTINATION_DIR] + ``` +2. Find a TSV file on the destination directory and use it for `"chip.genome_tsv"` in your input JSON. + +# How to install pipeline's Conda environment + +1) Install [Conda](https://docs.conda.io/en/latest/miniconda.html). + +2) Install Conda environment for pipeline. 
+ + ```bash + $ conda/install_dependencies.sh + ``` + +3) Initialize Conda and re-login. + + ```bash + $ conda init bash + $ exit + ``` + +4) Configure pipeline's python2 and python3 environments. + + ```bash + $ conda/config_conda_env.sh + $ conda/config_conda_env_py3.sh + ``` + +5) Update pipeline's Conda environment with pipeline's python source code. You need to run this step every time you update (`git pull`) this pipeline. + + ```bash + $ conda/update_conda_env.sh + ``` diff --git a/examples/template_pe.full.json b/examples/template.full.json similarity index 96% rename from examples/template_pe.full.json rename to examples/template.full.json index 78ef8a09..b934bcce 100644 --- a/examples/template_pe.full.json +++ b/examples/template.full.json @@ -1,7 +1,17 @@ { + "chip.title" : "Example (paired-end)", + "chip.description" : "This is an template input JSON for paired-end sample.", + + "chip.pipeline_type" : "tf", + "chip.align_only" : false, + "chip.true_rep_only" : false, + "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", + "chip.paired_end" : true, - "chip.ctl_paired_end" : true, + "chip.ctl_paired_end" : true, + "chip.paired_ends" : [true, true], + "chip.ctl_paired_ends" : [true, true], "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], "chip.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], @@ -40,42 +50,33 @@ "ctl2.tagAlign.gz" ], - "chip.title" : "Example (paired-end)", - "chip.description" : "This is an template input JSON for paired-end sample.", - - "chip.pipeline_type" : "tf", - "chip.peak_caller" : null, - - "chip.align_only" : false, - "chip.true_rep_only" : false, - - "chip.disable_fingerprint" : false, - "chip.enable_count_signal_track" : false, - - "chip.xcor_pe_trim_bp" : 50, - "chip.use_filt_pe_ta_for_xcor" : false, - - "chip.dup_marker" : "picard", - "chip.mapq_thresh" : 30, + "chip.dup_marker" : "picard", "chip.no_dup_removal" : false, - 
"chip.mito_chr_name" : "chrM", - "chip.regex_filter_reads" : "chrM", "chip.subsample_reads" : 0, "chip.ctl_subsample_reads" : 0, "chip.xcor_subsample_reads" : 15000000, - "chip.keep_irregular_chr_in_bfilt_peak" : false, + "chip.xcor_pe_trim_bp" : 50, + "chip.use_filt_pe_ta_for_xcor" : false, "chip.always_use_pooled_ctl" : false, "chip.ctl_depth_ratio" : 1.2, + "chip.peak_caller" : null, "chip.macs2_cap_num_peak" : 500000, "chip.pval_thresh" : 0.01, "chip.idr_thresh" : 0.05, "chip.spp_cap_num_peak" : 300000, + "chip.disable_fingerprint" : false, + "chip.enable_count_signal_track" : false, + "chip.keep_irregular_chr_in_bfilt_peak" : false, + + "chip.mito_chr_name" : "chrM", + "chip.regex_filter_reads" : "chrM", + "chip.bwa_cpu" : 4, "chip.bwa_mem_mb" : 20000, "chip.bwa_time_hr" : 48, diff --git a/examples/template_pe.json b/examples/template.json similarity index 85% rename from examples/template_pe.json rename to examples/template.json index af9d8001..f4b6d567 100644 --- a/examples/template_pe.json +++ b/examples/template.json @@ -1,5 +1,18 @@ { + "chip.title" : "Example (paired-end)", + "chip.description" : "This is an template input JSON for paired-end sample.", + + "chip.pipeline_type" : "tf", + "chip.align_only" : false, + "chip.true_rep_only" : false, + "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", + "chip.ref_fa" : null, + "chip.bwa_idx_tar" : null, + "chip.chrsz" : null, + "chip.blacklist" : null, + "chip.gensz" : null, + "chip.paired_end" : true, "chip.ctl_paired_end" : true, @@ -40,15 +53,11 @@ "ctl2.tagAlign.gz" ], - "chip.title" : "Example (paired-end)", - "chip.description" : "This is an template input JSON for paired-end sample.", - - "chip.pipeline_type" : "tf", - - "chip.align_only" : false, - "chip.disable_fingerprint" : false, - "chip.enable_count_signal_track" : false, "chip.always_use_pooled_ctl" : false, "chip.ctl_depth_ratio" : 1.2, - "chip.idr_thresh" : 0.05 + + "chip.idr_thresh" : 0.05, + + "chip.disable_fingerprint" : false, + 
"chip.enable_count_signal_track" : false } \ No newline at end of file diff --git a/examples/template_se.full.json b/examples/template_se.full.json deleted file mode 100644 index 45f825bd..00000000 --- a/examples/template_se.full.json +++ /dev/null @@ -1,109 +0,0 @@ -{ - "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", - "chip.paired_end" : false, - "chip.ctl_paired_end" : true, - - "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], - "chip.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], - - "chip.ctl_fastqs_rep1_R1" : [ "ctl1_R1.fastq.gz" ], - "chip.ctl_fastqs_rep2_R1" : [ "ctl2_R1.fastq.gz" ], - - "chip.bams" : [ - "raw_rep1.bam", - "raw_rep2.bam" - ], - "chip.ctl_bams" : [ - "raw_ctl1.bam", - "raw_ctl2.bam" - ], - - "chip.nodup_bams" : [ - "nodup_rep1.bam", - "nodup_rep2.bam" - ], - "chip.ctl_nodup_bams" : [ - "nodup_ctl1.bam", - "nodup_ctl2.bam" - ], - - "chip.tas" : [ - "rep1.tagAlign.gz", - "rep2.tagAlign.gz" - ], - "chip.ctl_tas" : [ - "ctl1.tagAlign.gz", - "ctl2.tagAlign.gz" - ], - - "chip.title" : "Example (single-ended)", - "chip.description" : "This is an template input JSON for single-ended sample.", - - "chip.pipeline_type" : "tf", - "chip.peak_caller" : null, - - "chip.align_only" : false, - "chip.true_rep_only" : false, - - "chip.disable_fingerprint" : false, - "chip.enable_count_signal_track" : false, - - "chip.xcor_pe_trim_bp" : 50, - - "chip.dup_marker" : "picard", - - "chip.mapq_thresh" : 30, - "chip.no_dup_removal" : false, - - "chip.mito_chr_name" : "chrM", - "chip.regex_filter_reads" : "chrM", - "chip.subsample_reads" : 0, - "chip.ctl_subsample_reads" : 0, - "chip.xcor_subsample_reads" : 15000000, - - "chip.keep_irregular_chr_in_bfilt_peak" : false, - - "chip.always_use_pooled_ctl" : false, - "chip.ctl_depth_ratio" : 1.2, - - "chip.macs2_cap_num_peak" : 500000, - "chip.pval_thresh" : 0.01, - "chip.idr_thresh" : 0.05, - "chip.spp_cap_num_peak" : 300000, - - 
"chip.bwa_cpu" : 4, - "chip.bwa_mem_mb" : 20000, - "chip.bwa_time_hr" : 48, - "chip.bwa_disks" : "local-disk 200 HDD", - - "chip.filter_cpu" : 2, - "chip.filter_mem_mb" : 20000, - "chip.filter_time_hr" : 24, - "chip.filter_disks" : "local-disk 400 HDD", - - "chip.bam2ta_cpu" : 2, - "chip.bam2ta_mem_mb" : 10000, - "chip.bam2ta_time_hr" : 6, - "chip.bam2ta_disks" : "local-disk 100 HDD", - - "chip.spr_mem_mb" : 16000, - - "chip.fingerprint_cpu" : 2, - "chip.fingerprint_mem_mb" : 12000, - "chip.fingerprint_time_hr" : 6, - "chip.fingerprint_disks" : "local-disk 200 HDD", - - "chip.xcor_cpu" : 2, - "chip.xcor_mem_mb" : 16000, - "chip.xcor_time_hr" : 24, - "chip.xcor_disks" : "local-disk 100 HDD", - - "chip.macs2_mem_mb" : 16000, - "chip.macs2_time_hr" : 24, - "chip.macs2_disks" : "local-disk 200 HDD", - - "chip.spp_cpu" : 2, - "chip.spp_mem_mb" : 16000, - "chip.spp_time_hr" : 72, - "chip.spp_disks" : "local-disk 200 HDD" -} diff --git a/examples/template_se.json b/examples/template_se.json deleted file mode 100644 index d54e6333..00000000 --- a/examples/template_se.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "chip.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", - "chip.paired_end" : false, - "chip.ctl_paired_end" : true, - - "chip.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], - "chip.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], - - "chip.ctl_fastqs_rep1_R1" : [ "ctl1_R1.fastq.gz" ], - "chip.ctl_fastqs_rep2_R1" : [ "ctl2_R1.fastq.gz" ], - - "chip.bams" : [ - "raw_rep1.bam", - "raw_rep2.bam" - ], - "chip.ctl_bams" : [ - "raw_ctl1.bam", - "raw_ctl2.bam" - ], - - "chip.nodup_bams" : [ - "nodup_rep1.bam", - "nodup_rep2.bam" - ], - "chip.ctl_nodup_bams" : [ - "nodup_ctl1.bam", - "nodup_ctl2.bam" - ], - - "chip.tas" : [ - "rep1.tagAlign.gz", - "rep2.tagAlign.gz" - ], - "chip.ctl_tas" : [ - "ctl1.tagAlign.gz", - "ctl2.tagAlign.gz" - ], - - "chip.title" : "Example (single-ended)", - "chip.description" : "This is 
an template input JSON for single-ended sample.", - - "chip.pipeline_type" : "tf", - - "chip.align_only" : false, - "chip.disable_fingerprint" : false, - "chip.enable_count_signal_track" : false, - "chip.always_use_pooled_ctl" : false, - "chip.ctl_depth_ratio" : 1.2, - "chip.idr_thresh" : 0.05 -} \ No newline at end of file From 73d6671847308aecf0ef0d14e577768898d7fefb Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Sat, 8 Jun 2019 21:45:13 -0700 Subject: [PATCH 08/14] fix typo in doc --- docs/input.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/input.md b/docs/input.md index 2194d424..d5f07a24 100644 --- a/docs/input.md +++ b/docs/input.md @@ -163,7 +163,7 @@ Parameter|Default|Description ## Resource parameters -> **WARNING*: It is recommened not to change the following parameters unless you get resource-related errors for a certain task and you want to increase resources for such task. The following parameters are provided for users who want to run our pipeline with Caper's `local` on HPCs and 2). +> **WARNING**: It is recommened not to change the following parameters unless you get resource-related errors for a certain task and you want to increase resources for such task. The following parameters are provided for users who want to run our pipeline with Caper's `local` on HPCs and 2). Resources defined here are PER REPLICATE. Therefore, total number of cores will be MAX(`chip.bwa_cpu` x `NUMBER_OF_REPLICATES`, `chip.spp_cpu` x 2 x `NUMBER_OF_REPLICATES`) because bwa and spp are bottlenecking tasks of the pipeline. Use this total number of cores if you manually `qsub` or `sbatch` your job (using local mode of Caper). `disks` is used for Google Cloud and DNAnexus only. 
From 7dfca46ceb7b14b31017face24bbf7eea80a602a Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 10 Jun 2019 12:00:03 -0700 Subject: [PATCH 09/14] bug fix for idr/overlap in qc_report --- chip.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/chip.wdl b/chip.wdl index 7d847d43..a4b4da39 100644 --- a/chip.wdl +++ b/chip.wdl @@ -956,8 +956,8 @@ workflow chip { } Array[Pair[Int, Int]] pairs = select_all(pairs__) - if ( !align_only ) { - scatter( pair in pairs ) { + scatter( pair in pairs ) { + if ( !align_only ) { # pair.left = 0-based index of 1st replicate # pair.right = 0-based index of 2nd replicate # Naive overlap on every pair of true replicates @@ -976,8 +976,8 @@ workflow chip { } } - if ( enable_idr && !align_only ) { - scatter( pair in pairs ) { + scatter( pair in pairs ) { + if ( enable_idr && !align_only ) { # pair.left = 0-based index of 1st replicate # pair.right = 0-based index of 2nd replicate # IDR on every pair of true replicates @@ -1685,7 +1685,7 @@ task overlap { task reproducibility { String prefix - Array[File] peaks # peak files from pair of true replicates + Array[File?] peaks # peak files from pair of true replicates # in a sorted order. for example of 4 replicates, # 1,2 1,3 1,4 2,3 2,4 3,4. # x,y means peak file from rep-x vs rep-y From 42d3c25ae01dae3dd7db173fbd539a9eb50cbd0e Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 11 Jun 2019 17:25:56 -0700 Subject: [PATCH 10/14] testing on dx done after refactoring --- chip.wdl | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/chip.wdl b/chip.wdl index a4b4da39..01ef86f3 100644 --- a/chip.wdl +++ b/chip.wdl @@ -486,7 +486,7 @@ workflow chip { # before peak calling, get fragment length from xcor analysis or given input # if fraglen [] is defined in the input JSON, fraglen from xcor will be ignored - Int? fraglen_ = if length(fraglen)>0 then fraglen[i] + Int? 
fraglen_ = if i 1 ) { - # rounded mean of fragment length, which will be used for - # 1) calling peaks for pooled true/pseudo replicates - # 2) calculating FRiP - call rounded_mean as fraglen_mean { input : - ints = fraglen_, - } + # if ( !align_only && num_rep > 1 ) { + # rounded mean of fragment length, which will be used for + # 1) calling peaks for pooled true/pseudo replicates + # 2) calculating FRiP + call rounded_mean as fraglen_mean { input : + ints = fraglen_tmp, } + # } # actually not an array Array[File?] chosen_ctl_ta_pooled = if !has_all_input_of_choose_ctl then [] @@ -956,8 +958,8 @@ workflow chip { } Array[Pair[Int, Int]] pairs = select_all(pairs__) - scatter( pair in pairs ) { - if ( !align_only ) { + if ( !align_only ) { + scatter( pair in pairs ) { # pair.left = 0-based index of 1st replicate # pair.right = 0-based index of 2nd replicate # Naive overlap on every pair of true replicates @@ -976,8 +978,8 @@ workflow chip { } } - scatter( pair in pairs ) { - if ( enable_idr && !align_only ) { + if ( enable_idr && !align_only ) { + scatter( pair in pairs ) { # pair.left = 0-based index of 1st replicate # pair.right = 0-based index of 2nd replicate # IDR on every pair of true replicates @@ -1685,7 +1687,7 @@ task overlap { task reproducibility { String prefix - Array[File?] peaks # peak files from pair of true replicates + Array[File]? peaks # peak files from pair of true replicates # in a sorted order. for example of 4 replicates, # 1,2 1,3 1,4 2,3 2,4 3,4. # x,y means peak file from rep-x vs rep-y @@ -1755,7 +1757,7 @@ task qc_report { Array[File?] xcor_scores File? jsd_plot Array[File?] jsd_qcs - Array[File?] idr_plots + Array[File]? idr_plots Array[File?] idr_plots_pr File? idr_plot_ppr Array[File?] frip_macs2_qcs @@ -1770,7 +1772,7 @@ task qc_report { File? frip_spp_qc_pooled File? frip_spp_qc_ppr1 File? frip_spp_qc_ppr2 - Array[File?] frip_idr_qcs + Array[File]? frip_idr_qcs Array[File?] frip_idr_qcs_pr File? frip_idr_qc_ppr Array[File?] 
frip_overlap_qcs @@ -1883,7 +1885,7 @@ task read_genome_tsv { } task rounded_mean { - Array[Int?] ints + Array[Int] ints command <<< python < Date: Thu, 13 Jun 2019 15:19:05 -0700 Subject: [PATCH 11/14] added croo out_def.json for v1.1.7 pipeline --- croo/chip.out_def.v1.1.7.json | 666 ++++++++++++++++++++++++++++++++++ 1 file changed, 666 insertions(+) create mode 100644 croo/chip.out_def.v1.1.7.json diff --git a/croo/chip.out_def.v1.1.7.json b/croo/chip.out_def.v1.1.7.json new file mode 100644 index 00000000..64281d20 --- /dev/null +++ b/croo/chip.out_def.v1.1.7.json @@ -0,0 +1,666 @@ +{ + "chip.fingerprint": { + "plot": { + "path": "qc/${basename}", + "table": "QC and logs/Deeptools fingerprint plot" + } + }, + "chip.bwa_ctl": { + "bam": { + "path": "align/ctl${i+1}/${basename}", + "table": "Alignment/Control ${i+1}/Raw BAM from bwa" + }, + "flagstat_qc": { + "path": "qc/ctl${i+1}/${basename}", + "table": "QC and logs/Control ${i+1}/Samtools flagstat for Raw BAM" + } + }, + "chip.filter_ctl": { + "nodup_bam": { + "path": "align/ctl${i+1}/${basename}", + "table": "Alignment/Control ${i+1}/Filtered BAM" + }, + "flagstat_qc": { + "path": "qc/ctl${i+1}/${basename}", + "table": "QC and logs/Control ${i+1}/Samtools flagstat log for filtered BAM" + }, + "dup_qc": { + "path": "qc/ctl${i+1}/${basename}", + "table": "QC and logs/Control ${i+1}/Dup QC for filtered BAM" + }, + "pbc_qc": { + "path": "qc/ctl${i+1}/${basename}", + "table": "QC and logs/Control ${i+1}/PBC QC for filtered BAM" + }, + "mito_dup_log": { + "path": "qc/ctl${i+1}/${basename}", + "table": "QC and logs/Control ${i+1}/Mito dup log for filtered BAM" + } + }, + "chip.bam2ta_ctl": { + "ta": { + "path": "align/ctl${i+1}/${basename}", + "table": "Alignment/Control ${i+1}/TAG-ALIGN" + } + }, + "chip.pool_ta_ctl": { + "ta_pooled": { + "path": "align/pooled-ctl/${basename}", + "table": "Alignment/Pooled control/TAG-ALIGN" + } + }, + "chip.bwa": { + "bam": { + "path": "align/rep${i+1}/${basename}", + "table": 
"Alignment/Replicate ${i+1}/Raw BAM from bwa" + }, + "flagstat_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Samtools flagstat for Raw BAM" + } + }, + "chip.filter": { + "nodup_bam": { + "path": "align/rep${i+1}/${basename}", + "table": "Alignment/Replicate ${i+1}/Filtered BAM" + }, + "flagstat_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Samtools flagstat log for filtered BAM" + }, + "dup_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Dup QC for filtered BAM" + }, + "pbc_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/PBC QC for filtered BAM" + }, + "mito_dup_log": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Mito dup log for filtered BAM" + } + }, + "chip.bam2ta": { + "ta": { + "path": "align/rep${i+1}/${basename}", + "table": "Alignment/Replicate ${i+1}/TAG-ALIGN" + } + }, + "chip.spr": { + "ta_pr1": { + "path": "align/rep${i+1}/pseudorep1/${basename}", + "table": "Alignment/Replicate ${i+1}/Pseudoreplicate 1/TAG-ALIGN" + }, + "ta_pr2": { + "path": "align/rep${i+1}/pseudorep2/${basename}", + "table": "Alignment/Replicate ${i+1}/Pseudoreplicate 2/TAG-ALIGN" + } + }, + "chip.pool_ta": { + "ta_pooled": { + "path": "align/pooled-rep/${basename}", + "table": "Alignment/Pooled replicate/TAG-ALIGN" + } + }, + "chip.pool_ta_pr1": { + "ta_pooled": { + "path": "align/pooled-rep/pseudorep1/${basename}", + "table": "Alignment/Pooled replicate/Pseudoreplicate 1/TAG-ALIGN" + } + }, + "chip.pool_ta_pr2": { + "ta_pooled": { + "path": "align/pooled-rep/pseudorep2/${basename}", + "table": "Alignment/Pooled replicate/Pseudoreplicate 2/TAG-ALIGN" + } + }, + "chip.xcor": { + "plot_png": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Cross-correlation Plot" + }, + "score": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate 
${i+1}/Cross-correlation score" + }, + "fraglen": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Estimated fragment length" + } + }, + "chip.macs2": { + "npeak": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Raw MACS2 narrowpeak" + }, + "bfilt_npeak": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Fraction of MACS2 peaks in reads" + }, + "sig_pval": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)" + }, + "sig_fc": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)" + } + }, + "chip.macs2_pr1": { + "npeak": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Raw MACS2 narrowpeak" + }, + "bfilt_npeak": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/pseudorep1/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudoreplicate 1/Fraction of MACS2 peaks in reads" + } + }, + 
"chip.macs2_pr2": { + "npeak": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Raw MACS2 narrowpeak" + }, + "bfilt_npeak": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/pseudorep2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudoreplicate 2/Fraction of MACS2 peaks in reads" + } + }, + "chip.macs2_pooled": { + "npeak": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Raw MACS2 narrowpeak" + }, + "bfilt_npeak": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/${basename}", + "table": "QC and logs/Pooled replicate/Fraction of MACS2 peaks in reads" + }, + "sig_pval": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/MACS2 signal track (p-val)" + }, + "sig_fc": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)" + } + }, + "chip.macs2_ppr1": { + "npeak": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Raw MACS2 narrowpeak" + }, + 
"bfilt_npeak": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/pseudorep1/${basename}", + "table": "QC and logs/Pooled replicate/Pseudoreplicate 1/Fraction of MACS2 peaks in reads" + } + }, + "chip.macs2_ppr2": { + "npeak": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Raw MACS2 narrowpeak" + }, + "bfilt_npeak": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak" + }, + "bfilt_npeak_bb": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak (BigBed)" + }, + "bfilt_npeak_hammock": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered MACS2 narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/pseudorep2/${basename}", + "table": "QC and logs/Pooled replicate/Pseudoreplicate 2/Fraction of MACS2 peaks in reads" + } + }, + "chip.spp": { + "rpeak": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered SPP narrowpeak (BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": 
"peak/rep${i+1}/${basename}", + "table": "Peak/Replicate ${i+1}/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/${basename}", + "table": "QC and logs/Replicate ${i+1}/Fraction of SPP peaks in reads" + } + }, + "chip.spp_pr1": { + "rpeak": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak (BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": "peak/rep${i+1}/pseudorep1/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/pseudorep1/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudoreplicate 1/Fraction of SPP peaks in reads" + } + }, + "chip.spp_pr2": { + "rpeak": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak (BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": "peak/rep${i+1}/pseudorep2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}/pseudorep2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudoreplicate 2/Fraction of SPP peaks in reads" + } + }, + "chip.spp_pooled": { + "rpeak": { + "path": 
"peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered SPP narrowpeak (BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": "peak/pooled-rep/${basename}", + "table": "Peak/Pooled replicate/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/${basename}", + "table": "QC and logs/Pooled replicate/Fraction of SPP peaks in reads" + } + }, + "chip.spp_ppr1": { + "rpeak": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak (BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": "peak/pooled-rep/pseudorep1/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 1/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/pseudorep1/${basename}", + "table": "QC and logs/Pooled replicate/Pseudoreplicate 1/Fraction of SPP peaks in reads" + } + }, + "chip.spp_ppr2": { + "rpeak": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Raw SPP narrowpeak" + }, + "bfilt_rpeak": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak" + }, + "bfilt_rpeak_bb": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak 
(BigBed)" + }, + "bfilt_rpeak_hammock": { + "path": "peak/pooled-rep/pseudorep2/${basename}", + "table": "Peak/Pooled replicate/Pseudoreplicate 2/Blacklist-filtered SPP narrowpeak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-rep/pseudorep2/${basename}", + "table": "QC and logs/Pooled replicate/Pseudoreplicate 2/Fraction of SPP peaks in reads" + } + }, + "chip.macs2_signal_track": { + "pval_bw": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)" + }, + "fc_bw": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)" + } + }, + "chip.macs2_signal_track_pooled": { + "pval_bw": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/MACS2 signal track (p-val)" + }, + "fc_bw": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)" + } + }, + "chip.count_signal_track": { + "pos_bw": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/Count signal track (positive)" + }, + "neg_bw": { + "path": "signal/rep${i+1}/${basename}", + "table": "Signal/Replicate ${i+1}/Count signal track (negative)" + } + }, + "chip.count_signal_track_pooled": { + "pos_bw": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/Count signal track (positive)" + }, + "neg_bw": { + "path": "signal/pooled-rep/${basename}", + "table": "Signal/Pooled replicate/Count signal track (negative)" + } + }, + "chip.idr": { + "bfilt_idr_peak": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Blacklist-filtered IDR peak" + }, + "bfilt_idr_peak_bb": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. 
').replace('-',' vs. ').capitalize()}/Blacklist-filtered IDR peak (BigBed)" + }, + "bfilt_idr_peak_hammock": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Blacklist-filtered IDR peak (hammock)" + }, + "idr_unthresholded_peak": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Unthresholded raw IDR peak" + }, + "idr_plot": { + "path": "qc/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "QC and logs/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/IDR plot" + }, + "idr_log": { + "path": "qc/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "QC and logs/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/IDR log" + }, + "frip_qc": { + "path": "qc/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "QC and logs/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Fraction of IDR peaks in reads" + } + }, + "chip.idr_ppr": { + "bfilt_idr_peak": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered IDR peak" + }, + "bfilt_idr_peak_bb": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered IDR peak (BigBed)" + }, + "bfilt_idr_peak_hammock": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered IDR peak (hammock)" + }, + "idr_unthresholded_peak": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 
2/Unthresholded raw IDR peak" + }, + "idr_plot": { + "path": "qc/pooled-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Pooled pseudoreplicate 1 vs. 2/IDR plot" + }, + "idr_log": { + "path": "qc/pooled-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Pooled pseudoreplicate 1 vs. 2/IDR log" + }, + "frip_qc": { + "path": "qc/pooled-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Pooled pseudoreplicate 1 vs. 2/Fraction of IDR peaks in reads" + } + }, + "chip.idr_pr": { + "bfilt_idr_peak": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Blacklist-filtered IDR peak" + }, + "bfilt_idr_peak_bb": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Blacklist-filtered IDR peak (BigBed)" + }, + "bfilt_idr_peak_hammock": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Blacklist-filtered IDR peak (hammock)" + }, + "idr_unthresholded_peak": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Unthresholded raw IDR peak" + }, + "idr_plot": { + "path": "qc/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/IDR plot" + }, + "idr_log": { + "path": "qc/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/IDR log" + }, + "frip_qc": { + "path": "qc/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Fraction of IDR peaks in reads" + } + }, + "chip.overlap": { + "bfilt_overlap_peak": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. 
').capitalize()}/Blacklist-filtered overlap peak" + }, + "bfilt_overlap_peak_bb": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Blacklist-filtered overlap peak (BigBed)" + }, + "bfilt_overlap_peak_hammock": { + "path": "peak/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "Peak/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Blacklist-filtered overlap peak (hammock)" + }, + "frip_qc": { + "path": "qc/${basename.split('.')[0].replace('_','_vs_').replace('-','_vs_')}/${basename}", + "table": "QC and logs/${basename.split('.')[0].replace('_',' vs. ').replace('-',' vs. ').capitalize()}/Fraction of overlap peaks in reads" + } + }, + "chip.overlap_ppr": { + "bfilt_overlap_peak": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered overlap peak" + }, + "bfilt_overlap_peak_bb": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered overlap peak (BigBed)" + }, + "bfilt_overlap_peak_hammock": { + "path": "peak/pooled-pseudorep1_vs_2/${basename}", + "table": "Peak/Pooled pseudoreplicate 1 vs. 2/Blacklist-filtered overlap peak (hammock)" + }, + "frip_qc": { + "path": "qc/pooled-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Pooled pseudoreplicate 1 vs. 2/Fraction of overlap peaks in reads" + } + }, + "chip.overlap_pr": { + "bfilt_overlap_peak": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Blacklist-filtered overlap peak" + }, + "bfilt_overlap_peak_bb": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. 
pseudorep 2/Blacklist-filtered overlap peak (BigBed)" + }, + "bfilt_overlap_peak_hammock": { + "path": "peak/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "Peak/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Blacklist-filtered overlap peak (hammock)" + }, + "frip_qc": { + "path": "qc/rep${i+1}-pseudorep1_vs_2/${basename}", + "table": "QC and logs/Replicate ${i+1}/Pseudorep 1 vs. pseudorep 2/Fraction of overlap peaks in reads" + } + }, + "chip.reproducibility_idr": { + "optimal_peak": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Optimal peak" + }, + "optimal_peak_bb": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Optimal peak (BigBed)" + }, + "optimal_peak_hammock": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Optimal peak (hammock)" + }, + "conservative_peak": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Conservative peak" + }, + "conservative_peak_bb": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Conservative peak (BigBed)" + }, + "conservative_peak_hammock": { + "path": "peak/idr_reproducibility/${basename}", + "table": "Peak/IDR reproducibility/Conservative peak (hammock)" + }, + "reproducibility_qc": { + "path": "qc/${basename}", + "table": "QC and logs/Reproducibility QC for IDR peaks" + } + }, + "chip.reproducibility_overlap": { + "optimal_peak": { + "path": "peak/overlap_reproducibility/${basename}", + "table": "Peak/Overlap reproducibility/Optimal peak" + }, + "optimal_peak_bb": { + "path": "peak/overlap_reproducibility/${basename}", + "table": "Peak/Overlap reproducibility/Optimal peak (BigBed)" + }, + "optimal_peak_hammock": { + "path": "peak/overlap_reproducibility/${basename}", + "table": "Peak/Overlap reproducibility/Optimal peak (hammock)" + }, + "conservative_peak": { + "path": "peak/overlap_reproducibility/${basename}", + 
"table": "Peak/Overlap reproducibility/Conservative peak" + }, + "conservative_peak_bb": { + "path": "peak/overlap_reproducibility/${basename}", + "table": "Peak/Overlap reproducibility/Conservative peak (BigBed)" + }, + "conservative_peak_hammock": { + "path": "peak/overlap_reproducibility/${basename}", + "table": "Peak/Overlap reproducibility/Conservative peak (hammock)" + }, + "reproducibility_qc": { + "path": "qc/${basename}", + "table": "QC and logs/Reproducibility QC for overlap peaks" + } + }, + "chip.qc_report": { + "report": { + "path": "qc/${basename}", + "table": "QC and logs/Final HTML report" + }, + "qc_json": { + "path": "qc/${basename}", + "table": "QC and logs/Final QC JSON file" + } + } +} From b1ea9c11acedcf6d304c05f712a8939b00abd9e3 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Fri, 14 Jun 2019 10:48:32 -0700 Subject: [PATCH 12/14] fix typo in code (genome_tsv) --- chip.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chip.wdl b/chip.wdl index 01ef86f3..769bbc22 100644 --- a/chip.wdl +++ b/chip.wdl @@ -1856,7 +1856,7 @@ task read_genome_tsv { String? null_s command <<< # create empty files for all entries - touch ref_fa bowtie2_idx_tar chrsz gensz blacklist + touch ref_fa bowtie2_idx_tar bwa_idx_tar chrsz gensz blacklist python < Date: Fri, 14 Jun 2019 10:48:57 -0700 Subject: [PATCH 13/14] remove commented lines --- conda/config_conda_env_py3.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/conda/config_conda_env_py3.sh b/conda/config_conda_env_py3.sh index f9213af0..9b51ff15 100755 --- a/conda/config_conda_env_py3.sh +++ b/conda/config_conda_env_py3.sh @@ -18,8 +18,4 @@ else exit 1 fi -#echo "=== Installing additional packages for python3 env..." -#source activate ${CONDA_ENV_PY3} -# pip install caper croo - echo "=== All done." 
From da055f99cd9a48883c910152e18a7691809557e6 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Fri, 14 Jun 2019 20:11:08 -0700 Subject: [PATCH 14/14] ver: v1.2.1 -> v1.2.2 --- chip.wdl | 6 ++--- docs/deprecated/tutorial_local_singularity.md | 4 ++-- docs/deprecated/tutorial_scg.md | 2 +- docs/deprecated/tutorial_scg_backend.md | 4 ++-- docs/deprecated/tutorial_sge.md | 4 ++-- docs/deprecated/tutorial_sge_backend.md | 2 +- docs/deprecated/tutorial_sherlock.md | 2 +- docs/deprecated/tutorial_sherlock_backend.md | 4 ++-- docs/deprecated/tutorial_slurm.md | 4 ++-- docs/deprecated/tutorial_slurm_backend.md | 4 ++-- docs/dev.md | 6 ++--- docs/tutorial_dx_web.md | 24 +++++++++---------- test/test_task/test.sh | 2 +- test/test_workflow/test_chip.sh | 2 +- workflow_opts/docker.json | 2 +- workflow_opts/scg.json | 2 +- workflow_opts/sge.json | 2 +- workflow_opts/sherlock.json | 2 +- workflow_opts/singularity.json | 2 +- workflow_opts/slurm.json | 2 +- 20 files changed, 41 insertions(+), 41 deletions(-) diff --git a/chip.wdl b/chip.wdl index 769bbc22..8b370e18 100644 --- a/chip.wdl +++ b/chip.wdl @@ -1,12 +1,12 @@ # ENCODE DCC TF/Histone ChIP-Seq pipeline # Author: Jin Lee (leepc12@gmail.com) -#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 -#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 +#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 +#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/chip.out_def.json workflow chip { - String pipeline_ver = 'v1.2.1' + String pipeline_ver = 'v1.2.2' ### sample name, description String title = 'Untitled' String description = 'No description' diff --git a/docs/deprecated/tutorial_local_singularity.md b/docs/deprecated/tutorial_local_singularity.md index 98f1bb59..a85f9d87 100644 --- a/docs/deprecated/tutorial_local_singularity.md +++ b/docs/deprecated/tutorial_local_singularity.md @@ -33,7 +33,7 @@ 6. 
Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 ``` 7. Run a pipeline for the test sample. @@ -53,7 +53,7 @@ ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/deprecated/tutorial_scg.md b/docs/deprecated/tutorial_scg.md index cb2a0fa6..4018d8d0 100644 --- a/docs/deprecated/tutorial_scg.md +++ b/docs/deprecated/tutorial_scg.md @@ -63,7 +63,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/deprecated/tutorial_scg_backend.md b/docs/deprecated/tutorial_scg_backend.md index ed7895b1..3fbe983e 100644 --- a/docs/deprecated/tutorial_scg_backend.md +++ b/docs/deprecated/tutorial_scg_backend.md @@ -58,7 +58,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 5. Pull a singularity container for the pipeline. 
This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash $ sdev # SCG cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 $ exit ``` @@ -77,7 +77,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/scratch/users,/srv/gsfs0,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/deprecated/tutorial_sge.md b/docs/deprecated/tutorial_sge.md index e4e6236e..08aa8fdd 100644 --- a/docs/deprecated/tutorial_sge.md +++ b/docs/deprecated/tutorial_sge.md @@ -61,7 +61,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 ``` 8. 
Run a pipeline for the test sample. If your parallel environment (PE) found from step 5) has a different name from `shm` then edit the following shell script to change the PE name. @@ -83,7 +83,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/deprecated/tutorial_sge_backend.md b/docs/deprecated/tutorial_sge_backend.md index 04e16644..5af48af1 100644 --- a/docs/deprecated/tutorial_sge_backend.md +++ b/docs/deprecated/tutorial_sge_backend.md @@ -68,7 +68,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 ``` 8. Run a pipeline for the test sample. 
diff --git a/docs/deprecated/tutorial_sherlock.md b/docs/deprecated/tutorial_sherlock.md index e53233eb..2c0bbf70 100644 --- a/docs/deprecated/tutorial_sherlock.md +++ b/docs/deprecated/tutorial_sherlock.md @@ -68,7 +68,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/deprecated/tutorial_sherlock_backend.md b/docs/deprecated/tutorial_sherlock_backend.md index 4e24d4da..ea11b917 100644 --- a/docs/deprecated/tutorial_sherlock_backend.md +++ b/docs/deprecated/tutorial_sherlock_backend.md @@ -63,7 +63,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 6. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. Stanford Sherlock does not allow building a container on login nodes. Wait until you get a command prompt after `sdev`. 
```bash $ sdev # sherlock cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 $ exit # exit from an interactive node ``` @@ -82,7 +82,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/scratch,/oak/stanford,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/deprecated/tutorial_slurm.md b/docs/deprecated/tutorial_slurm.md index 125b8e76..b0d67dc7 100644 --- a/docs/deprecated/tutorial_slurm.md +++ b/docs/deprecated/tutorial_slurm.md @@ -56,7 +56,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 ``` 8. Run a pipeline for the test sample. 
If your cluster requires to specify any of them then add one to the command line. @@ -78,7 +78,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/deprecated/tutorial_slurm_backend.md b/docs/deprecated/tutorial_slurm_backend.md index 8c54f8d5..7ee4b5ef 100644 --- a/docs/deprecated/tutorial_slurm_backend.md +++ b/docs/deprecated/tutorial_slurm_backend.md @@ -68,7 +68,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.1.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.2.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 ``` 8. Run a pipeline for the test sample. @@ -86,7 +86,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." 
} } diff --git a/docs/dev.md b/docs/dev.md index 24f48b57..bec9495e 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -2,8 +2,8 @@ ## Command line for version change ```bash -PREV_VER=v1.2.1 -NEW_VER=v1.2.1 +PREV_VER=v1.2.1 +NEW_VER=v1.2.2 for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh}) do sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} @@ -24,7 +24,7 @@ Run the following command line locally to build out DX workflows for this pipeli ```bash # version -VER=v1.2.1 +VER=v1.2.2 # general java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/general -defaults examples/dx/template_general.json diff --git a/docs/tutorial_dx_web.md b/docs/tutorial_dx_web.md index 2a459e91..aa05b921 100644 --- a/docs/tutorial_dx_web.md +++ b/docs/tutorial_dx_web.md @@ -15,8 +15,8 @@ This document describes instruction for the item 2). 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. -* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/test_ENCSR936XTK_subsampled_chr19_only) -* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/test_ENCSR936XTK_subsampled_chr19_only) +* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/test_ENCSR936XTK_subsampled_chr19_only) +* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/test_ENCSR936XTK_subsampled_chr19_only) 4. Copy it to your project by right-clicking on the DX workflow `chip` and choose "Copy". @@ -40,16 +40,16 @@ This document describes instruction for the item 2). 1. DNAnexus allows only one copy of a workflow per project. 
The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/) with all parameters defined already. 2. Copy one of the following workflows according to the platform you have chosen for your project (AWS or Azure). -* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/general) without pre-defined reference genome. -* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/hg38) with pre-defined hg38 reference genome. -* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/hg19) with pre-defined hg19 reference genome. -* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/mm10) with pre-defined mm10 reference genome. -* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.1/mm9) with pre-defined mm9 reference genome. -* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/general) without pre-defined reference genome. -* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/hg38) with pre-defined hg38 reference genome. -* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/hg19) with pre-defined hg19 reference genome. -* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/mm10) with pre-defined mm10 reference genome. -* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.1/mm9) with pre-defined mm9 reference genome. 
+* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/general) without pre-defined reference genome. +* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/hg38) with pre-defined hg38 reference genome. +* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/hg19) with pre-defined hg19 reference genome. +* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/mm10) with pre-defined mm10 reference genome. +* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.2.2/mm9) with pre-defined mm9 reference genome. +* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/general) without pre-defined reference genome. +* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/hg38) with pre-defined hg38 reference genome. +* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/hg19) with pre-defined hg19 reference genome. +* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/mm10) with pre-defined mm10 reference genome. +* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.2.2/mm9) with pre-defined mm9 reference genome. 3. Click on the DX workflow `chip`. 
diff --git a/test/test_task/test.sh b/test/test_task/test.sh index 80599e93..fff14312 100755 --- a/test/test_task/test.sh +++ b/test/test_task/test.sh @@ -12,7 +12,7 @@ INPUT=$2 if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 fi if [ $# -gt 3 ]; then NUM_TASK=$4 diff --git a/test/test_workflow/test_chip.sh b/test/test_workflow/test_chip.sh index f3577db6..f7748591 100755 --- a/test/test_workflow/test_chip.sh +++ b/test/test_workflow/test_chip.sh @@ -8,7 +8,7 @@ fi if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.2.1 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.2.2 fi INPUT=$1 GCLOUD_SERVICE_ACCOUNT_SECRET_JSON_FILE=$2 diff --git a/workflow_opts/docker.json b/workflow_opts/docker.json index 883441ee..c70ca859 100644 --- a/workflow_opts/docker.json +++ b/workflow_opts/docker.json @@ -1,6 +1,6 @@ { "default_runtime_attributes" : { - "docker" : "quay.io/encode-dcc/chip-seq-pipeline:v1.2.1", + "docker" : "quay.io/encode-dcc/chip-seq-pipeline:v1.2.2", "zones": "us-west1-a us-west1-b us-west1-c us-central1-c us-central1-b", "failOnStderr" : false, "continueOnReturnCode" : 0, diff --git a/workflow_opts/scg.json b/workflow_opts/scg.json index 6babf7f6..91f7b56e 100644 --- a/workflow_opts/scg.json +++ b/workflow_opts/scg.json @@ -1,7 +1,7 @@ { "default_runtime_attributes" : { "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "/reference/ENCODE/pipeline_singularity_images/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "/reference/ENCODE/pipeline_singularity_images/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0" } } diff --git a/workflow_opts/sge.json b/workflow_opts/sge.json index 72f2b1ad..d9bc1b67 100644 --- a/workflow_opts/sge.json +++ b/workflow_opts/sge.json @@ -1,6 +1,6 @@ { "default_runtime_attributes" : { "sge_pe" : 
"shm", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg" } } diff --git a/workflow_opts/sherlock.json b/workflow_opts/sherlock.json index b33b61a1..976795f5 100644 --- a/workflow_opts/sherlock.json +++ b/workflow_opts/sherlock.json @@ -1,7 +1,7 @@ { "default_runtime_attributes" : { "slurm_partition" : "normal", - "singularity_container" : "/home/groups/cherry/encode/pipeline_singularity_images/chip-seq-pipeline-v1.2.1.simg", + "singularity_container" : "/home/groups/cherry/encode/pipeline_singularity_images/chip-seq-pipeline-v1.2.2.simg", "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode" } } diff --git a/workflow_opts/singularity.json b/workflow_opts/singularity.json index 2b5825e7..86f5b6ba 100644 --- a/workflow_opts/singularity.json +++ b/workflow_opts/singularity.json @@ -1,5 +1,5 @@ { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg" } } diff --git a/workflow_opts/slurm.json b/workflow_opts/slurm.json index 8cdfba49..0fb53a6e 100644 --- a/workflow_opts/slurm.json +++ b/workflow_opts/slurm.json @@ -2,6 +2,6 @@ "default_runtime_attributes" : { "slurm_partition" : "YOUR_SLURM_PARTITION", "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.1.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.2.2.simg" } }