From 723bbd4348bd1f3804c67de988f170453d68d76e Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Wed, 9 Dec 2020 17:15:12 -0600 Subject: [PATCH 1/5] updates intervals, adds missing params --- definitions/pipelines/tumor_only_wgs.cwl | 83 ++++++++++++++++++------ 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/definitions/pipelines/tumor_only_wgs.cwl b/definitions/pipelines/tumor_only_wgs.cwl index 96ecb1f86..db71719bc 100644 --- a/definitions/pipelines/tumor_only_wgs.cwl +++ b/definitions/pipelines/tumor_only_wgs.cwl @@ -30,8 +30,30 @@ inputs: omni_vcf: type: File secondaryFiles: [.tbi] - qc_intervals: + bqsr_intervals: + type: string[] + label: "bqsr_intervals: Array of strings specifying regions for base quality score recalibration" + doc: | + bqsr_intervals provides an array of genomic intervals for which to apply + GATK base quality score recalibrations. Typically intervals are given + for the entire chromosome (i.e. chr1, chr2, etc.), these names should match + the format in the reference file. + target_intervals: type: File + label: "target_intervals: interval_list file of targets used in the sequencing experiment" + doc: | + target_intervals is an interval_list corresponding to the targets for the reagent. In the case of WGS + this generally should contain intervals that span the entire genome. + per_base_intervals: + type: ../types/labelled_file.yml#labelled_file[] + label: "per_base_intervals: additional intervals over which to summarize coverage/QC at a per-base resolution" + doc: "per_base_intervals is a list of regions (in interval_list format) over which to summarize coverage/QC at a per-base resolution." + per_target_intervals: + type: ../types/labelled_file.yml#labelled_file[] + label: "per_target_intervals: additional intervals over which to summarize coverage/QC at a per-target resolution" + doc: "per_target_intervals list of regions (in interval_list format) over which to summarize coverage/QC at a per-target resolution." + summary_intervals: + type: ../types/labelled_file.yml#labelled_file[] picard_metric_accumulation_level: type: string roi_intervals: @@ -68,12 +90,33 @@ inputs: type: int? readcount_minimum_base_quality: type: int? - per_base_intervals: - type: ../types/labelled_file.yml#labelled_file[] - per_target_intervals: - type: ../types/labelled_file.yml#labelled_file[] - summary_intervals: - type: ../types/labelled_file.yml#labelled_file[] + variants_to_table_fields: + type: string[] + default: [CHROM,POS,ID,REF,ALT,set,AC,AF] + variants_to_table_genotype_fields: + type: string[] + default: [GT,AD] + vep_to_table_fields: + type: string[] + default: [HGVSc,HGVSp] + varscan_min_coverage: + type: int? + default: 8 + varscan_min_var_freq: + type: float? + default: 0.05 + varscan_min_reads: + type: int? + default: 2 + maximum_population_allele_frequency: + type: float? + default: 0.001 + qc_minimum_mapping_quality: + type: int? + default: 0 + qc_minimum_base_quality: + type: int? + default: 0 outputs: cram: type: File @@ -160,15 +203,17 @@ steps: reference: reference sequence: sequence trimming: trimming - bqsr_known_sites: bqsr_known_sites omni_vcf: omni_vcf - intervals: qc_intervals + intervals: target_intervals picard_metric_accumulation_level: picard_metric_accumulation_level - minimum_mapping_quality: readcount_minimum_mapping_quality - minimum_base_quality: readcount_minimum_base_quality + bqsr_known_sites: bqsr_known_sites + bqsr_intervals: bqsr_intervals + minimum_mapping_quality: qc_minimum_mapping_quality + minimum_base_quality: qc_minimum_base_quality per_base_intervals: per_base_intervals per_target_intervals: per_target_intervals summary_intervals: summary_intervals + sample_name: sample_name out: [bam, mark_duplicates_metrics, insert_size_metrics, insert_size_histogram, alignment_summary_metrics, gc_bias_metrics, gc_bias_metrics_chart, gc_bias_metrics_summary, wgs_metrics, flagstats, verify_bam_id_metrics, verify_bam_id_depth, per_base_coverage_metrics, per_base_hs_metrics, per_target_coverage_metrics, per_target_hs_metrics, summary_hs_metrics] detect_variants: @@ -177,21 +222,19 @@ steps: reference: reference bam: alignment_and_qc/bam roi_intervals: roi_intervals - #varscan_strand_filter: - #varscan_min_coverage: - #varscan_min_var_freq: - #varscan_p_value: - #varscan_min_reads: - #maximum_population_allele_frequency: + varscan_min_coverage: varscan_min_coverage + varscan_min_var_freq: varscan_min_var_freq + varscan_min_reads: varscan_min_reads + maximum_population_allele_frequency: maximum_population_allele_frequency vep_cache_dir: vep_cache_dir synonyms_file: synonyms_file vep_pick: vep_pick vep_ensembl_assembly: vep_ensembl_assembly vep_ensembl_version: vep_ensembl_version vep_ensembl_species: vep_ensembl_species - #variants_to_table_fields: - #variants_to_table_genotype_fields: - #vep_to_table_fields: + variants_to_table_fields: variants_to_table_fields + variants_to_table_genotype_fields: variants_to_table_genotype_fields + vep_to_table_fields: vep_to_table_fields sample_name: sample_name docm_vcf: docm_vcf vep_custom_annotations: vep_custom_annotations From 60fbcc0e5dd62d6c989d3344f9710fcbde7da89b Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Wed, 27 Jan 2021 16:43:00 -0600 Subject: [PATCH 2/5] adding sequence support to bisulfite pipeline --- definitions/pipelines/bisulfite.cwl | 55 ++++----- ...o_trimmed_fastq_and_biscuit_alignments.cwl | 68 ----------- .../sequence_to_bisulfite_alignment.cwl | 64 ++++++++++ ...equence_to_bisulfite_alignment_adapter.cwl | 48 ++++++++ definitions/tools/biscuit_align.cwl | 114 +++++++++++++++--- 5 files changed, 230 insertions(+), 119 deletions(-) delete mode 100644 definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl create mode 100644 definitions/subworkflows/sequence_to_bisulfite_alignment.cwl create mode 100644 definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl diff --git a/definitions/pipelines/bisulfite.cwl b/definitions/pipelines/bisulfite.cwl index a6e3471ed..78392810e 100644 --- a/definitions/pipelines/bisulfite.cwl +++ b/definitions/pipelines/bisulfite.cwl @@ -7,27 +7,26 @@ requirements: - class: MultipleInputFeatureRequirement - class: SubworkflowFeatureRequirement - class: ScatterFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml inputs: reference_index: type: string reference_sizes: type: File - instrument_data_bams: - type: File[] - read_group_id: - type: string[] + sequence: + type: ../types/sequence_data.yml#sequence_data[] + doc: | + sequence represents the sequencing data as either FASTQs or BAMs with accompanying + readgroup information. Note that in the @RG field ID and SM are required. sample_name: type: string - trimming_adapters: - type: File - trimming_adapter_trim_end: - type: string - trimming_adapter_min_overlap: - type: int - trimming_max_uncalled: - type: int - trimming_min_readlength: - type: int + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" QCannotation: type: File assay_non_cpg_sites: @@ -54,31 +53,19 @@ outputs: type: Directory outputSource: bisulfite_qc/QC_directory steps: - bam_to_trimmed_fastq_and_biscuit_alignments: - run: ../subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl - scatter: [bam, read_group_id] - scatterMethod: dotproduct + bisulfite_alignment: + run: ../subworkflows/sequence_to_bisulfite_alignment.cwl in: - bam: instrument_data_bams - read_group_id: read_group_id - adapters: trimming_adapters - adapter_trim_end: trimming_adapter_trim_end - adapter_min_overlap: trimming_adapter_min_overlap - max_uncalled: trimming_max_uncalled - min_readlength: trimming_min_readlength + sequence: sequence + trimming_options: trimming_options reference_index: reference_index + sample_name: sample_name out: [aligned_bam] - merge: - run: ../tools/merge_bams.cwl - in: - bams: bam_to_trimmed_fastq_and_biscuit_alignments/aligned_bam - out: - [merged_bam] pileup: run: ../tools/biscuit_pileup.cwl in: - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam reference: reference_index out: [vcf] @@ -86,7 +73,7 @@ steps: run: ../subworkflows/bisulfite_qc.cwl in: vcf: pileup/vcf - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam reference: reference_index QCannotation: QCannotation out: @@ -110,7 +97,7 @@ steps: run: ../tools/bam_to_cram.cwl in: reference: reference_index - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam out: [cram] index_cram: diff --git a/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl b/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl deleted file mode 100644 index fabf1e96f..000000000 --- a/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env cwl-runner - -cwlVersion: v1.0 -class: Workflow -label: "bam to trimmed fastqs and biscuit alignments" -requirements: - - class: SubworkflowFeatureRequirement -inputs: - bam: - type: File - adapters: - type: File - adapter_trim_end: - type: string - adapter_min_overlap: - type: int - max_uncalled: - type: int - min_readlength: - type: int - read_group_id: - type: string - reference_index: - type: string -outputs: - aligned_bam: - type: File - outputSource: biscuit_markdup/markdup_bam -steps: - bam_to_fastq: - run: ../tools/bam_to_fastq.cwl - in: - bam: bam - out: - [fastq1, fastq2] - trim_fastq: - run: ../tools/trim_fastq.cwl - in: - reads1: bam_to_fastq/fastq1 - reads2: bam_to_fastq/fastq2 - adapters: adapters - adapter_trim_end: adapter_trim_end - adapter_min_overlap: adapter_min_overlap - max_uncalled: max_uncalled - min_readlength: min_readlength - out: - [fastq1, fastq2] - biscuit_align: - run: ../tools/biscuit_align.cwl - in: - reference_index: reference_index - fastq1: trim_fastq/fastq1 - fastq2: trim_fastq/fastq2 - read_group_id: read_group_id - out: - [aligned_bam] - index_bam: - run: ../tools/index_bam.cwl - in: - bam: biscuit_align/aligned_bam - out: - [indexed_bam] - biscuit_markdup: - run: ../tools/biscuit_markdup.cwl - in: - bam: index_bam/indexed_bam - out: - [markdup_bam] diff --git a/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl b/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl new file mode 100644 index 000000000..de3fd3e9e --- /dev/null +++ b/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl @@ -0,0 +1,64 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "take bisulfite sequence data through trimming, alignment, and markdup" +requirements: + - class: SubworkflowFeatureRequirement + - class: ScatterFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml +inputs: + sequence: + type: ../types/sequence_data.yml#sequence_data[] + doc: "the unaligned sequence data with readgroup information" + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + reference_index: + type: string + sample_name: + type: string +outputs: + aligned_bam: + type: File + outputSource: index_bam/indexed_bam +steps: + trim_and_align: + scatter: [sequence] + scatterMethod: dotproduct + run: sequence_to_bisulfite_alignment_adapter.cwl + in: + sequence: sequence + trimming_options: trimming_options + reference_index: reference_index + out: + [aligned_bam] + merge: + run: ../tools/merge_bams_samtools.cwl + in: + bams: trim_and_align/aligned_bam + name: sample_name + out: + [merged_bam] + name_sort: + run: ../tools/name_sort.cwl + in: + bam: merge/merged_bam + out: + [name_sorted_bam] + biscuit_markdup: + run: ../tools/biscuit_markdup.cwl + in: + bam: name_sort/name_sorted_bam + out: + [markdup_bam] + index_bam: + run: ../tools/index_bam.cwl + in: + bam: biscuit_markdup/markdup_bam + out: + [indexed_bam] diff --git a/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl b/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl new file mode 100644 index 000000000..687c15b1c --- /dev/null +++ b/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl @@ -0,0 +1,48 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "adapter for sequence_to_biscuit_alignments" +doc: "Some workflow engines won't stage files in our nested structure, so parse it out here" +requirements: + - class: InlineJavascriptRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml + - class: StepInputExpressionRequirement + - class: SubworkflowFeatureRequirement +inputs: + sequence: + type: ../types/sequence_data.yml#sequence_data + doc: "the unaligned sequence data with readgroup information" + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + reference_index: + type: string +outputs: + aligned_bam: + type: File + outputSource: biscuit_align/aligned_bam +steps: + biscuit_align: + run: ../tools/biscuit_align.cwl + in: + bam: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('bam')? self.sequence.bam : null)" + fastq1: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('fastq1')? self.sequence.fastq1 : null)" + fastq2: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('fastq2')? self.sequence.fastq2 : null)" + read_group: + source: sequence + valueFrom: $(self.readgroup) + trimming_options: trimming_options + reference_index: reference_index + out: + [aligned_bam] diff --git a/definitions/tools/biscuit_align.cwl b/definitions/tools/biscuit_align.cwl index 59f52f5fb..dd24af689 100644 --- a/definitions/tools/biscuit_align.cwl +++ b/definitions/tools/biscuit_align.cwl @@ -2,50 +2,130 @@ cwlVersion: v1.0 class: CommandLineTool -label: "Biscuit: align" -baseCommand: ["/bin/bash","biscuit_align.sh"] +label: "Trim bisulfite reads and align usin biscuit" +baseCommand: ["/bin/bash","biscuit_trim_and_align.sh"] requirements: - class: ResourceRequirement ramMin: 32000 coresMin: 12 - class: DockerRequirement - dockerPull: "mgibio/biscuit:0.3.8" + dockerPull: "chrisamiller/biscuit:latest" + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml - class: InitialWorkDirRequirement listing: - - entryname: 'biscuit_align.sh' + - entryname: 'biscuit_trim_and_align.sh' entry: | - set -eou pipefail + set -o pipefail + set -o errexit + set -o nounset - cores=$1 - outdir="$2" - read_group_id="$3" - reference_index="$4" - fastq1="$5" - fastq2="$6" + RUN_TRIMMING="false" - /usr/bin/biscuit align -t $cores -M -R "$read_group_id" "$reference_index" "$fastq1" "$fastq2" | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t $cores -m 8G -o "$outdir/aligned.bam" /dev/stdin + while getopts "b:?1:?2:?g:r:n:d:t:?o:?" opt; do + case "$opt" in + b) + MODE=bam + BAM="$OPTARG" + ;; + 1) + MODE=fastq + FASTQ1="$OPTARG" + ;; + 2) + MODE=fastq + FASTQ2="$OPTARG" + ;; + g) + READGROUP="$OPTARG" + ;; + r) + REFERENCE="$OPTARG" + ;; + n) + NTHREADS="$OPTARG" + ;; + d) + OUTDIR="$OPTARG" + ;; + t) + RUN_TRIMMING="true" + TRIMMING_ADAPTERS="$OPTARG" + ;; + o) + RUN_TRIMMING="true" + TRIMMING_ADAPTER_MIN_OVERLAP="$OPTARG" + ;; + esac + done + + #reserve at least two cores for sorting + #if there are too few for this, let the scheduler sort it out + SORT_THREADS=2 + if [[ $NTHREADS -gt 3 ]];then + NTHREADS=`expr $NTHREADS - $SORT_THREADS` + fi + #also reserve one core for trimming if it is specified + if [[ "$RUN_TRIMMING" == "true" ]];then + NTHREADS=`expr $NTHREADS - 1` + fi + + if [[ "$MODE" == 'fastq' ]]; then + if [[ "$RUN_TRIMMING" == 'false' ]]; then + /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" "$FASTQ1" "$FASTQ2" | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + else + /opt/flexbar/flexbar --adapters "$TRIMMING_ADAPTERS" --reads "$FASTQ1" --reads2 "$FASTQ2" --adapter-trim-end LTAIL --adapter-min-overlap "$TRIMMING_ADAPTER_MIN_OVERLAP" --adapter-error-rate 0.1 --max-uncalled 300 --stdout-reads \ + | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + fi + fi + if [[ "$MODE" == 'bam' ]]; then + if [[ "$RUN_TRIMMING" == 'false' ]]; then + /usr/bin/java -Xmx4g -jar /opt/picard/picard.jar SamToFastq I="$BAM" INTERLEAVE=true INCLUDE_NON_PF_READS=true FASTQ=/dev/stdout | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + else + /usr/bin/java -Xmx4g -jar /opt/picard/picard.jar SamToFastq I="$BAM" INTERLEAVE=true INCLUDE_NON_PF_READS=true FASTQ=/dev/stdout \ + | /opt/flexbar/flexbar --adapters "$TRIMMING_ADAPTERS" --adapter-trim-end LTAIL --adapter-min-overlap "$TRIMMING_ADAPTER_MIN_OVERLAP" --adapter-error-rate 0.1 --max-uncalled 300 --stdout-reads -r - \ + | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + fi + fi arguments: [ - { valueFrom: $(runtime.cores), position: -9 }, - { valueFrom: $(runtime.outdir), position: -8 }, + { valueFrom: $(runtime.cores), position: -9, prefix: '-n' }, + { valueFrom: $(runtime.outdir), position: -8, prefix: '-d' }, ] inputs: reference_index: type: string inputBinding: position: -3 + prefix: '-r' + bam: + type: File? + inputBinding: + position: -5 + prefix: '-b' fastq1: - type: File + type: File? inputBinding: position: -2 + prefix: '-1' fastq2: - type: File + type: File? inputBinding: position: -1 - read_group_id: + prefix: '-2' + read_group: type: string inputBinding: position: -4 + prefix: '-g' + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + inputBinding: + valueFrom: $( ['-t', self.adapters.path, '-o', self.min_overlap] ) outputs: aligned_bam: type: File From 59ac49408fb1081d981d516718ca3e453b885def Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Wed, 27 Jan 2021 16:46:05 -0600 Subject: [PATCH 3/5] switching to mgibio docker --- definitions/tools/biscuit_align.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/tools/biscuit_align.cwl b/definitions/tools/biscuit_align.cwl index dd24af689..302134ea6 100644 --- a/definitions/tools/biscuit_align.cwl +++ b/definitions/tools/biscuit_align.cwl @@ -9,7 +9,7 @@ requirements: ramMin: 32000 coresMin: 12 - class: DockerRequirement - dockerPull: "chrisamiller/biscuit:latest" + dockerPull: "mgibio/biscuit:0.3.16" - class: SchemaDefRequirement types: - $import: ../types/sequence_data.yml From d1bd0e9ed3dfd514fd4e89242721c6b516727eda Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Wed, 27 Jan 2021 16:43:00 -0600 Subject: [PATCH 4/5] adding sequence support to bisulfite pipeline --- definitions/pipelines/bisulfite.cwl | 55 ++++----- ...o_trimmed_fastq_and_biscuit_alignments.cwl | 68 ----------- .../sequence_to_bisulfite_alignment.cwl | 64 ++++++++++ ...equence_to_bisulfite_alignment_adapter.cwl | 48 ++++++++ definitions/tools/biscuit_align.cwl | 114 +++++++++++++++--- 5 files changed, 230 insertions(+), 119 deletions(-) delete mode 100644 definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl create mode 100644 definitions/subworkflows/sequence_to_bisulfite_alignment.cwl create mode 100644 definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl diff --git a/definitions/pipelines/bisulfite.cwl b/definitions/pipelines/bisulfite.cwl index a6e3471ed..78392810e 100644 --- a/definitions/pipelines/bisulfite.cwl +++ b/definitions/pipelines/bisulfite.cwl @@ -7,27 +7,26 @@ requirements: - class: MultipleInputFeatureRequirement - class: SubworkflowFeatureRequirement - class: ScatterFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml inputs: reference_index: type: string reference_sizes: type: File - instrument_data_bams: - type: File[] - read_group_id: - type: string[] + sequence: + type: ../types/sequence_data.yml#sequence_data[] + doc: | + sequence represents the sequencing data as either FASTQs or BAMs with accompanying + readgroup information. Note that in the @RG field ID and SM are required. sample_name: type: string - trimming_adapters: - type: File - trimming_adapter_trim_end: - type: string - trimming_adapter_min_overlap: - type: int - trimming_max_uncalled: - type: int - trimming_min_readlength: - type: int + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" QCannotation: type: File assay_non_cpg_sites: @@ -54,31 +53,19 @@ outputs: type: Directory outputSource: bisulfite_qc/QC_directory steps: - bam_to_trimmed_fastq_and_biscuit_alignments: - run: ../subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl - scatter: [bam, read_group_id] - scatterMethod: dotproduct + bisulfite_alignment: + run: ../subworkflows/sequence_to_bisulfite_alignment.cwl in: - bam: instrument_data_bams - read_group_id: read_group_id - adapters: trimming_adapters - adapter_trim_end: trimming_adapter_trim_end - adapter_min_overlap: trimming_adapter_min_overlap - max_uncalled: trimming_max_uncalled - min_readlength: trimming_min_readlength + sequence: sequence + trimming_options: trimming_options reference_index: reference_index + sample_name: sample_name out: [aligned_bam] - merge: - run: ../tools/merge_bams.cwl - in: - bams: bam_to_trimmed_fastq_and_biscuit_alignments/aligned_bam - out: - [merged_bam] pileup: run: ../tools/biscuit_pileup.cwl in: - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam reference: reference_index out: [vcf] @@ -86,7 +73,7 @@ steps: run: ../subworkflows/bisulfite_qc.cwl in: vcf: pileup/vcf - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam reference: reference_index QCannotation: QCannotation out: @@ -110,7 +97,7 @@ steps: run: ../tools/bam_to_cram.cwl in: reference: reference_index - bam: merge/merged_bam + bam: bisulfite_alignment/aligned_bam out: [cram] index_cram: diff --git a/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl b/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl deleted file mode 100644 index fabf1e96f..000000000 --- a/definitions/subworkflows/bam_to_trimmed_fastq_and_biscuit_alignments.cwl +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env cwl-runner - -cwlVersion: v1.0 -class: Workflow -label: "bam to trimmed fastqs and biscuit alignments" -requirements: - - class: SubworkflowFeatureRequirement -inputs: - bam: - type: File - adapters: - type: File - adapter_trim_end: - type: string - adapter_min_overlap: - type: int - max_uncalled: - type: int - min_readlength: - type: int - read_group_id: - type: string - reference_index: - type: string -outputs: - aligned_bam: - type: File - outputSource: biscuit_markdup/markdup_bam -steps: - bam_to_fastq: - run: ../tools/bam_to_fastq.cwl - in: - bam: bam - out: - [fastq1, fastq2] - trim_fastq: - run: ../tools/trim_fastq.cwl - in: - reads1: bam_to_fastq/fastq1 - reads2: bam_to_fastq/fastq2 - adapters: adapters - adapter_trim_end: adapter_trim_end - adapter_min_overlap: adapter_min_overlap - max_uncalled: max_uncalled - min_readlength: min_readlength - out: - [fastq1, fastq2] - biscuit_align: - run: ../tools/biscuit_align.cwl - in: - reference_index: reference_index - fastq1: trim_fastq/fastq1 - fastq2: trim_fastq/fastq2 - read_group_id: read_group_id - out: - [aligned_bam] - index_bam: - run: ../tools/index_bam.cwl - in: - bam: biscuit_align/aligned_bam - out: - [indexed_bam] - biscuit_markdup: - run: ../tools/biscuit_markdup.cwl - in: - bam: index_bam/indexed_bam - out: - [markdup_bam] diff --git a/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl b/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl new file mode 100644 index 000000000..de3fd3e9e --- /dev/null +++ b/definitions/subworkflows/sequence_to_bisulfite_alignment.cwl @@ -0,0 +1,64 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "take bisulfite sequence data through trimming, alignment, and markdup" +requirements: + - class: SubworkflowFeatureRequirement + - class: ScatterFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml +inputs: + sequence: + type: ../types/sequence_data.yml#sequence_data[] + doc: "the unaligned sequence data with readgroup information" + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + reference_index: + type: string + sample_name: + type: string +outputs: + aligned_bam: + type: File + outputSource: index_bam/indexed_bam +steps: + trim_and_align: + scatter: [sequence] + scatterMethod: dotproduct + run: sequence_to_bisulfite_alignment_adapter.cwl + in: + sequence: sequence + trimming_options: trimming_options + reference_index: reference_index + out: + [aligned_bam] + merge: + run: ../tools/merge_bams_samtools.cwl + in: + bams: trim_and_align/aligned_bam + name: sample_name + out: + [merged_bam] + name_sort: + run: ../tools/name_sort.cwl + in: + bam: merge/merged_bam + out: + [name_sorted_bam] + biscuit_markdup: + run: ../tools/biscuit_markdup.cwl + in: + bam: name_sort/name_sorted_bam + out: + [markdup_bam] + index_bam: + run: ../tools/index_bam.cwl + in: + bam: biscuit_markdup/markdup_bam + out: + [indexed_bam] diff --git a/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl b/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl new file mode 100644 index 000000000..687c15b1c --- /dev/null +++ b/definitions/subworkflows/sequence_to_bisulfite_alignment_adapter.cwl @@ -0,0 +1,48 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "adapter for sequence_to_biscuit_alignments" +doc: "Some workflow engines won't stage files in our nested structure, so parse it out here" +requirements: + - class: InlineJavascriptRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml + - class: StepInputExpressionRequirement + - class: SubworkflowFeatureRequirement +inputs: + sequence: + type: ../types/sequence_data.yml#sequence_data + doc: "the unaligned sequence data with readgroup information" + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + reference_index: + type: string +outputs: + aligned_bam: + type: File + outputSource: biscuit_align/aligned_bam +steps: + biscuit_align: + run: ../tools/biscuit_align.cwl + in: + bam: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('bam')? self.sequence.bam : null)" + fastq1: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('fastq1')? self.sequence.fastq1 : null)" + fastq2: + source: sequence + valueFrom: "$(self.sequence.hasOwnProperty('fastq2')? self.sequence.fastq2 : null)" + read_group: + source: sequence + valueFrom: $(self.readgroup) + trimming_options: trimming_options + reference_index: reference_index + out: + [aligned_bam] diff --git a/definitions/tools/biscuit_align.cwl b/definitions/tools/biscuit_align.cwl index 59f52f5fb..dd24af689 100644 --- a/definitions/tools/biscuit_align.cwl +++ b/definitions/tools/biscuit_align.cwl @@ -2,50 +2,130 @@ cwlVersion: v1.0 class: CommandLineTool -label: "Biscuit: align" -baseCommand: ["/bin/bash","biscuit_align.sh"] +label: "Trim bisulfite reads and align usin biscuit" +baseCommand: ["/bin/bash","biscuit_trim_and_align.sh"] requirements: - class: ResourceRequirement ramMin: 32000 coresMin: 12 - class: DockerRequirement - dockerPull: "mgibio/biscuit:0.3.8" + dockerPull: "chrisamiller/biscuit:latest" + - class: SchemaDefRequirement + types: + - $import: ../types/sequence_data.yml + - $import: ../types/trimming_options.yml - class: InitialWorkDirRequirement listing: - - entryname: 'biscuit_align.sh' + - entryname: 'biscuit_trim_and_align.sh' entry: | - set -eou pipefail + set -o pipefail + set -o errexit + set -o nounset - cores=$1 - outdir="$2" - read_group_id="$3" - reference_index="$4" - fastq1="$5" - fastq2="$6" + RUN_TRIMMING="false" - /usr/bin/biscuit align -t $cores -M -R "$read_group_id" "$reference_index" "$fastq1" "$fastq2" | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t $cores -m 8G -o "$outdir/aligned.bam" /dev/stdin + while getopts "b:?1:?2:?g:r:n:d:t:?o:?" opt; do + case "$opt" in + b) + MODE=bam + BAM="$OPTARG" + ;; + 1) + MODE=fastq + FASTQ1="$OPTARG" + ;; + 2) + MODE=fastq + FASTQ2="$OPTARG" + ;; + g) + READGROUP="$OPTARG" + ;; + r) + REFERENCE="$OPTARG" + ;; + n) + NTHREADS="$OPTARG" + ;; + d) + OUTDIR="$OPTARG" + ;; + t) + RUN_TRIMMING="true" + TRIMMING_ADAPTERS="$OPTARG" + ;; + o) + RUN_TRIMMING="true" + TRIMMING_ADAPTER_MIN_OVERLAP="$OPTARG" + ;; + esac + done + + #reserve at least two cores for sorting + #if there are too few for this, let the scheduler sort it out + SORT_THREADS=2 + if [[ $NTHREADS -gt 3 ]];then + NTHREADS=`expr $NTHREADS - $SORT_THREADS` + fi + #also reserve one core for trimming if it is specified + if [[ "$RUN_TRIMMING" == "true" ]];then + NTHREADS=`expr $NTHREADS - 1` + fi + + if [[ "$MODE" == 'fastq' ]]; then + if [[ "$RUN_TRIMMING" == 'false' ]]; then + /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" "$FASTQ1" "$FASTQ2" | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + else + /opt/flexbar/flexbar --adapters "$TRIMMING_ADAPTERS" --reads "$FASTQ1" --reads2 "$FASTQ2" --adapter-trim-end LTAIL --adapter-min-overlap "$TRIMMING_ADAPTER_MIN_OVERLAP" --adapter-error-rate 0.1 --max-uncalled 300 --stdout-reads \ + | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + fi + fi + if [[ "$MODE" == 'bam' ]]; then + if [[ "$RUN_TRIMMING" == 'false' ]]; then + /usr/bin/java -Xmx4g -jar /opt/picard/picard.jar SamToFastq I="$BAM" INTERLEAVE=true INCLUDE_NON_PF_READS=true FASTQ=/dev/stdout | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + else + /usr/bin/java -Xmx4g -jar /opt/picard/picard.jar SamToFastq I="$BAM" INTERLEAVE=true INCLUDE_NON_PF_READS=true FASTQ=/dev/stdout \ + | /opt/flexbar/flexbar --adapters "$TRIMMING_ADAPTERS" --adapter-trim-end LTAIL --adapter-min-overlap "$TRIMMING_ADAPTER_MIN_OVERLAP" --adapter-error-rate 0.1 --max-uncalled 300 --stdout-reads -r - \ + | /usr/bin/biscuit align -t "$NTHREADS" -M -R "$READGROUP" "$REFERENCE" /dev/stdin | /usr/bin/sambamba view -S -f bam -l 0 /dev/stdin | /usr/bin/sambamba sort -t "$SORT_THREADS" -m 8G -o "$OUTDIR/aligned.bam" /dev/stdin + fi + fi arguments: [ - { valueFrom: $(runtime.cores), position: -9 }, - { valueFrom: $(runtime.outdir), position: -8 }, + { valueFrom: $(runtime.cores), position: -9, prefix: '-n' }, + { valueFrom: $(runtime.outdir), position: -8, prefix: '-d' }, ] inputs: reference_index: type: string inputBinding: position: -3 + prefix: '-r' + bam: + type: File? + inputBinding: + position: -5 + prefix: '-b' fastq1: - type: File + type: File? inputBinding: position: -2 + prefix: '-1' fastq2: - type: File + type: File? inputBinding: position: -1 - read_group_id: + prefix: '-2' + read_group: type: string inputBinding: position: -4 + prefix: '-g' + trimming_options: + type: + - ../types/trimming_options.yml#trimming_options + - "null" + inputBinding: + valueFrom: $( ['-t', self.adapters.path, '-o', self.min_overlap] ) outputs: aligned_bam: type: File From dca5c618c2434e074af21a22614c1da442ab95d9 Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Wed, 27 Jan 2021 16:46:05 -0600 Subject: [PATCH 5/5] switching to mgibio docker --- definitions/tools/biscuit_align.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/definitions/tools/biscuit_align.cwl b/definitions/tools/biscuit_align.cwl index dd24af689..302134ea6 100644 --- a/definitions/tools/biscuit_align.cwl +++ b/definitions/tools/biscuit_align.cwl @@ -9,7 +9,7 @@ requirements: ramMin: 32000 coresMin: 12 - class: DockerRequirement - dockerPull: "chrisamiller/biscuit:latest" + dockerPull: "mgibio/biscuit:0.3.16" - class: SchemaDefRequirement types: - $import: ../types/sequence_data.yml