Skip to content

Commit 295be09

Browse files
committed
Two new workflows:
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
1 parent 17f712b commit 295be09

File tree

3 files changed

+336
-0
lines changed

3 files changed

+336
-0
lines changed

.dockstore.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,12 @@ workflows:
8181
- name: PBMASIsoSeqDemultiplex
8282
subclass: wdl
8383
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
84+
- name: ProcessOnInstrumentDemuxedChunkRefFree
85+
subclass: wdl
86+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
87+
- name: ProcessOnInstrumentDemuxedChunk
88+
subclass: wdl
89+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
8490

8591
###################################################
8692
# TechAgnostic - *mics data processing
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
version 1.0
2+
3+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
4+
5+
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN
6+
7+
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics
8+
9+
import "../../../tasks/Utility/Finalize.wdl" as FF
10+
11+
workflow ProcessOnInstrumentDemuxedChunk {
12+
13+
meta {
14+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
15+
}
16+
17+
parameter_meta {
18+
19+
#########
20+
# inputs
21+
readgroup_id:
22+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"
23+
24+
bam_SM_field:
25+
"value to place in the SM field of the resulting BAM header's @RG line"
26+
27+
platform:
28+
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"
29+
30+
gcs_out_root_dir:
31+
"output files will be copied over there"
32+
33+
qc_metrics_config_json:
34+
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'"
35+
36+
fingerprint_sample_id:
37+
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?"
38+
39+
expected_sex_type:
40+
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"
41+
42+
#########
43+
# outputs
44+
wgs_cov:
45+
"whole genome mean coverage"
46+
47+
nanoplot_summ:
48+
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"
49+
50+
sam_flag_stats:
51+
"SAM flag stats"
52+
53+
fingerprint_check:
54+
"Summary on (human) fingerprint checking results"
55+
56+
contamination_est:
57+
"cross-(human)individual contamination estimation by VerifyBAMID2"
58+
59+
inferred_sex_info:
60+
"Inferred sex concordance information if expected sex type is provided"
61+
62+
methyl_tag_simple_stats:
63+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
64+
65+
aBAM_metrics_files_out:
66+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
67+
}
68+
69+
input {
70+
String gcs_out_root_dir
71+
72+
File uBAM
73+
File? uPBI
74+
75+
String readgroup_id
76+
String bam_SM_field
77+
78+
String platform
79+
80+
# args for optional QC subworkflows
81+
File? qc_metrics_config_json
82+
String? fingerprint_sample_id
83+
String? expected_sex_type
84+
85+
File ref_map_file
86+
String disk_type = "SSD"
87+
}
88+
89+
output {
90+
String last_processing_date = today.yyyy_mm_dd
91+
92+
File aligned_bam = FinalizeAlignedBam.gcs_path
93+
File aligned_bai = FinalizeAlignedBai.gcs_path
94+
File aligned_pbi = FinalizeAlignedPbi.gcs_path
95+
96+
String movie = AlignHiFiUBAM.movie
97+
98+
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
99+
Float wgs_cov = QCandMetrics.wgs_cov
100+
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
101+
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats
102+
103+
# fingerprint
104+
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check
105+
106+
# contam
107+
Float? contamination_est = QCandMetrics.contamination_est
108+
109+
# sex concordance
110+
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info
111+
112+
# methyl
113+
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats
114+
115+
# file-based QC/metrics outputs all packed into a finalization map
116+
Map[String, String] aBAM_metrics_files_out = QCandMetrics.aBAM_metrics_files_out
117+
}
118+
119+
###################################################################################
120+
# prep work
121+
122+
# where to store final results
123+
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
124+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
125+
126+
# String bc_specific_out = outdir + '/' + readgroup_id
127+
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
128+
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id
129+
130+
###################################################################################
131+
# align
132+
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
133+
uBAM = uBAM,
134+
uPBI = uPBI,
135+
bam_sample_name = bam_SM_field,
136+
ref_map_file = ref_map_file,
137+
application = 'HIFI'
138+
}
139+
140+
File aBAM = AlignHiFiUBAM.aligned_bam
141+
File aBAI = AlignHiFiUBAM.aligned_bai
142+
File aPBI = AlignHiFiUBAM.aligned_pbi
143+
144+
# save
145+
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
146+
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
147+
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }
148+
149+
###################################################################################
150+
# QC
151+
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json]))
152+
call QCMetrics.Work as QCandMetrics { input:
153+
bam = aBAM,
154+
bai = aBAI,
155+
156+
tech = platform,
157+
158+
cov_bed = qcm_config.cov_bed,
159+
cov_bed_descriptor = qcm_config.cov_bed_descriptor,
160+
161+
fingerprint_vcf_store = qcm_config.fingerprint_vcf_store,
162+
fingerprint_sample_id = fingerprint_sample_id,
163+
164+
expected_sex_type = expected_sex_type,
165+
166+
vbid2_config_json = qcm_config.vbid2_config_json,
167+
168+
methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor,
169+
170+
ref_map_file = ref_map_file,
171+
disk_type = disk_type,
172+
173+
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
174+
gcs_out_root_dir = bc_specific_metrics_out,
175+
}
176+
177+
###################################################################################
178+
179+
call GU.GetTodayDate as today {}
180+
}
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
version 1.0
2+
3+
4+
import "../../../tasks/Utility/BAMutils.wdl" as BU
5+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
6+
7+
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
8+
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
9+
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
10+
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3
11+
12+
import "../../../tasks/Utility/Finalize.wdl" as FF
13+
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE
14+
15+
workflow ProcessOnInstrumentDemuxedChunkRefFree {
16+
17+
meta {
18+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
19+
}
20+
21+
parameter_meta {
22+
readgroup_id:
23+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"
24+
25+
short_reads_threshold:
26+
"a length threshold below which reads are classified as short"
27+
28+
bam_descriptor:
29+
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"
30+
31+
hifi_stats:
32+
"A few metrics output by Nanoplot"
33+
hifi_fq_stats:
34+
"A few metrics output by seqkit stats"
35+
36+
read_len_summaries:
37+
"A few metrics summarizing the read length distribution"
38+
read_len_peaks:
39+
"Peaks of the read length distribution (heruistic)"
40+
read_len_deciles:
41+
"Deciles of the read length distribution"
42+
43+
methyl_tag_simple_stats:
44+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
45+
46+
uBAM_metrics_files_out:
47+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
48+
}
49+
50+
input {
51+
File uBAM
52+
String readgroup_id
53+
54+
String bam_descriptor
55+
Int short_reads_threshold
56+
57+
String disk_type = "SSD"
58+
59+
String gcs_out_root_dir
60+
}
61+
62+
output {
63+
String movie = movie_name
64+
65+
File hifi_fq = FinalizeFQ.gcs_path
66+
67+
String last_processing_date = today.yyyy_mm_dd
68+
69+
# todo merge these two together
70+
Map[String, Float] hifi_stats = NanoPlotFromUBam.stats_map
71+
Map[String, Float] hifi_fq_stats = FASTQstats.stats
72+
73+
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
74+
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
75+
Array[Int] read_len_deciles = DystPeaker.read_len_deciles
76+
77+
# methylation call rate stats
78+
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats
79+
80+
# file-based outputs all packed into a finalization map
81+
Map[String, String] uBAM_metrics_files_out = FF.result
82+
}
83+
84+
###################################################################################
85+
# prep work
86+
87+
# where to store final results
88+
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
89+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
90+
91+
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
92+
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
93+
String bc_specific_metrics_out = bc_specific_out + "/metrics"
94+
95+
###################################################################################
96+
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']}
97+
String movie_name = RG.read_group_info['PU']
98+
99+
###################################################################################
100+
# convert to FASTQ (most for assembly)
101+
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"}
102+
call FF.FinalizeToFile as FinalizeFQ { input:
103+
outdir = bc_specific_out,
104+
file = BamToFastq.reads_fq,
105+
name = readgroup_id + '.hifi.fq.gz'
106+
}
107+
108+
###################################################################################
109+
# more QCs and metrics
110+
111+
##########
112+
call QC0.NanoPlotFromUBam {input: uBAM = uBAM}
113+
FinalizationManifestLine a = object
114+
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
115+
is_singleton_file: false,
116+
destination: bc_specific_metrics_out + "/nanoplot",
117+
output_attribute_name: "nanoplot"}
118+
##########
119+
call QC1.CountTheBeans as NoMissingBeans { input:
120+
bam=uBAM,
121+
bam_descriptor=bam_descriptor,
122+
gcs_out_root_dir=bc_specific_metrics_out,
123+
use_local_ssd=disk_type=='LOCAL'
124+
}
125+
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
126+
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
127+
##########
128+
call QC2.DystPeaker { input:
129+
input_file=uBAM, input_is_bam=true,
130+
id=readgroup_id, short_reads_threshold=short_reads_threshold,
131+
gcs_out_root_dir=bc_specific_metrics_out
132+
}
133+
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
134+
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}
135+
136+
##########
137+
call QC3.FASTQstats { input:
138+
reads=BamToFastq.reads_fq,
139+
file_type='FASTQ'
140+
}
141+
142+
###################################################################################
143+
call SAVE.SaveFilestoDestination as FF { input:
144+
instructions = select_all([a]),
145+
already_finalized = select_all([methyl_out,
146+
read_len_out])
147+
}
148+
149+
call GU.GetTodayDate as today {}
150+
}

0 commit comments

Comments
 (0)