-
Notifications
You must be signed in to change notification settings - Fork 9
/
giraffe_and_deepvariant_fromGAF.wdl
249 lines (233 loc) · 11.5 KB
/
giraffe_and_deepvariant_fromGAF.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
version 1.0
import "../tasks/bioinfo_utils.wdl" as utils
import "../tasks/gam_gaf_utils.wdl" as gautils
import "../tasks/vg_map_hts.wdl" as map
import "../tasks/deepvariant.wdl" as dv
workflow GiraffeDeepVariantFromGAF {
meta {
description: "## Giraffe-DeepVariant-fromGAF workflow \n Surject a GAF and call small variants with DeepVariant. More information at [https://github.com/vgteam/vg_wdl/tree/gbz#giraffe-deepvariant-from-gaf-workflow](https://github.com/vgteam/vg_wdl/tree/gbz#giraffe-deepvariant-from-gaf-workflow)."
}
parameter_meta {
INPUT_GAF: "Input gzipped GAF file"
GBZ_FILE: "Path to .gbz index file"
SAMPLE_NAME: "The sample name"
OUTPUT_SINGLE_BAM: "Should a single merged BAM file be saved? If yes, unmapped reads will be inluded and 'calling bams' (one per contig) won't be outputed. Default is 'true'."
PAIRED_READS: "Are the reads paired? Default is 'true'."
PATH_LIST_FILE: "(OPTIONAL) Text file where each line is a path name in the GBZ index, to use instead of CONTIGS. If neither is given, paths are extracted from the GBZ and subset to chromosome-looking paths."
CONTIGS: "(OPTIONAL) Desired reference genome contigs, which are all paths in the GBZ index."
REFERENCE_PREFIX: "Remove this off the beginning of path names in surjected BAM (set to match prefix in PATH_LIST_FILE)"
REFERENCE_FILE: "(OPTIONAL) If specified, use this FASTA reference instead of extracting it from the graph. Required if the graph does not contain all bases of the reference."
REFERENCE_INDEX_FILE: "(OPTIONAL) If specified, use this .fai index instead of indexing the reference file."
REFERENCE_DICT_FILE: "(OPTIONAL) If specified, use this pre-computed .dict file of sequence lengths. Required if REFERENCE_INDEX_FILE is set"
LEFTALIGN_BAM: "Whether or not to left-align reads in the BAM. Default is 'true'."
REALIGN_INDELS: "Whether or not to realign reads near indels. Default is 'true'."
REALIGNMENT_EXPANSION_BASES: "Number of bases to expand indel realignment targets by on either side, to free up read tails in slippery regions. Default is 160."
MIN_MAPQ: "Minimum MAPQ of reads to use for calling. 4 is the lowest at which a mapping is more likely to be right than wrong. Default is 1"
MAX_FRAGMENT_LENGTH: "Maximum distance at which to mark paired reads properly paired. Default is 3000."
DV_MODEL_META: ".meta file for a custom DeepVariant calling model"
DV_MODEL_INDEX: ".index file for a custom DeepVariant calling model"
DV_MODEL_DATA: ".data-00000-of-00001 file for a custom DeepVariant calling model"
DV_KEEP_LEGACY_AC: "Should DV use the legacy allele counter behavior? Default is 'true'."
DV_NORM_READS: "Should DV normalize reads itself? Default is 'fasle'."
OTHER_MAKEEXAMPLES_ARG: "Additional arguments for the make_examples step of DeepVariant"
VG_CORES: "Number of cores to use when projecting the reads. Default is 16."
VG_MEM: "Memory, in GB, to use when projecting the reads. Default is 120."
CALL_CORES: "Number of cores to use when calling variants. Default is 8."
CALL_MEM: "Memory, in GB, to use when calling variants. Default is 50."
}
input {
File INPUT_GAF
File GBZ_FILE
String SAMPLE_NAME
Boolean OUTPUT_SINGLE_BAM = true
Boolean PAIRED_READS = true
File? PATH_LIST_FILE
Array[String]+? CONTIGS
String REFERENCE_PREFIX = ""
File? REFERENCE_FILE
File? REFERENCE_INDEX_FILE
File? REFERENCE_DICT_FILE
Boolean LEFTALIGN_BAM = true
Boolean REALIGN_INDELS = true
Int REALIGNMENT_EXPANSION_BASES = 160
Int MIN_MAPQ = 1
Int MAX_FRAGMENT_LENGTH = 3000
File? DV_MODEL_META
File? DV_MODEL_INDEX
File? DV_MODEL_DATA
Boolean DV_KEEP_LEGACY_AC = true
Boolean DV_NORM_READS = false
String OTHER_MAKEEXAMPLES_ARG = ""
Int VG_CORES = 16
Int VG_MEM = 120
Int CALL_CORES = 8
Int CALL_MEM = 50
}
# Which path names to work on?
if (!defined(CONTIGS)) {
if (!defined(PATH_LIST_FILE)) {
# Extract path names to call against from GBZ file if PATH_LIST_FILE input not provided
# Filter down to major paths, because GRCh38 includes thousands of
# decoys and unplaced/unlocalized contigs, and we can't efficiently
# scatter across them, nor do we care about accuracy on them, and also
# calling on the decoys is semantically meaningless.
call map.extractSubsetPathNames {
input:
in_gbz_file=GBZ_FILE,
in_extract_mem=VG_MEM
}
}
}
if (defined(CONTIGS)) {
# Put the paths in a file to use later. We know the value is defined,
# but WDL is a bit low on unboxing calls for optionals so we use
# select_first.
File written_path_names_file = write_lines(select_first([CONTIGS]))
}
File pipeline_path_list_file = select_first([PATH_LIST_FILE, extractSubsetPathNames.output_path_list_file, written_path_names_file])
# To make sure that we have a FASTA reference with a contig set that
# exactly matches the graph, we generate it ourselves, from the graph.
if (!defined(REFERENCE_FILE)) {
call map.extractReference {
input:
in_gbz_file=GBZ_FILE,
in_path_list_file=pipeline_path_list_file,
in_prefix_to_strip=REFERENCE_PREFIX,
in_extract_mem=VG_MEM
}
}
File reference_file = select_first([REFERENCE_FILE, extractReference.reference_file])
if (!defined(REFERENCE_INDEX_FILE)) {
call utils.indexReference {
input:
in_reference_file=reference_file
}
}
File reference_index_file = select_first([REFERENCE_INDEX_FILE, indexReference.reference_index_file])
File reference_dict_file = select_first([REFERENCE_DICT_FILE, indexReference.reference_dict_file])
call gautils.surjectGAFtoBAM {
input:
in_gaf_file=INPUT_GAF,
in_gbz_file=GBZ_FILE,
in_path_list_file=pipeline_path_list_file,
in_sample_name=SAMPLE_NAME,
in_max_fragment_length=MAX_FRAGMENT_LENGTH,
in_paired_reads=PAIRED_READS,
nb_cores=VG_CORES,
mem_gb=VG_MEM
}
call utils.sortBAM {
input:
in_bam_file=surjectGAFtoBAM.output_bam_file,
in_ref_dict=reference_dict_file,
in_prefix_to_strip=REFERENCE_PREFIX
}
# Split merged alignment by contigs list
call utils.splitBAMbyPath {
input:
in_sample_name=SAMPLE_NAME,
in_merged_bam_file=sortBAM.sorted_bam,
in_merged_bam_file_index=sortBAM.sorted_bam_index,
in_path_list_file=pipeline_path_list_file,
in_prefix_to_strip=REFERENCE_PREFIX
}
##
## Call variants with DeepVariant in each contig
##
scatter (bam_and_index_for_path in zip(splitBAMbyPath.bam_contig_files, splitBAMbyPath.bam_contig_files_index)) {
## Evantually shift and realign reads
if (LEFTALIGN_BAM){
# Just left-shift each read individually
call utils.leftShiftBAMFile {
input:
in_bam_file=bam_and_index_for_path.left,
in_reference_file=reference_file,
in_reference_index_file=reference_index_file
}
}
if (REALIGN_INDELS) {
File forrealign_bam = select_first([leftShiftBAMFile.output_bam_file, bam_and_index_for_path.left])
File forrealign_index = select_first([leftShiftBAMFile.output_bam_index_file, bam_and_index_for_path.right])
# Do indel realignment
call utils.prepareRealignTargets {
input:
in_bam_file=forrealign_bam,
in_bam_index_file=forrealign_index,
in_reference_file=reference_file,
in_reference_index_file=reference_index_file,
in_reference_dict_file=reference_dict_file,
in_expansion_bases=REALIGNMENT_EXPANSION_BASES
}
call utils.runAbraRealigner {
input:
in_bam_file=forrealign_bam,
in_bam_index_file=forrealign_index,
in_target_bed_file=prepareRealignTargets.output_target_bed_file,
in_reference_file=reference_file,
in_reference_index_file=reference_index_file
}
}
File calling_bam = select_first([runAbraRealigner.indel_realigned_bam, leftShiftBAMFile.output_bam_file, bam_and_index_for_path.left])
File calling_bam_index = select_first([runAbraRealigner.indel_realigned_bam_index, leftShiftBAMFile.output_bam_index_file, bam_and_index_for_path.right])
## DeepVariant calling
call dv.runDeepVariantMakeExamples {
input:
in_sample_name=SAMPLE_NAME,
in_bam_file=calling_bam,
in_bam_file_index=calling_bam_index,
in_reference_file=reference_file,
in_reference_index_file=reference_index_file,
in_min_mapq=MIN_MAPQ,
in_keep_legacy_ac=DV_KEEP_LEGACY_AC,
in_norm_reads=DV_NORM_READS,
in_other_makeexamples_arg=OTHER_MAKEEXAMPLES_ARG,
in_call_cores=CALL_CORES,
in_call_mem=CALL_MEM
}
call dv.runDeepVariantCallVariants {
input:
in_sample_name=SAMPLE_NAME,
in_reference_file=reference_file,
in_reference_index_file=reference_index_file,
in_examples_file=runDeepVariantMakeExamples.examples_file,
in_nonvariant_site_tf_file=runDeepVariantMakeExamples.nonvariant_site_tf_file,
in_model_meta_file=DV_MODEL_META,
in_model_index_file=DV_MODEL_INDEX,
in_model_data_file=DV_MODEL_DATA,
in_call_cores=CALL_CORES,
in_call_mem=CALL_MEM
}
}
# Merge distributed variant called VCFs
call utils.concatClippedVCFChunks {
input:
in_sample_name=SAMPLE_NAME,
in_clipped_vcf_chunk_files=runDeepVariantCallVariants.output_vcf_file
}
call utils.concatClippedVCFChunks as concatClippedGVCFChunks {
input:
in_sample_name=SAMPLE_NAME,
in_clipped_vcf_chunk_files=runDeepVariantCallVariants.output_gvcf_file
}
if (OUTPUT_SINGLE_BAM){
call utils.mergeAlignmentBAMChunks as mergeBAM {
input:
in_sample_name=SAMPLE_NAME,
in_alignment_bam_chunk_files=select_all(flatten([calling_bam, [splitBAMbyPath.bam_unmapped_file]]))
}
}
if (!OUTPUT_SINGLE_BAM){
Array[File] output_calling_bam_files = calling_bam
Array[File] output_calling_bam_index_files = calling_bam_index
}
output {
File output_vcf = concatClippedVCFChunks.output_merged_vcf
File output_vcf_index = concatClippedVCFChunks.output_merged_vcf_index
File output_gvcf = concatClippedGVCFChunks.output_merged_vcf
File output_gvcf_index = concatClippedGVCFChunks.output_merged_vcf_index
File? output_bam = mergeBAM.merged_bam_file
File? output_bam_index = mergeBAM.merged_bam_file_index
Array[File]? output_calling_bams = output_calling_bam_files
Array[File]? output_calling_bam_indexes = output_calling_bam_index_files
}
}