forked from nebiolabs/EM-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
em-seq.nf
481 lines (388 loc) · 16 KB
/
em-seq.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
// this consumes fastq files, adapter trims, then aligns reads to the specified reference using bwa-meth
// mode can be set to tile_fastqs or run_fastqs depending on whether the system should map each tile's reads in distinct jobs then combine (tile_fastqs)
// or all reads for a library in a single job (run_fastqs)
flowcell = params.flowcell
genome = params.genome
params.tmp_dir = '/tmp'
outputPath = params.outdir = 'output'
fastq_mode = params.fastq_mode = 'run_fastqs'
println "Processing " + flowcell + "... => " + outputPath
fastq_glob = params.fastq_glob ?: '*.{1,2}.fastq*'
Channel.fromFilePairs(fastq_glob)
.map{ lib,read -> [flowcell: flowcell, library:lib, insert_read1:read[0], insert_read2:read[1], barcode:'N', lane:'all', tile:'all' ]}.set{fq_set_channel}
process mapping {
cpus fastq_mode == 'tile-fastq' ? 4 : 16
errorStrategy 'retry'
tag { [flowcell, fq_set.library] }
conda "bwameth=0.2.2 seqtk=1.3 sambamba=0.7.0 fastp=0.20.1 mark-nonconverted-reads=1.1"
input:
val fq_set from fq_set_channel
output:
set val(fq_set.library), file("*.aln.bam") into aligned_files
set val(fq_set.library), file("*.nonconverted.tsv") into nonconverted_counts
set val(fq_set.library), file("*_fastp.json") into fastp_log_files
shell:
'''
inst_name=$(zcat -f '!{fq_set.insert_read1}' | head -n 1 | cut -f 1 -d ':' | sed 's/^@//')
fastq_barcode=$(zcat -f '!{fq_set.insert_read1}' | head -n 1 | sed -r 's/.*://')
if [[ "${inst_name:0:2}" == 'A0' ]] || [[ "${inst_name:0:2}" == 'NS' ]] || \
[[ "${inst_name:0:2}" == 'NB' ]] || [[ "${inst_name:0:2}" == 'VH' ]] ; then
trim_polyg='--trim_poly_g'
echo '2-color instrument: poly-g trim mode on'
else
trim_polyg=''
fi
seqtk mergepe <(zcat -f "!{fq_set.insert_read1}") <(zcat -f "!{fq_set.insert_read2}") \
| fastp --stdin --stdout -l 2 -Q ${trim_polyg} --interleaved_in --overrepresentation_analysis \
-j "!{fq_set.library}_fastp.json" 2> fastp.stderr \
| bwameth.py -p -t !{task.cpus} --read-group "@RG\\tID:${fastq_barcode}\\tSM:!{fq_set.library}" --reference !{genome} /dev/stdin \
2> "!{fq_set.library}_${fastq_barcode}!{fq_set.flowcell}_!{fq_set.lane}_!{fq_set.tile}.log.bwamem" \
| mark-nonconverted-reads.py 2> "!{fq_set.library}_${fastq_barcode}_!{fq_set.flowcell}_!{fq_set.lane}_!{fq_set.tile}.nonconverted.tsv" \
| sambamba view -t 2 -S -f bam -o "!{fq_set.library}_${fastq_barcode}_!{fq_set.flowcell}_!{fq_set.lane}_!{fq_set.tile}.aln.bam" /dev/stdin 2> sambamba.stderr;
'''
}
process mergeAndMarkDuplicates {
cpus 8
errorStrategy 'retry'
tag { library }
publishDir "${outputPath}", mode: 'copy', pattern: '*.{md.bam}*'
conda "samtools=1.9 samblaster=0.1.24 sambamba=0.7.0"
input:
set val(library), file(libraryBam) from aligned_files.groupTuple()
output:
set val(library), file('*.md.bam'), file('*.md.bam.bai') into md_bams
file('*.samblaster') into samblaster_logs
shell:
'''
samtools cat -b <( find . -name '*.aln.bam' ) \
| samtools view -h /dev/stdin \
| samblaster 2> !{library}.log.samblaster \
| sambamba view -t 2 -l 0 -S -f bam /dev/stdin \
| sambamba sort --tmpdir=!{params.tmp_dir} -t !{task.cpus} -m 20GB -o !{library}.md.bam /dev/stdin
'''
}
// we need to send the same file into multiple channels for downstream
// analysis whether they came from aligned bams or fastqs
md_bams.into { md_files_for_mbias; md_files_for_extract; md_files_for_fastqc;
md_files_for_samstats; md_files_for_picard;
md_files_for_goleft; md_files_for_picard_gc; md_files_for_samflagstats;
md_files_for_aggregate; md_files_for_human_reads;
}
process methylDackel_mbias {
cpus 8
errorStrategy 'retry'
tag {library}
conda "methyldackel=0.4.0 samtools=1.9"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_mbias.groupTuple()
output:
file('*.svg') into mbias_output_svg
file('*.tsv') into mbias_output_tsv
tuple library, file('*.tsv') into mbias_for_aggregate
shell:
'''
echo -e "chr\tcontext\tstrand\tRead\tPosition\tnMethylated\tnUnmethylated\tnMethylated(+dups)\tnUnmethylated(+dups)" > !{library}_combined_mbias.tsv
chrs=(`samtools view -H !{md_file} | grep @SQ | cut -f 2 | sed 's/SN://'| grep -v _random | grep -v chrUn | sed 's/|/\\|/'`)
for chr in ${chrs[*]}; do
for context in CHH CHG CpG; do
arg=''
if [ $context = 'CHH' ]; then
arg='--CHH --noCpG'
elif [ $context = 'CHG' ]; then
arg='--CHG --noCpG'
fi
# need two calls to add columns containing the counts without filtering duplicate reads (for rrEM-seq where start/end is constrained)
# not sure why we need both --keepDupes and -F, probably a bug in mbias
join -t $'\t' -j1 -o 1.2,1.3,1.4,1.5,1.6,2.5,2.6 -a 1 -e 0 \
<( \
MethylDackel mbias --noSVG $arg -@ !{task.cpus} -r $chr !{genome} !{md_file} | \
tail -n +2 | awk '{print $1"-"$2"-"$3"\t"$0}' | sort -k 1b,1
) \
<( \
MethylDackel mbias --noSVG --keepDupes -F 2816 $arg -@ !{task.cpus} -r $chr !{genome} !{md_file} | \
tail -n +2 | awk '{print $1"-"$2"-"$3"\t"$0}' | sort -k 1b,1
) \
| sed "s/^/${chr}\t${context}\t/" \
>> !{library}_combined_mbias.tsv
done
done
# makes the svg files for trimming checks
MethylDackel mbias -@ !{task.cpus} --noCpG --CHH --CHG -r ${chrs[0]} !{genome} !{md_file} !{library}_chn
for f in *chn*.svg; do sed -i "s/Strand<\\/text>/Strand $f ${chrs[0]} CHN <\\/text>/" $f; done;
MethylDackel mbias -@ !{task.cpus} -r ${chrs[0]} !{genome} !{md_file} !{library}_cpg
for f in *cpg*.svg; do sed -i "s/Strand<\\/text>/Strand $f ${chrs[0]} CpG<\\/text>/" $f; done;
'''
}
process methylDackel_extract {
cpus 8
tag {library}
publishDir "${outputPath}", mode: 'copy'
conda "methyldackel=0.4.0 pigz=2.4"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_extract.groupTuple()
output:
tuple library, file('*.methylKit.gz') into extract_output
shell:
'''
MethylDackel extract --methylKit --OT 0,0,0,95 --OB 0,0,5,0 -@ !{task.cpus} --CHH --CHG -o !{library} !{genome} !{md_file}
pigz -p !{task.cpus} *.methylKit
'''
}
process select_human_reads {
cpus 8
tag {library}
conda "sambamba=0.7.1 bedtools=2.29.2"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_human_reads.groupTuple()
output:
tuple library, file('*.human.bam') into human_bams_gc
tuple library, file('*.human.bam') into human_bams_inserts
shell:
'''
sambamba view -t 8 -l 0 -f bam !{md_file} chr1 chr2 chr3 chr4 chr5 chr6 \
chr7 chr8 chr9 chr10 chr11 chr12 \
chr13 chr14 chr15 chr16 chr17 chr18 \
chr19 chr20 chr21 chr22 chrX chrY \
> !{md_file}.human.bam
'''
}
process runFastQC {
cpus 1
errorStrategy 'retry'
tag { library }
conda "fastqc=0.11.8"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_fastqc.groupTuple()
output:
file('*_fastqc.zip') into fastqc_results
tuple library, file('*_fastqc.zip') into fastqc_results_for_aggregate
shell:
'''
fastqc -f bam !{md_file}
'''
}
process sum_nonconverted_reads {
input:
tuple library, file(count_files) from nonconverted_counts.groupTuple()
output:
file('*-nonconverted-counts.tsv') into cat_nonconversions
tuple library, file('*-nonconverted-counts.tsv') into nonconverted_counts_for_aggregate
shell:
'''
files=(*.tsv)
paste *.tsv | awk -v numFiles=${#files[@]} -v OFS='\t' '
{
row = sep = ""
for(i=1; i < NF/numFiles; ++i) { row = row sep $i; sep = OFS }
sum = $(NF/numFiles) # last header col. / (1st) data col. to sum
for(i=2; i<=numFiles; ++i) sum += $(NF/numFiles * i) # add other cols.
printf "%s%s%s\\n", row, OFS, sum
}' > tmp-counts.tsv
awk '{print "!{library}\t" $0}' tmp-counts.tsv > !{library}-nonconverted-counts.tsv
'''
}
process combine_nonconversion {
publishDir "${outputPath}", mode: 'copy'
input:
file ('*') from cat_nonconversions
output:
file ("combined-nonconverted.tsv")
shell:
'''
cat *.tsv > combined-nonconverted.tsv
'''
}
process samtools_flagstats {
cpus 2
errorStrategy 'retry'
tag { library }
conda "samtools=1.9"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_samflagstats.groupTuple(by: 0)
output:
file('*.flagstat') into flagstats
file('*.idxstat') into idxstats
tuple library, file('*.flagstat') into flagstats_for_aggregate
tuple library, file('*.idxstat') into idxstats_for_aggregate
shell:
'''
samtools flagstat -@!{task.cpus} !{md_file} > !{md_file}.flagstat
samtools idxstats !{md_file} > !{md_file}.idxstat
'''
}
process samtools_stats {
cpus 2
errorStrategy 'retry'
tag { library }
conda "samtools=1.9"
input:
tuple library, file(md_file),file(md_bai) from md_files_for_samstats.groupTuple()
output:
file('*.samstat') into samstats
shell:
'''
samtools stats -@!{task.cpus} !{md_file} > !{md_file}.samstat
'''
}
process picard_gc_bias {
cpus 1
errorStrategy 'retry'
tag { library }
conda "picard=2.20.7"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_picard_gc.groupTuple()
output:
file('*gc_metrics') into picard_gc_stats
shell:
'''
picard -Xmx4g CollectGcBiasMetrics IS_BISULFITE_SEQUENCED=true VALIDATION_STRINGENCY=LENIENT I=!{md_file} O=!{md_file}.gc_metrics S=!{md_file}.gc_summary_metrics CHART=!{md_file}.gc.pdf R=!{genome}
'''
}
process picard_stats {
cpus 4
errorStrategy 'retry'
tag { library }
conda "picard=2.20.7"
input:
tuple library, file(md_file), file(md_bai) from md_files_for_picard.groupTuple()
output:
file('*_metrics') into picard_stats
shell:
'''
picard -Xmx16g CollectInsertSizeMetrics VALIDATION_STRINGENCY=LENIENT I=!{md_file} O=!{md_file}.insertsize_metrics MINIMUM_PCT=0 HISTOGRAM_FILE=/dev/null
'''
}
process human_gc_bias {
cpus 1
errorStrategy 'retry'
tag { library }
conda "picard=2.20.7"
input:
tuple library, file(md_file) from human_bams_gc.groupTuple()
output:
tuple library, file('*gc_metrics') into human_gc_stats_for_aggregate
shell:
'''
picard -Xmx4g CollectGcBiasMetrics IS_BISULFITE_SEQUENCED=true VALIDATION_STRINGENCY=LENIENT I=!{md_file} O=!{md_file}.gc_metrics S=!{md_file}.gc_summary_metrics CHART=!{md_file}.gc.pdf R=!{genome}
'''
}
process human_insert_size {
cpus 4
errorStrategy 'retry'
tag { library }
conda "picard=2.20.7"
input:
tuple library, file(md_file) from human_bams_inserts.groupTuple()
output:
tuple library, file('*insertsize_metrics') into human_stats_for_aggregate
shell:
'''
picard -Xmx16g CollectInsertSizeMetrics VALIDATION_STRINGENCY=LENIENT I=!{md_file} O=!{md_file}.insertsize_metrics MINIMUM_PCT=0.0001 HISTOGRAM_FILE=/dev/null
'''
}
process goleft {
cpus 1
conda 'goleft=0.2.0'
input:
tuple library, file(md_file), file(md_bai) from md_files_for_goleft.groupTuple()
output:
file("${library}/*-indexcov.ped") into goleft_ped
file("${library}/*-indexcov.roc") into goleft_roc
shell:
'''
goleft indexcov --directory !{library} *.bam
'''
}
process multiqc {
cpus 1
publishDir "${outputPath}", mode: 'copy'
conda "multiqc=1.7"
input:
file('*') from fastqc_results.flatten().toList()
file('*') from flagstats.flatten().toList()
file('*') from idxstats.flatten().toList()
file('*') from samstats.flatten().toList()
file('*') from picard_stats.flatten().toList()
file('*') from picard_gc_stats.flatten().toList()
file('*') from goleft_ped.flatten().toList()
file('*') from goleft_roc.flatten().toList()
file('*') from samblaster_logs.flatten().toList()
file('*') from fastp_log_files.flatten().toList()
output:
file "*report.html"
shell:
'''
for file in $(cat input.* | sed -e 's/\\[//g' | sed -e 's/, \\|\\]/\\n/g'); do ln -s ${file} ./; done
cat <<CONFIG > multiqc_config.yaml
title: Bwameth Alignment Summary - !{flowcell}
extra_fn_clean_exts:
- '.md'
- '_fastp'
custom_plot_config:
picard_insert_size:
xmax: 1000
table_columns_placement:
Samtools Stats:
raw_total_sequences: 10
reads_mapped_percent: 20
reads_properly_paired_percent: 30
reads_MQ0_percent: 35
Samblaster:
pct_dups: 40
Picard:
summed_median: 50
table_columns_visible:
Picard:
PCT_PF_READS_ALIGNED: False
summed_mean: False
Samtools Stats:
reads_mapped: False
mapped_passed: False
non-primary_alignments: False
reads_MQ0_percent: True
Samtools Flagstat:
mapped_passed: False
samtools_idxstats_always:
- plasmid_puc19c
- phage_lambda
FastQC:
percent_duplicates: False
total_sequences: False
avg_sequence_length: False
percent_fails: False
total_sequences: False
CONFIG
multiqc -ip .
'''
}
process combine_mbias_tsv {
publishDir "${outputPath}", mode: 'copy', pattern: 'combined*'
input:
file(tsv) from mbias_output_tsv
output:
file "combined-mbias.tsv"
shell:
'''
echo -ne 'flowcell\tlibrary\t' > combined-mbias.tsv
ls *_mbias.tsv | head -n 1 | xargs head -n 1 >> combined-mbias.tsv
for f in *_mbias.tsv; do
filebase=`basename "${f}" _combined_mbias.tsv`
paste <( yes "!{flowcell} ${filebase}" | head -n `nl "$f" | tail -n 1 | cut -f 1` ) "$f" | tail -n +2 >> combined-mbias.tsv
done
'''
}
process combine_mbias_svg {
publishDir "${outputPath}", mode: 'copy', pattern: 'combined*'
conda 'cairosvg=2.4.2 ghostscript=9.22'
input:
file(svg) from mbias_output_svg.groupTuple()
output:
file "combined-mbias.pdf"
shell:
'''
for f in *.svg; do
cairosvg <(sed s/-nan/-1/ $f) -o $f.pdf
done
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=combined-mbias.pdf *.pdf
'''
}