forked from iwohlers/lied_egypt_genome
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile_genome_alignment
570 lines (520 loc) · 27.6 KB
/
Snakefile_genome_alignment
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
#kate:syntax python;
from global_variables import *
################################################################################
########### Reference to assembly genome alignment with nucmer #################
################################################################################
rule repeatmasked_primary_grch38:
input: expand("repeatmasked_GRCh38/Homo_sapiens.GRCh38.dna.{chrom}.fa.masked", \
chrom=CHR_GRCh38)
output: "repeatmasked_GRCh38/Homo_sapiens.GRCh38.dna.primary_assembly.fa"
shell: "cat {input} > {output}"
rule repeatmasked_primary_egyptrefv2:
input: expand("repeatmasked_EGYPTREFV2/Homo_sapiens.EGYPTREFV2.dna.{scaffold}.fa.masked", \
scaffold=EGYPTREFV2_SCAFFOLDS)
output: "repeatmasked_EGYPTREFV2/Homo_sapiens.EGYPTREFV2.dna.primary_assembly.fa"
shell: "cat {input} > {output}"
rule repeatmasked_primary_cegyptrefv2:
input: expand("repeatmasked_CEGYPTREFV2/Homo_sapiens.CEGYPTREFV2.dna.{chrom}.fa.masked", \
chrom=CEGYPTV2_CONTIGS)
output: "repeatmasked_CEGYPTREFV2/Homo_sapiens.CEGYPTREFV2.dna.primary_assembly.fa"
shell: "cat repeatmasked_CEGYPTREFV2/*.fa.masked > {output}"
rule repeatmasked_primary_egyptref:
input: expand("repeatmasked_EGYPTREF/Homo_sapiens.EGYPTREF.dna.{scaffold}.fa.masked", \
scaffold=EGYPTREF_SCAFFOLDS)
output: "repeatmasked_EGYPTREF/Homo_sapiens.EGYPTREF.dna.primary_assembly.fa"
shell: "cat {input} > {output}"
rule repeatmasked_primary_cegyptref:
input: expand("repeatmasked_CEGYPTREF/Homo_sapiens.CEGYPTREF.dna.{contig}.fa.masked", \
contig=CEGYPT_CONTIGS)
output: "repeatmasked_CEGYPTREF/Homo_sapiens.CEGYPTREF.dna.primary_assembly.fa"
shell: "cat {input} > {output}"
rule repeatmasked_primary_yoruba:
input: expand("repeatmasked_YORUBA/Homo_sapiens.YORUBA.dna.{scaffold}.fa.masked", \
scaffold=YORUBA_SCAFFOLDS)
output: "repeatmasked_YORUBA/Homo_sapiens.YORUBA.dna.primary_assembly.fa"
shell: "cat repeatmasked_YORUBA/Homo_sapiens.YORUBA.dna.*.fa.masked > {output}"
rule repeatmasked_primary_ak1:
input: expand("repeatmasked_AK1/Homo_sapiens.AK1.dna.{scaffold}.fa.masked", \
scaffold=AK1_SCAFFOLDS)
output: "repeatmasked_AK1/Homo_sapiens.AK1.dna.primary_assembly.fa"
shell: "cat repeatmasked_AK1/Homo_sapiens.AK1.dna.*.fa.masked > {output}"
rule repeatmasked_primary_egyptrefwtdbg2:
input: expand("repeatmasked_EGYPTREFWTDBG2/Homo_sapiens.EGYPTREFWTDBG2.dna.{scaffold}.fa.masked", \
scaffold=EGYPTREFWTDBG2_SCAFFOLDS)
output: "repeatmasked_EGYPTREFWTDBG2/Homo_sapiens.EGYPTREFWTDBG2.dna.primary_assembly.fa"
shell: "cat repeatmasked_EGYPTREFWTDBG2/*.fa.masked > {output}"
# Genome alignments using mummer4
# Comparing the entire GRCh38 assembly with the entire EGYPTREF assembly
# --mum: Use anchor matches that are unique in both the reference and
# query (false)
# --threads=NUM: Use NUM threads (# of cores)
# Path to conda environment:
# /scratch/node25_genome_alignment/.snakemake/conda/1d58cf3a/bin
rule align_assemblies_with_nucmer:
input: ref="seq_{a1}/Homo_sapiens.{a1}.dna.primary_assembly.fa",
query="seq_{a2}/Homo_sapiens.{a2}.dna.primary_assembly.fa"
output: "align_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
conda: "envs/mummer.yaml"
shell: "nucmer " + \
"--mum " + \
"--threads=32 " + \
"-p align_nucmer_{wildcards.a1}_vs_{wildcards.a2}/{wildcards.a1}_vs_{wildcards.a2} " + \
"{input[0]} {input[1]}"
################################################################################
######################### Filtering alignments #################################
################################################################################
# Here are the delta-filte options for filtering nucmer-generated alignments
# USAGE: delta-filter [options] <deltafile>
#
# -1 1-to-1 alignment allowing for rearrangements
# (intersection of -r and -q alignments)
# -g 1-to-1 global alignment not allowing rearrangements
# -h Display help information
# -i float Set the minimum alignment identity [0, 100], default 0
# -l int Set the minimum alignment length, default 0
# -m Many-to-many alignment allowing for rearrangements
# (union of -r and -q alignments)
# -q Maps each position of each query to its best hit in
# the reference, allowing for reference overlaps
# -r Maps each position of each reference to its best hit
# in the query, allowing for query overlaps
# -u float Set the minimum alignment uniqueness, i.e. percent of
# the alignment matching to unique reference AND query
# sequence [0, 100], default 0
# -o float Set the maximum alignment overlap for -r and -q options
# as a percent of the alignment length [0, 100], default 100
# Selecting 1-to-1 alignments;
# -1 : 1-to-1 alignment allowing for rearrangements (intersection of -r and -q
# alignments)
rule select_one_to_one_alignments:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_1to1.delta"
conda: "envs/mummer.yaml"
shell: "delta-filter -1 {input[0]} > {output[0]}"
# Mapping each contig to its best position in the reference
# -q Maps each position of each query to its best hit in
# the reference, allowing for reference overlaps
rule select_query_best_hit_in_reference_alignments:
input: "align_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "align_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_q.delta"
conda: "envs/mummer.yaml"
shell: "delta-filter -q {input[0]} > {output[0]}"
# The alignments with no filter applied (simple copying of the original file)
rule select_all_alignments:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_nofilter.delta"
shell: "cp {input[0]} {output[0]}"
rule align_repeatmasked_assemblies_with_nucmer:
input: ref="repeatmasked_{a1}/Homo_sapiens.{a1}.dna.primary_assembly.fa",
query="repeatmasked_{a2}/Homo_sapiens.{a2}.dna.primary_assembly.fa"
output: "alignrepeatmasked_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
conda: "envs/mummer.yaml"
shell: "nucmer " + \
"--mum " + \
"--threads=16 " + \
"-p alignrepeatmasked_nucmer_{wildcards.a1}_vs_{wildcards.a2}/{wildcards.a1}_vs_{wildcards.a2} " + \
"{input[0]} {input[1]}"
rule align_assemblies_with_nucmer_all:
input: expand("align_nucmer_GRCh38_vs_{a}/GRCh38_vs_{a}.delta", \
a=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2","EGYPTREFWTDBG2","AK1","YORUBA","GRCh38"])
# Tiling: Attempts to construct a tiling path out of the query contigs as mapped
# to the reference sequences
# Columns: start ref, end ref, gap between contig and next, length contig,
# alignment coverage of contig, average percent identiy, orientation of contig,
# contig id
rule show_tiling:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.tiling"
conda: "envs/mummer.yaml"
shell: "show-tiling {input} > {output} "
# Showing coordinates of alignments
rule show_coords:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}.coords"
conda: "envs/mummer.yaml"
shell: "show-coords {input} > {output} "
# Plotting the best 1-to-1 mapping
# OPTIONS:
# -b|breaklen Highlight alignments with breakpoints further than
# breaklen nucleotides from the nearest sequence end
# --[no]color Color plot lines with a percent similarity gradient or
# turn off all plot color (default color by match dir)
# If the plot is very sparse, edit the .gp script to plot
# with 'linespoints' instead of 'lines'
# -c
# --[no]coverage Generate a reference coverage plot (default for .tiling)
# --depend Print the dependency information and exit
# -f
# --filter Only display .delta alignments which represent the "best"
# hit to any particular spot on either sequence, i.e. a
# one-to-one mapping of reference and query subsequences
# -h
# --help Display help information and exit
# -l
# --layout Layout a .delta multiplot in an intelligible fashion,
# this option requires the -R -Q options
# --fat Layout sequences using fattest alignment only
# -p|prefix Set the prefix of the output files (default 'out')
# -rv Reverse video for x11 plots
# -r|IdR Plot a particular reference sequence ID on the X-axis
# -q|IdQ Plot a particular query sequence ID on the Y-axis
# -R|Rfile Plot an ordered set of reference sequences from Rfile
# -Q|Qfile Plot an ordered set of query sequences from Qfile
# Rfile/Qfile Can either be the original DNA multi-FastA
# files or lists of sequence IDs, lens and dirs [ /+/-]
# -r|rport Specify the port to send reference ID and position on
# mouse double click in X11 plot window
# -q|qport Specify the port to send query IDs and position on mouse
# double click in X11 plot window
# -s|size Set the output size to small, medium or large
# --small --medium --large (default 'small')
# -S
# --SNP Highlight SNP locations in each alignment
# -t|terminal Set the output terminal to x11, postscript or png
# --x11 --postscript --png (default 'x11')
# -t|title Specify the gnuplot plot title (default none)
# -x|xrange Set the xrange for the plot '[min:max]'
# -y|yrange Set the yrange for the plot '[min:max]'
# -V
# --version Display the version information and exit
rule mummerplot:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.delta",
"data/mummerplot_chromosomes.txt"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.gp",
"{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.ps"
params: outprefix=lambda wildcards: wildcards.aligntype+"_nucmer_"+ \
wildcards.a1+"_vs_"+wildcards.a2+"/"+wildcards.a1+ \
"_vs_"+wildcards.a2+"_"+wildcards.filter
conda: "envs/mummer.yaml"
shell: "mummerplot " + \
"-p {params.outprefix} " + \
"--postscript " + \
"--filter " + \
"--layout " + \
"--medium " + \
"-title {wildcards.a2} " + \
"-R {input[1]} " + \
"{input[0]}; " + \
"gnuplot {output[0]}; "
# aligntype=["align","alignrepeatmasked"], \
# a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
# ["EGYPTREFWTDBG2","YORUBA","AK1","GRCh38"], \
rule mummerplot_all:
input: expand("{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.ps", \
aligntype=["align"], \
a1="GRCh38", \
a2=["EGYPTREFMETAV2ADDED","EGYPTREFWTDBG2V3PILON","EGYPTREFV2","AK1","YORUBA"], \
filter=["nofilter","1to1"])
rule mummerplot_chromosomewise:
input: "{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.delta",
"data/mummerplot_chromosomes.txt"
output: "{aligntype}_nucmer_{a1}_vs_{a2}/chrom_{a1}_vs_{a2}_{filter}_{chrom}.gp",
"{aligntype}_nucmer_{a1}_vs_{a2}/chrom_{a1}_vs_{a2}_{filter}_{chrom}.ps",
params: outprefix=lambda wildcards: wildcards.aligntype+"_nucmer_"+ \
wildcards.a1+"_vs_"+wildcards.a2+"/chrom_"+wildcards.a1+ \
"_vs_"+wildcards.a2+"_"+wildcards.filter+ \
"_"+wildcards.chrom
conda: "envs/mummer.yaml"
shell: "mummerplot " + \
"-p {params.outprefix} " + \
"--postscript " + \
"--filter " + \
"--layout " + \
"--medium " + \
"-title {wildcards.a2} " + \
"-R {input[1]} " + \
"-r {wildcards.chrom} " + \
"{input[0]}; " + \
"gnuplot {output[0]}; "
ASSEMBLIES = ["CEGYPTREFV2"]
CHROMOSOMES = [str(x) for x in range(1,23)]+["X","Y","MT"]
# a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
# ["EGYPTREFWTDBG2","YORUBA","AK1","GRCh38"], \
rule mummerplot_chromosomewise_all:
input: expand("{aligntype}_nucmer_{a1}_vs_{a2}/chrom_{a1}_vs_{a2}_{filter}_{chrom}.ps", \
aligntype=["align","alignrepeatmasked"], \
a1="GRCh38", \
a2=["EGYPTREFWTDBG2"], \
filter=["nofilter","1to1"], \
chrom=CHROMOSOMES)
################################################################################
#################### Detection of differences using Syri #######################
################################################################################
# Parameters for nucmer, delta-filter and show-coords of the mummer3 suite are
# taken from Supplement 2 of the BioRxiv Syri publication here:
# https://www.biorxiv.org/content/10.1101/546622v1
rule syri_align_nucmer:
input: ref="seq_{a1}/Homo_sapiens.{a1}.dna.primary_assembly.fa",
query="seq_{a2}/Homo_sapiens.{a2}.dna.primary_assembly.fa"
output: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
conda: "envs/syri.yaml"
shell: "nucmer " + \
"--maxmatch " + \
"-c 500 " + \
"-b 500 " + \
"-l 100 " + \
"--threads=16 " + \
"-p syri_{wildcards.a1}_vs_{wildcards.a2}/{wildcards.a1}_vs_{wildcards.a2} " + \
"{input[0]} {input[1]}"
rule syri_delta_filter:
input: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}.delta"
output: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}_filtered.delta"
conda: "envs/syri.yaml"
shell: "delta-filter " + \
"-m " + \
"-i 90 " + \
"-l 100 " + \
"{input} > {output}"
rule syri_show_coords:
input: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}_filtered.delta"
output: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}.coords"
conda: "envs/syri.yaml"
shell: "show-coords -THrd {input} > {output}"
# Input Files:
# -c INFILE File containing alignment coordinates in a tsv format
# (default: None)
# -r REF Genome A (which is considered as reference for the
# alignments). Required for local variation (large
# indels, CNVs) identification. (default: None)
# -q QRY Genome B (which is considered as query for the
# alignments). Required for local variation (large
# indels, CNVs) identification. (default: None)
# -d DELTA .delta file from mummer. Required for short variation
# (SNPs/indels) identification when CIGAR string is not
# available (default: None)
# optional arguments:
# -o FOUT Output file name (default: syri)
# -k Keep internediate output files (default: False)
# --log {DEBUG,INFO,WARN}
# log level (default: INFO)
# --lf LOG_FIN Name of log file (default: syri.log)
# --dir DIR path to working directory (if not current directory)
# (default: None)
# --prefix PREFIX Prefix to add before the output file Names (default: )
# --seed SEED seed for generating random numbers (default: 1)
# --nc NCORES number of cores to use in parallel (max is number of
# chromosomes) (default: 1)
# --novcf Do not combine all files into one output file
# (default: False)
#SR identification:
# --nosr Set to skip structural rearrangement identification
# (default: False)
# -b BRUTERUNTIME Cutoff to restrict brute force methods to take too
# much time (in seconds). Smaller values would make
# algorithm faster, but could have marginal effects on
# accuracy. In general case, would not be required.
# (default: 60)
# --unic TRANSUNICOUNT Number of uniques bps for selecting translocation.
# Smaller values would select smaller TLs better, but
# may increase time and decrease accuracy. (default:
# 1000)
# --unip TRANSUNIPERCENT
# Percent of unique region requried to select
# translocation. Value should be in range (0,1]. Smaller
# values would selection of translocation which are more
# overlapped with other regions. (default: 0.5)
# --inc INCREASEBY Minimum score increase required to add another
# alignment to translocation cluster solution (default:
# 1000)
# --no-chrmatch Do not allow SyRI to automatically match chromosome
# ids between the two genomes if they are not equal
# (default: False)
# ShV identification:
# --nosv Set to skip structural variation identification
# (default: False)
# --nosnp Set to skip SNP/Indel (within alignment)
# identification (default: False)
# --all Use duplications too for variant identification
# (default: False)
# --allow-offset OFFSET
# BPs allowed to overlap (default: 0)
# --cigar Find SNPs/indels using CIGAR string. Necessary for
# alignment generated using aligners other than nucmers
# (default: False)
# -s SSPATH path to show-snps from mummer (default: show-snps)
rule syri_run:
input: infile="syri_{a1}_vs_{a2}/{a1}_vs_{a2}.coords",
ref="seq_{a1}/Homo_sapiens.{a1}.dna.primary_assembly.fa",
query="seq_{a2}/Homo_sapiens.{a2}.dna.primary_assembly.fa",
delta="syri_{a1}_vs_{a2}/{a1}_vs_{a2}_filtered.delta"
output: "syri_{a1}_vs_{a2}/{a1}_vs_{a2}.bla"
params: outbase=lambda wildcards: "syri"+"_"+wildcards.a1+"_vs_"+ \
wildcards.a2+"/"+wildcards.a1+"_vs_"+ \
wildcards.a2
conda: "envs/syri.yaml"
shell: "cwd/syri/bin/syri " + \
"-c {input.infile} " + \
"-r {input.ref} " + \
"-q {input.query} " + \
"-d {input.delta} " + \
"-o {params.outbase} " + \
"--dir cwd/syri/ " + \
"--lf {params.outbase}.log " + \
"--nc 16 "
rule syri_run_all:
input: expand("syri_GRCh38_vs_{a2}/GRCh38_vs_{a2}.bla", \
a2=["EGYPTREF","CEGYPTREF","EGYPTREFV2","CEGYPTREFV2"] + \
["YORUBA","AK1","GRCh38"])
################################################################################
## Extracting various information within specified genes (i.e. gene-centric) ###
################################################################################
# These are the genes of interest (ABCC7=CFTR, HD=HDDC3?, Factor V=F5)
GENES = ["CFTR","HDDC3","DMD","BRCA1","BRCA2","TP53","EGFR","APP","PSEN1","F5", \
"CARD11","LAMA4","MRC1","USH2A","PRAMEF17","C1QTNF12","CFAP74","MMEL1", \
"TTC34","GUCY1A1","FADS3"]
# Therefore, obtain a recent Ensembl annotation file first
rule get_ensembl_gene_annotation_gtf:
output: temp("annotations/Homo_sapiens.GRCh38.94.gtf.gz")
shell: "wget -P annotations " + \
"ftp://ftp.ensembl.org/pub/release-94/gtf/homo_sapiens/" + \
"Homo_sapiens.GRCh38.94.gtf.gz "
rule unzip_ensembl_gene_annotation_gtf:
input: "annotations/Homo_sapiens.GRCh38.94.gtf.gz"
output: "annotations/Homo_sapiens.GRCh38.94.gtf"
shell: "gzip -d {input}"
rule gc_get_gene_annotation:
input: "annotations/Homo_sapiens.GRCh38.94.gtf"
output: "gene_centric/{gene}/{gene}.gtf"
shell: "cat {input} | grep '#' > {output}; " + \
"cat {input} | grep 'gene_name \"{wildcards.gene}\";' >> {output}"
# How many bases left and right of gene boundaries to consider
WINDOW = {
"CFTR": [100000,100000],
"HDDC3": [100000,100000],
"DMD": [100000,100000],
"BRCA1": [100000,100000],
"BRCA2": [100000,100000],
"TP53": [100000,100000],
"EGFR": [100000,100000],
"APP": [100000,100000],
"PSEN1": [100000,100000],
"F5": [100000,100000],
"CARD11": [100000,100000],
"LAMA4": [100000,100000],
"MRC1": [100000,100000],
"USH2A": [100000,100000],
"FADS1": [100000,100000],
"FADS2": [100000,100000],
"PRAMEF17": [100000,100000],
"C1QTNF12": [100000,100000],
"CFAP74": [100000,100000],
"MMEL1": [100000,100000],
"TTC34": [100000,100000],
"GUCY1A1": [100000,100000],
"FADS3": [100000,100000]
}
rule gc_get_start_end_position:
input: "gene_centric/{gene}/{gene}.gtf"
output: "gene_centric/{gene}/{gene}.bed"
run:
with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
f_out.write("# Custom bed file for region around gene\n")
for line in f_in:
if line[0] == "#":
continue
s = line.split("\t")
if s[2] == "gene":
chr = s[0]
start = str(int(s[3]) - WINDOW[wildcards.gene][0])
end = str(int(s[4]) + WINDOW[wildcards.gene][1])
strand = s[6]
f_out.write("\t".join([chr,start,end,'.','.',strand])+"\n")
rule gc_get_overlapping_genes:
input: "annotations/Homo_sapiens.GRCh38.94.gtf",
"gene_centric/{gene}/{gene}.bed"
output: "gene_centric/{gene}/{gene}_overlapping.gtf"
run:
with open(input[1],"r") as f_in:
for line in f_in:
if line[0] == "#":
continue
s = line.split('\t')
[q_chrom,q_start,q_end] = s[:3]
with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
for line in f_in:
if line[0] == "#":
continue
s = line.split("\t")
chrom,start,end = s[:3]
if chrom == q_chrom:
if q_start<start<q_end or q_start<end<q_end:
f_out.write(line)
rule gc_delta_format:
input: align="{aligntype}_nucmer_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}.delta",
gene="gene_centric/{gene}/{gene}.bed"
output: "gene_centric/{gene}/{aligntype}_{gene}_{a1}_vs_{a2}_{filter}.delta"
script: "scripts/filter_delta_alignment_file.py"
# Running the tool nucdiff to compare two assemblies based on alignment with
# mummer, which is also performed by the nucdiff tool; therefore, use 1to1
# alignments, such that at every position only one alignment matches
# "--filter_opt '-l 1000 -i 99' " +
rule gc_run_nucdiff:
input: ref="seq_{a1}/Homo_sapiens.{a1}.dna.primary_assembly.fa",
query="seq_{a2}/Homo_sapiens.{a2}.dna.primary_assembly.fa",
delta="gene_centric/{gene}/{aligntype}_{gene}_{a1}_vs_{a2}_1to1.delta"
output: "gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/{a1}_vs_{a2}.delta",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_struct.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_blocks.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_query_snps.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_query_struct.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_query_blocks.gff",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_query_snps.vcf",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_stat.out"
params: outdir=lambda wildcards: "gene_centric/"+wildcards.gene+"/nucdiff_"+wildcards.aligntype+"_"+wildcards.a1+"_vs_"+wildcards.a2
conda: "envs/nucdiff.yaml"
shell: "cp {input.delta} {output[0]}; " + \
"nucdiff {input.ref} {input.query} {params.outdir} " + \
"{wildcards.a1}_vs_{wildcards.a2} " + \
"--vcf yes " + \
"--delta_file {input.delta} " + \
"--proc 8"
rule gc_run_nucdiff_all:
input: expand("gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_stat.out", \
gene=GENES,aligntype=["align"],a1="GRCh38", \
a2=["CEGYPTREFV2","EGYPTREFWTDBG2","EGYPTREFWTDBG2V2","EGYPTREFWTDBG2V3"])
# VCF file from assembly here get annotated with dbsnp IDs
# Compressing and indexing of files to be used with vcf-merge
rule gc_index_snps:
input: "gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf"
output: "gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf.gz",
"gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf.gz.tbi"
conda: "envs/rsid_annotate.yaml"
shell: "cat {input} | bgzip > {output[0]}; tabix -p vcf {output[0]}"
rule gc_annotate_rsids:
input: vcf="gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf.gz",
vcf_index="gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps.vcf.gz.tbi",
dbsnp="dbsnp_GRCh38/dbsnp.vcf.gz",
dbsnp_index="dbsnp_GRCh38/dbsnp.vcf.gz.tbi"
output: "gene_centric/{gene}/nucdiff_{aligntype}_{a1}_vs_{a2}/results/{a1}_vs_{a2}_ref_snps_annotated.vcf.gz"
conda: "envs/rsid_annotate.yaml"
shell: "bcftools annotate --annotations {input.dbsnp} " + \
"--columns ID " + \
"--output {output} " + \
"--output-type z " + \
"{input.vcf} "
# Plot the aligned contigs/scaffolds for this region using mummerplot
# An example plot is gene_centric/FADS1/mummerplot_align_GRCh38_vs_CEGYPTREFV2/GRCh38_vs_CEGYPTREFV2_nofilter_11_000095F.gp
rule gc_mummerplot:
input: "gene_centric/{gene}/{aligntype}_{gene}_{a1}_vs_{a2}_{filter}.delta"
output: "gene_centric/{gene}/mummerplot_{aligntype}_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}_{r}_{q}.gp",
"gene_centric/{gene}/mummerplot_{aligntype}_{a1}_vs_{a2}/{a1}_vs_{a2}_{filter}_{r}_{q}.ps"
params: outprefix=lambda wildcards: "gene_centric/"+wildcards.gene+ \
"/mummerplot_"+wildcards.aligntype+"_"+wildcards.a1+ \
"_vs_"+wildcards.a2+"/"+wildcards.a1+"_vs_"+wildcards.a2+ \
"_"+wildcards.filter+"_"+wildcards.r+"_"+wildcards.q
conda: "envs/mummer.yaml"
shell: "mummerplot " + \
"-p {params.outprefix} " + \
"--postscript " + \
"--layout " + \
"--medium " + \
"-title {wildcards.gene} " + \
"-r {wildcards.r} " + \
"-q {wildcards.q} " + \
"--SNP " + \
# "-x [*:*] " + \
# "-y [*:*] " + \
"{input[0]}; " + \
"gnuplot {output[0]}; "