forked from iwohlers/lied_egypt_genome
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile_phasing
248 lines (230 loc) · 11.3 KB
/
Snakefile_phasing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#kate:syntax python;
import gzip
# Grob habe ich nun alle Varianten (auch Indels) mit Phasing genommen, die
# Erfolgreich auf hg19 geliftet wurden
# In hg19 das gleiche Referenz-Allele wie in hg38 haben (das wechselt ganz
# selten)
# Deren Genotypen für EGYPTREF mind. 1 Alt-Allel haben (wenn es 0/0 war, hab ich
# es weggelassen, um das VCF File möglichst klein zu machen; ändert aber nichts
# am Ergebnis)
# Deren Genotypen für EGYPTREF gecalled wurden
# Die „known“ sind, d.h. eine rsID besitzen
#
# Die Ergebnisse sind nun in der Datei
# /data/lied_egypt_genome/output_wgs/phasing/sanger/joined.txt zusammengefasst.
#
# Spalte: chr:pos:ref:alt in hg38 (mit „chr“ vor der Zahl UCSC-Schreibweise)
# Spalte: chr:pos:ref:alt in hg19 (ohne „chr“ vor der Zahl Ensembl-Schreibweise)
# Spalte: rsid aus der Variant-Calling-Datei
# Spalte: Sample-Name
# Unphased Genotypen
# Phased Genotypen
#
# Wahrscheinlich ist es am einfachsten, wenn du aus deinem gephasten VCF nun
# auch eine TSV-Datei erzeugst (mit „bcftools query“ geht das relativ einfach)
# und diese dann mit meiner Datei joinst.
#
# Als Phasing-Programm habe ich EAGLE mit dem HRC-Panel verwendet.
# Getting HRC reference panel information (Readme and sites)
rule get_hrc_readme:
output: "phasing_comparison/HRC.r1-1/README"
shell: "wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/README"
rule get_hrc_sites:
output: "phasing_comparison/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.vcf.gz",
"phasing_comparison/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz"
shell: "wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.vcf.gz; " + \
"wget -P phasing_comparison/HRC.r1-1/ ftp://ngs.sanger.ac.uk/production/hrc/HRC.r1-1/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz; "
rule link_10x_phased_variants:
input: "longranger_phasing/EGYPTREF/outs/phased_variants.vcf.gz"
output: "phasing_comparison/EGYPTREF_10x_phased_orig.vcf.gz"
shell: "ln -s ../{input} {output}"
# Normalize the 10x-phased variants, i.e. split multiallelic sites into
# multiple rows
# -m, --multiallelics -|+[snps|indels|both|any]
# split multiallelic sites into biallelic records (-) or join biallelic sites
# into multiallelic records (+). An optional type string can follow which
# controls variant types which should be split or merged together: If only SNP
# records should be split or merged, specify snps; if both SNPs and indels
# should be merged separately into two records, specify both; if SNPs and indels
# should be merged into a single record, specify any.
# This and the next bcftools command are the same that Matthias applied before
# uploading the variant data to the Sanger imputation server
rule split_multiallelic_sites:
input: "phasing_comparison/EGYPTREF_10x_phased_orig.vcf.gz"
output: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz"
shell: "bcftools norm -m-both -Ov {input} | bgzip -c > {output}; "+ \
"tabix {output}"
# Normalize the 10x-phased variants, i.e. split multiallelic sites into
# multiple rows
# Left-align and normalize indels, check if REF alleles match the reference,
# split multiallelic sites into multiple rows; recover multiallelics from
# multiple rows. Left-alignment and normalization will only be applied if the
# --fasta-ref option is supplied.
# Not needed???
#rule normalize_vcf:
# input: "phasing_comparison/EGYPTREF_10x_phased_nomultiallelic.vcf.gz"
# output: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz"
# shell: "bcftools norm -o {output} -Oz {input} "
rule link_sanger_phased_variants:
input: "/data/lied_egypt_genome/output_wgs/phasing/sanger/joined.txt"
output: "phasing_comparison/EGYPTREF_sanger_phased.txt"
shell: "ln -s {input} {output}"
rule txt_to_vcf:
input: "phasing_comparison/EGYPTREF_sanger_phased.txt"
output: "phasing_comparison/EGYPTREF_sanger_phased.vcf"
run:
with open(input[0],"r") as f_in, open(output[0],"w") as f_out:
f_out.write("##fileformat=VCFv4.2\n")
f_out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tEGYPTREF\n")
for line in f_in:
# Skip header
if line[:20] == "hg38_chr_pos_ref_alt":
continue
s = line.strip("\n").split(" ")
# Get hg38 chromosomes and coordinates
chrom,pos,ref,alt = s[0].split(":")
rsid = s[2]
unphased_gt = s[4]
phased_gt = s[5]
unphased_alleles = unphased_gt.split("/")
phased_alleles = phased_gt.split("|")
# Make sure phased alleles match unphased alleles
assert(((unphased_alleles[0]==phased_alleles[0]) and \
(unphased_alleles[1]==phased_alleles[1])) or \
((unphased_alleles[0]==phased_alleles[1]) and \
(unphased_alleles[1]==phased_alleles[0])))
f_out.write("\t".join([chrom,pos,rsid,ref,alt,'.','PASS','.', \
'GT',phased_gt])+"\n")
rule compress_sanger_phased:
input: "phasing_comparison/EGYPTREF_sanger_phased.vcf"
output: "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
shell: "cat {input} | vcf-sort | bgzip -c > {output}; "+ \
"tabix {output}"
# --diff-site
# Outputs the sites that are common / unique to each file. The output file has
# the suffix ".diff.sites_in_files".
rule compare_phasings_sites_in_file:
input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
"phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
output: "phasing_comparison/EGYPTREF_{chrom}.diff.sites_in_files"
params: out_base=lambda wildcards, output: output[0][:-20]
shell: "vcftools --gzvcf {input[0]} " + \
"--gzdiff {input[1]} " + \
"--out {params.out_base} " + \
"--chr {wildcards.chrom} " + \
"--diff-site "
# Deduce from diff.sites_in_file file the bi-alellic sites present in both
# imputations
rule get_sites_for_discordance:
input: "phasing_comparison/EGYPTREF_{chrom}.diff.sites_in_files"
output: "phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
shell: "cat {input} | grep -v ',' | grep -P '\tB\t' | cut -f 1,2 > {output}"
# --diff-site-discordance
# This option calculates discordance on a site by site basis. The resulting
# output file has the suffix ".diff.sites".
rule compare_phasings_site:
input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
"phasing_comparison/EGYPTREF_sanger_phased.vcf.gz"
output: "phasing_comparison/EGYPTREF_{chrom}.diff.sites"
params: out_base=lambda wildcards, output: output[0][:-11]
shell: "vcftools --gzvcf {input[0]} " + \
"--gzdiff {input[1]} " + \
"--out {params.out_base} " + \
"--chr {wildcards.chrom} " + \
"--diff-site-discordance "
# --diff-discordance-matrix
# This option calculates a discordance matrix. This option only works with
# bi-allelic loci with matching alleles that are present in both files. The
# resulting output file has the suffix ".diff.discordance.matrix".
rule compare_phasings_discordance_matrix:
input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
"phasing_comparison/EGYPTREF_sanger_phased.vcf.gz",
"phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
output: "phasing_comparison/EGYPTREF_{chrom}.diff.discordance_matrix"
params: out_base=lambda wildcards, output: output[0][:-24]
shell: "vcftools --gzvcf {input[0]} " + \
"--gzdiff {input[1]} " + \
"--out {params.out_base} " + \
"--positions {input[2]} " + \
"--diff-discordance-matrix "
# Get shared sites from 10x phasing
rule vcf_10x_files_sites_both:
input: "phasing_comparison/EGYPTREF_10x_phased.vcf.gz",
"phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
output: "phasing_comparison/EGYPTREF_10x_phased_{chrom}.vcf.gz"
shell: "vcftools --gzvcf {input[0]} " + \
"--stdout " + \
"--recode " + \
"--recode-INFO-all " + \
"--remove-indels " + \
"--non-ref-ac-any 1 " + \
"--positions {input[1]} | " + \
"bgzip -c > {output}; "+ \
"tabix {output}"
# Get shared sites from sanger phasing
rule vcf_sanger_files_site_both:
input: "phasing_comparison/EGYPTREF_sanger_phased.vcf.gz",
"phasing_comparison/EGYPTREF_{chrom}.sites_for_discordance"
output: "phasing_comparison/EGYPTREF_sanger_phased_{chrom}.vcf.gz"
shell: "vcftools --gzvcf {input[0]} " + \
"--stdout " + \
"--recode " + \
"--recode-INFO-all " + \
"--positions {input[1]} | "
"bgzip -c > {output}; "+ \
"tabix {output}"
# Merge Sanger_phasing to 10x phasing VCF file
rule compare_phasings:
input: "phasing_comparison/EGYPTREF_10x_phased_{chrom}.vcf.gz",
"phasing_comparison/EGYPTREF_sanger_phased_{chrom}.vcf.gz"
output: "phasing_comparison/EGYPTREF_{chrom}_concordant.txt",
"phasing_comparison/EGYPTREF_{chrom}_discordant.txt",
"phasing_comparison/EGYPTREF_{chrom}_nonmatching.txt"
run:
vars = {}
lines = {}
with gzip.open(input[0],"r") as f_10x:
for line in f_10x:
line = line.decode()
if line[0] == '#':
continue
s = line.strip("\n").split("\t")
chrom = s[0]
pos = s[1]
ref = s[3]
alt = s[4]
gt = s[9].split(':')[0]
vars["\t".join([chrom,pos,ref,alt])] = gt
lines["\t".join([chrom,pos,ref,alt])] = line
with gzip.open(input[1],"r") as f_sanger, \
open(output[0],"w") as f_concordant, \
open(output[1],"w") as f_discordant:
for line in f_sanger:
line = line.decode()
if line[0] == '#':
continue
s = line.strip("\n").split("\t")
chrom = s[0]
pos = s[1]
ref = s[3]
alt = s[4]
gt = s[9].split(':')[0]
gt_10x = vars["\t".join([chrom,pos,ref,alt])]
line_10x = lines["\t".join([chrom,pos,ref,alt])]
print(gt)
print(gt_10x)
if gt == gt_10x:
f_concordant.write(line_10x)
else:
f_discordant.write(line_10x.strip("\n")+"\t"+line)
del lines["\t".join([chrom,pos,ref,alt])]
with open(output[2],"w") as f_nonmatching:
for line in list(lines):
f_nonmatching.write(line)
rule compare_phasings_all:
input: expand("phasing_comparison/EGYPTREF_{chrom}.{diff_type}", \
diff_type=["diff.sites_in_files","diff.sites", \
"diff.discordance_matrix"], \
chrom=["chr"+str(x) for x in range(1,23)]), \
expand("phasing_comparison/EGYPTREF_{chrom}_concordant.txt", \
chrom=["chr"+str(x) for x in range(1,23)])