-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
130 lines (109 loc) · 4.57 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/env python3.5
# -*- coding: utf-8 -*-
__author__ = "Tobias Tilly"
__name__ = "main"
'''
The main module holds all other modules and controls the workflow.
ToDo:
this could be extended for a nice CLI (with docopt)
'''
import argparse
import sys
import os
import wrapper
import parser
import output
import downloader
# output-directories
OUTPUT_BCFTOOLS = "bcftools-output/"
OUTPUT_VCFTOOLS = "vcftools-output/"
def shallow_analysis(file_path):
bcf_stats = wrapper.get_bcf_stats(file_path)
stat_summary = parser.analyze_bcf_stats(bcf_stats, file_path)
output.print_statistic_summary(stat_summary)
def create_intersection(files, output_dir):
wrapper.get_intersection(files, output_dir)
def create_pop_stats(pop_1k_file, pop_gnomad_file, population):
bcf_stats_1k_pop = wrapper.get_stats_with_bins(0.0,1.0, 0.05, pop_1k_file)
with open(OUTPUT_BCFTOOLS+"bcf_stats_1k_{}".format(population), 'w') as f:
f.write(bcf_stats_1k_pop)
bcf_stats_gnomad_pop = wrapper.get_stats_with_bins(0.0,1.0, 0.05, pop_gnomad_file)
with open(OUTPUT_BCFTOOLS+"bcf_stats_gnomad_{}".format(population), 'w') as f:
f.write(bcf_stats_gnomad_pop)
def plot_af_stats_comparison(population, output_file, maf=False):
with open(OUTPUT_BCFTOOLS + "bcf_stats_1k_{}".format(population) , 'r') as f:
bcf_stats = f.read()
AF_1k = parser.extract_allele_frequencies(bcf_stats)
with open(OUTPUT_BCFTOOLS + "bcf_stats_gnomad_{}".format(population), 'r') as f:
bcf_stats = f.read()
AF_gnomad = parser.extract_allele_frequencies(bcf_stats)
#output.plot_allele_frequencies(AF_1k_ '1000Genomes')
output.plot_allele_frequency_comparison(AF_1k, AF_gnomad, ['1000Genomes','GnomAD'], output_file, maf)
def subset_filtered_list(file):
population_list = open('data/1k_phase3.panel', 'r')
eur_list = wrapper.filter_list(population_list, 'EUR')
eur_column = [x.split("\t")[0] for x in eur_list ]
nonFin_list = wrapper.filter_list(eur_list, "FIN", True)
nonFin_column = [x.split("\t")[0] for x in nonFin_list ]
wrapper.wrap_vcf_subset(file, eur_column, "data/eur_1k")
wrapper.wrap_vcf_subset(file, nonFin_column, "data/nonFin_1k")
def hwe_analysis(file_path):
out = wrapper.get_hwe_count(file_path, '0.05', '0.5')
kept, total = parser.get_hwe_quality(out)
output.print_hwe_stat(kept, total)
def initial_download(chromosome):
downloader.download_1k(chromosome)
downloader.download_1k_panel()
downloader.download_gnomad(chromosome)
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--download',
help='starts downloading the 1k and gnomad files for the given chromosome')
arg_parser.add_argument('--files', nargs='+', help='takes 2 files for the full analysis(first 1k, then gnomad)')
arg_parser.add_argument('--summary', help='takes one file for a statistic summary')
args = arg_parser.parse_args()
# checking if the data directory exists, which is used in most of the workflows
if not os.path.exists('data'):
os.makedirs('data')
if not os.path.exists(OUTPUT_BCFTOOLS):
os.makedirs(OUTPUT_BCFTOOLS)
if not os.path.exists(OUTPUT_VCFTOOLS):
os.makedirs(OUTPUT_VCFTOOLS)
if args.download:
initial_download(args.download)
if args.summary:
if os.path.isfile(args.summary):
shallow_analysis(args.summary)
else:
print("The file {} does not exist.".format(args.summary))
sys.exit(1)
elif args.files:
if len(args.files) > 2:
print("Only 2 files are allowed. You gave: {}".format(len(args.files)))
sys.exit(1)
else:
for file in args.files:
if os.path.isfile(file):
shallow_analysis(file)
else:
print("The file {} does not exist.".format(file))
sys.exit(1)
print("Creating EUR and nonFIN-EUR Subsets...")
subset_filtered_list(args.files[0])
print("Subsets created!")
print("Intersecting filtered 1k_EUR with gnomAD...")
create_intersection(["data/eur_1k_subset_output.vcf.gz", args.files[1]], "data/isec_eur")
print("EUR Intersection created!")
print("Creating EUR stats...")
create_pop_stats("data/isec_eur/0000.vcf", "data/isec_eur/0001.vcf", 'EUR')
print("Plotted EUR stats! Saved as 'EUR_af_stats.png'")
plot_af_stats_comparison('EUR', 'EUR_af_stats.png')
print("Intersecting filtered 1k_nonFIN with gnomAD...")
create_intersection(["data/nonFin_1k_subset_output.vcf.gz", args.files[1]], "data/isec_nonFin")
print("nonFin Intersection created!")
print("Creating nonFin stats...")
print("Stats created!")
create_pop_stats("data/isec_nonFin/0000.vcf", "data/isec_nonFin/0001.vcf", 'nonFin')
plot_af_stats_comparison('nonFin', 'nonFin_af_stats.png')
print("Plotted nonFin stats! Saved as 'nonFin_af_stats.png'")
hwe_analysis("data/isec_nonFin/0000.vcf")
print("Full analysis: DONE!")