forked from hugang123/Dxy
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDxy_calculate
205 lines (183 loc) · 8.32 KB
/
Dxy_calculate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python
# coding=utf-8
# python3
import gzip
import re
import math
import sys
from argparse import ArgumentParser,RawTextHelpFormatter
from datetime import datetime
def read_file_gz(gzfile):
open_file = gzip.open(gzfile, 'rb')
for i in open_file:
line = str(i, encoding='utf8')
if line and not line.startswith('#'):
#print(line)
yield (line.rstrip('\r\n'))
open_file.close()
# 读取文件最后一行,获取该染色体最大的pos,在窗口滑动时有作用
def read_last_line(gzfile):
open_file = gzip.open(gzfile, 'rb')
first_line = open_file.readline() # 读第一行
max_pos = 0
off = -50 # 设置偏移量
while True:
open_file.seek(off, 2) # seek(off, 2)表示文件指针:从文件末尾(2)开始向前50个字符(-50)
lines = open_file.readlines() # 读取文件指针范围内所有行
if len(lines) >= 2: # 判断是否最后至少有两行,这样保证了最后一行是完整的
#max_pos += int(str(lines[-1], encoding='utf8').split("\t")[1]) # 取最后一行
max_pos += int(str(lines[-1], encoding='utf8').split("\t")[1]) # 取最后一行
break
# 如果off为50时得到的readlines只有一行内容,那么不能保证最后一行是完整的
# 所以off翻倍重新运行,直到readlines不止一行
off *= 2
#print(max_pos)
return max_pos
# read_pop_file(file)[0] 为popA [1]为popB
def read_pop_file(pop_file):
open_file = open(pop_file, "r")
popA = re.split("\s+", open_file.readline().split(":")[1].rstrip("\n"))
popB = re.split("\s+", open_file.readline().split(":")[1].rstrip("\n"))
open_file.close()
return popA, popB
def get_pop_index(gzfile, popfile):
all_pop = []
pop2 = read_pop_file(popfile)
open_file = gzip.open(gzfile, 'rb')
for i in open_file:
line = str(i, encoding='utf8')
if line and line.startswith('#CHROM'):
all_pop.extend(re.split("\s+",line.strip()))
# print(all_pop)
# print(len(all_pop))
break
open_file.close()
popA_index = [all_pop.index(i) for i in pop2[0]]
popB_index = [all_pop.index(j) for j in pop2[1]]
# print([popA_index,popB_index])
return popA_index, popB_index
def log(*args):
print(*args)
# 设置滑动窗口,该postion即为last line中最大的
def slide_window(window, step, position):
window_start = 1
while window_start < position:
if int(window_start + window -1) <= position:
# print(100/int(window))
yield int(window_start), int(window_start + window - 1)
elif int(window_start + window -1) > position:
window = int(position) - int(window_start) + 1
# print(100 / int(window))
yield int(window_start), int(position)
break
window_start += step
def get_pop_alles_freq(gzfile, popfile):
popA_index, popB_index = get_pop_index(gzfile, popfile)
popAB_index = []
popAB_index.extend(popA_index)
popAB_index.extend(popB_index)
# print(popAB_index)
Number_popA = len(popA_index)
Number_popB = len(popB_index)
AF_info = []
for line in read_file_gz(gzfile):
#print(line)
# 记录信息的列表
a = []
# 切分vcf,获取关键信息
#array = line.split("\t")
array = re.split("\s+",line)
# print(array)
# 记录染色体、位置等相关信息
chr, pos = array[0], int(array[1])
# 获取所有样品的基因型
all_genotype = [i.split(":")[0] for i in array[9:]]
# 从所有样品的基因型中抽提出我们测试的两个群体的基因型
pop_genotype = [all_genotype[i - 9] for i in popAB_index]
# 计算每个位点的AF
popA_ref = [pop_genotype[:Number_popA].count(i) for i in ['0/0', '0/1', '1/1']]
# print(popA_ref)
popA_Ref_freq = (2 * popA_ref[0] + 1 * popA_ref[1]) / (2 * sum(popA_ref)) if sum(popA_ref) > 0 else ""
# print(popA_Ref_freq)
popB_ref = [pop_genotype[Number_popA:].count(i) for i in ['0/0', '0/1', '1/1']]
# print(popB_ref)
popB_Ref_freq = (2 * popB_ref[0] + 1 * popB_ref[1]) / (2 * sum(popB_ref)) if sum(popB_ref) > 0 else ""
# print(popB_Ref_freq)
a.extend([chr, pos, popA_Ref_freq, popB_Ref_freq])
# 判断混合群体是否存在多样性,排除"./."之外,不少于一中基因型
# print(pop_genotype)
myset = set(pop_genotype)
if "./." in pop_genotype:
myset.remove("./.")
if len(myset) >= 2:
a.append(1)
else:
a.append(0)
AF_info.append(a)
# print(AF_info)
return AF_info
def calculate_dxy(gzfile, popfile, window, step, outfile):
open_file = open(outfile, 'w')
open_file.write('chrom\twindow_start\twindow_end\tN_SNP\tDxy\n')
max_pos = read_last_line(gzfile) # 获取最大pos
AF_info = get_pop_alles_freq(gzfile, popfile)
chr = AF_info[0][0]
# 滑动窗口
for start, end in slide_window(window, step, max_pos):
a = 0 # 多态性位点
b = 0 # 记录行数
Ja = Jb = Jab = 0 # 记录杂合度
Dxy = ''
print([start,end])
for line in AF_info:
print(line)
if int(start) <= line[1] < int(start) + int(step):
b += 1
# print(b)
if int(start) <= line[1] <= int(end):
# if line[4] == 1 and bool (line[2]) and bool(line[3]),bool(0)为false,易错
if line[4] == 1 and str(line[2]).count(".") == 1 and str(line[3]).count("."):
a += 1
Ja += (2 * line[2] * (1 - line[2]))
Jb += (2 * line[3] * (1 - line[3]))
Jab += (1 + 2*line[2]*line[3]-line[2]-line[3])
# Jab += line[2] * line[3] +(1-line[2])(1-line[3])
# print([Ja,Jb,Jab])
continue
elif line[1] > int(end):
del AF_info[0:b]
break
print(a)
if all([a, Ja, Jb]) > 0:
JA = 1 - Ja / window
JB = 1 - Jb / window
JAB = (Jab + window - a) / window
print(JA,JB,JAB)
print(JAB/math.sqrt(JA * JB))
print(math.log(JAB/math.sqrt(JA * JB),10))
print(math.log(JAB/math.sqrt(JA * JB),math.e))
Dxy = bool(Dxy) - math.log(JAB / math.sqrt(JA * JB), math.e)
else:
Dxy = Dxy + "NA"
open_file.write("{}\t{}\t{}\t{}\t{}\n".format(chr, start, end, a, Dxy))
open_file.close()
def get_para(parameters):
parser=ArgumentParser(formatter_class=RawTextHelpFormatter,description='Calculate Dxy form the vcf(.gz) between the two populations.',epilog='Contact information:\n\tName: hugang\n\tEmail: hugang_biology@163.com')
parser.add_argument('-v',dest="gzvcf",type=str,required=True,metavar='vcf',help="vcf file with gz format")
parser.add_argument('-p',dest="poplist",type=str,required=True,metavar='poplist',help="input the two populations file,each row represent one pop \nExample\npop1:Sampl1\tsample2\npop2:Sample3\tSample4")
parser.add_argument('-w',dest='window',default=100000,type=int,required=False,metavar='window [default:100000]',help="window size")
parser.add_argument('-s',dest='step',default=50000,type=int,required=False,metavar='step [default:50000]',help="step size")
parser.add_argument('-o',dest='output',default='Dxy.txt',type=str,required=False,metavar='output [default:Dxy.txt]',help="output file")
return vars(parser.parse_args(parameters))
# vars() 函数返回对象object的属性和属性值的字典对象
def main():
parameters = get_para(sys.argv[1:])
calculate_dxy(parameters['gzvcf'],parameters['poplist'],parameters['window'],parameters['step'],parameters['output'])
if __name__ == "__main__":
startTime = datetime.now()
main()
endTime = datetime.now()
time_consume = (endTime - startTime).seconds
log("startTime", endTime, "\n", "endTime", endTime, "\n", "time_consume", time_consume, "\n")
# 思路,还是先计算每个位点的AF,然后再计算
# 结果为NA,两种情况,第一窗口内无SNP;第二种情况,某一群体在该窗口所有位点为纯合,且为同一种基因型,常见于SNP数目少的情况