-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseq_read_accu.py
109 lines (99 loc) · 3.92 KB
/
seq_read_accu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import csv
import pandas as pd
from pandas import Series
from preprocess_SNP import Output, Read
from preprocess_seq import seq_standardize
def seq_read(output,result_file):
n = output.n # k为上下游单侧范围取值
files = os.listdir('data_std')
with open(result_file, 'w',newline='') as csvfile:
w = csv.writer(csvfile)
w.writerow(['id','sequence','mutated nucleotide','location','match range(oneside)'])
for filename in files:
d_for = {}
d_res = {}
ser = pd.read_csv('data_std/' + filename)
clo_list = ser.columns
# print(clo_list)
# print(type(ser[clo_list[1]]))
whole_gene = pd.Series(ser[clo_list[1]].values,index=ser[clo_list[0]].values)
# print(whole_gene.items)
feature = clo_list[1] # feature('+'/'-'):全基因组片段为正链或反
if feature == '+':
d_for = output.dict1
d_res = output.dict2
elif feature == '-':
d_for = output.dict3
d_res = output.dict4
# print("dict")
d_for_keys = list(d_for.keys())
d_for_vals = list(d_for.values())
d_res_keys = list(d_res.keys())
d_res_vals = list(d_res.values())
for id, seq in whole_gene.items():
result = []
# id:全基因组中的片段编号
# seq:id所对应的片段序列,待处理
ln = len(seq)
# 取正链
number = 0
for i in range(0, ln - 2*n):
number += 1
# print('i:'+str(i))
seq_for = seq[i:i + n]
# 取正链(forward strand)长度为n,反链(reverse strand)
seq_for_bin = output.Encode(seq_for)
seq_for_Dec = output.binToDec(seq_for_bin)
# 在正链突变位点字典中寻找并返回
flag_0 = 1
if seq_for_Dec in d_for_vals:
ind1 = d_for_vals.index(seq_for_Dec)
key = d_for_keys[ind1]
flag_0 = 0
# for name_for,value_for in d_for.items():
# if seq_for_Dec == value_for:
# key = name_for
# flag_0 = 0
if flag_0:
# print('no find*'+str(number))
continue
# print('after key\n')
# key = 'name + up'
key_list = key.split(feature)
key_rev = key_list[0] + feature +' down'
flag_1 = 1
ind2 = d_res_keys.index(key_rev)
value_rev = d_res_vals[ind2]
flag_1 = 0
# for name_res,value_res in d_res.items():
# if key_rev == name_res:
# value_rev = value_res
# flag_1 = 0
# break
if flag_1:
continue
seq_rev = seq[i+n+1:i+2*n+1]
seq_rev_bin = output.Encode(seq_rev)
seq_rev_Dec = output.binToDec(seq_rev_bin)
if value_rev == seq_rev_Dec:
# 匹配成功
result = [id,key_list[0],seq[i+n],str(i+n+1),str(n)]
# 全基因组片段突变点位置信息str(i+n+1)从1开始
# print('1')
with open(result_file,'a+',newline = '') as csvfile:
w = csv.writer(csvfile)
w.writerow(result)
break
if __name__ == '__main__':
result_file = 'result_accu.csv' # 匹配结果文件位置
SNP_file = 'SNP/20230309--SNPs.xls' # SNP文件位置
f = open('requirement_accu.txt')
n = int(f.readline())
table = Read(SNP_file)
output = Output(table,n)
output.cutOut()
print('end of cutout')
# seq_standardize()
seq_read(output,result_file)
print('end of matching')