-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathedta.py
119 lines (104 loc) · 2.83 KB
/
edta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import string
VALID_CHARS = set([c for c in string.ascii_letters] + [c for c in string.digits])
class DNA():
def __init__(self, name='', info='', content=None):
self.name = name
self.info = info
self.content = content
def __repr__(self):
return "name: {}\ninfo: {}\ncontent: {}".format(self.name, self.info, self.content)
def add_new_DNA(dna_list, line):
assert line[0] == '>'
first_space_idx = line.find(' ')
if first_space_idx != -1:
dna_name = line[1:first_space_idx]
dna_info = line[first_space_idx:].strip()
else:
dna_name = line[1:]
dna_info = ''
dna_list.append(DNA(name=dna_name, info=dna_info, content=[]))
def add_line_to_DNA(cur_DNA, line):
for x in line:
if x in VALID_CHARS:
cur_DNA.content.append(x)
elif x == ' ':
continue
else:
raise Exception()
def parse_FASTA(file):
"""
Basic state machine for parsing.
0 = Expecting '>' or empty line.
1 = Expecting valid string for current DNA or empty line.
2 = Got at least one string for current DNA. Ready for new DNA
or continue current DNA.
"""
state = 0
dna_list = []
for line in file:
line = line.strip()
if state == 0:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
elif line == '':
continue
else:
raise Exception()
elif state == 1:
add_line_to_DNA(dna_list[-1], line)
state = 2
elif state == 2:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
else:
add_line_to_DNA(dna_list[-1], line)
else:
raise Exception()
file.seek(0)
return dna_list
with open("rosalind_edta.txt") as file:
dna_list = parse_FASTA(file)
s, t = [x.content for x in dna_list]
gamma = 1
match_cost = 0
mismatch_cost = 1
dists = [[d for d in range(len(s) + 1)]] + [[None for j in range(len(s) + 1)] for i in range(len(t))]
for i in range(1, len(t) + 1):
dists[i][0] = gamma * i
for j in range(1, len(s) + 1):
dists[i][j] = min(dists[i - 1][j - 1] + (match_cost if s[j - 1] == t[i - 1] else mismatch_cost), dists[i - 1][j] + gamma, dists[i][j - 1] + gamma)
print(dists[-1][-1])
s_alig = []
t_alig = []
i = len(t)
j = len(s)
while i > 0 and j > 0:
insertion_val = dists[i - 1][j] + gamma
deletion_val = dists[i][j - 1] + gamma
subst_val = dists[i - 1][j - 1] + (match_cost if s[j - 1] == t[i - 1] else mismatch_cost)
min_val = min(subst_val, insertion_val, deletion_val)
if subst_val == min_val:
i -= 1
j -= 1
s_alig.append(s[j])
t_alig.append(t[i])
elif deletion_val == min_val:
j -= 1
s_alig.append(s[j])
t_alig.append('-')
elif insertion_val == min_val:
i -= 1
s_alig.append('-')
t_alig.append(t[i])
else:
raise Exception()
if i > 0:
t_alig += ['-'] * i
i = 0
if j > 0:
s_alig += ['-'] * j
j = 0
print(''.join(reversed(s_alig)))
print(''.join(reversed(t_alig)))