-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenome.py
243 lines (208 loc) · 10.5 KB
/
Genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# python3 Genome.py
from lp_utils import translate, generate_all_paths
import random
#from drawing_paths import draw_path, draw_lattice
class Genome:
"""
Genome(or Chromosome) class. This is the blueprint of an individual and mainly consists of a set of paths. Given that all the
paths generated by generate_all_paths in lp_utils.py are in lexicographical order, every path has a specific index. Hence to
minimize memory usage and facilitate operations, the set of paths is just going to be the set of their respective indices.
To improve evolution, we don't want individuals to have repeated paths. Hence, to avoid that, each individual has the set of all paths
stored in take_paths. We pick the Sequences(paths) for that individual from the list take_paths and during mutation, we also ensure that
only paths available in take_paths are taken for substiution.
"""
def __init__(self, num_sequences, m, n, k, paths, len_paths, empty=False):
"""
Constructor of individual(Genome or Chromosome).
*paths is a paramter, but we don't need paths here though. However, the program was originally written this way.
empty is a boolean parameter that tells the program when to generate generic individuals(with invalid/empty paths) to be used later or
to create an actual individual with randommly generated valid paths.
"""
self.m = m
self.n = n
self.k = k
self.num_sequences = num_sequences
self.sequences = [] # Initialize the list of sequences in the genome.
# Generate a list of indices for the available paths.
self.take_paths = [
i for i in range(len_paths)
] # list of all paths(indices actually)
# self.poison()
self.translated = None # To store the paths in human readable form
self._generate_sequences(paths, len_paths, empty)
def _generate_sequences(self, paths, len_paths, empty):
""" "
Private method to generate random sequences(paths) for an individual(or genome).
"""
# Generate the sequences in the genome based on the value of the empty parameter.
if not empty: # Generate valid paths
for i in range(self.num_sequences):
r = random.randint(
0, len_paths - i - 1
) # Choose a random index from the list of available paths.
self.sequences.append(
self.take_paths[r]
) # Add the new sequence to the genome, and remove the chosen path from the list of available paths.
self.take_paths.pop(r)
else:
# If the empty parameter is True, generate empty sequences.
for i in range(self.num_sequences):
self.sequences.append(-1)
def fitness(self, dict_equivalences, penalty_indexes=True):
"""
Method to calculate the fitness of the genome:
We compare each Sequence to every other sequence and return the number of Sequences that were k-distinct and the
number of those that were k-equivalent.
If the penalty_indexes argument is True, then we also store the indices of the Sequences that were k-equivalent.
The comparison:
dict_equivalences contains a dictionary where all the paths have already been compared to each other.
Each key in the dictionary represents a path's index, and the values are the indices of the paths with which that
key is k-equivalent.
"""
penalty = 0 # To count the number of Sequence comparisons of the Genome object in which the sequences are k-equivalent.
distinct = 0 # To count the number of Sequence comparisons of the Genome object in which the sequences are k-distinct.
if (
penalty_indexes
): # If the penalty_indexes parameter is True, calculate the penalty and the penalty index.
penalty_index = (
[]
) # To store the indices of Sequences that are k-equivalent.
for i in range(self.num_sequences):
for j in range(i + 1, self.num_sequences):
if (
self.sequences[j] in dict_equivalences[str(self.sequences[i])]
): # are k-equivalent
penalty += 1
penalty_index.append([i, j])
#else:
#distinct += 1
if penalty == 0:
# If the penalty is 0, return a large value and the empty penalty index.
return (9999, penalty_index)
# Return the number of k-distinct comparisons and the indexes of k-equivalent comparisons.
#return (distinct, penalty_index)
return (1/penalty, penalty_index)
else:
# If the penalty_indexes parameter is False, only calculate the penalty.
for i in range(self.num_sequences):
for j in range(i + 1, self.num_sequences):
if self.sequences[j] in dict_equivalences[str(self.sequences[i])]:
penalty += 1
#else:
#distinct += 1
if penalty == 0:
# If the penalty is 0, return a large value.
return 9999
#return distinct
return 1/penalty
def divert(self, other):
"""
This method is not used throughout the program, it was initially meant to
calculate how much one Genome differs from another.
"""
same = 0
count = 0
len_seq = len(self.sequences)
# Compare each sequence in the genome to each sequence in the other genome.
for i in range(len_seq):
for j in range(len_seq):
# Check if the sequences at the given indices are the same.
same += self.sequences[i].same_paths(other.sequences[j])
if same == 0:
# If the sequences are different, increment the count.
count += 1
else:
# If the sequences are the same, reset the different variable.
same = 0
# Return the ratio of sequences that are different.
return count / len_seq
def show(self, paths):
"""
Method to print the Sequences' paths of a genome to the console in order(with respect to their indices).
"""
self.sequences.sort()
for seq in self.sequences:
for term in paths[seq]:
print(term[0], term[1], term[2], sep=" ", end=" ")
print(f"pi:{seq} \n")
def mutate(self, dict_equivalences):
"""
Mutate the genome by replacing one of its sequences with a new sequence. The take_paths list contains
the available paths.
Also, when we remove one path from the sequences list, we have to put it back among
the available paths(take_paths).
"""
index_eq = self.fitness(dict_equivalences)[
-1
] # Get the indices of k-equivalent sequences.
i = random.randint(
0, len(self.take_paths) - 1
) # Choose a random index from the list of available paths.
if (
index_eq
): # If the penalty index is not empty, choose a random index from the penalty index.
r = index_eq[random.randint(0, len(index_eq) - 1)][random.randint(0, 1)]
# The following order of commands is critical.
self.take_paths.append(self.sequences[r])
self.sequences[r] = self.take_paths[
i
] # Replace the sequence at the chosen index with a new sequence using the chosen path.
self.take_paths.pop(i)
return
def nmutate(self, dict_equivalences):
"""
nmutate meaning normal mutate. Similar to mutate(), but here the sequence to be changed is selected at random, even if the sequence
was k-distinct from every other sequence.
"""
index_eq = len(
self.fitness(dict_equivalences)[-1]
) # this list will just be used to check if mutation is needed. If empty, then Genome is perfect.
if index_eq:
r = random.randint(0, self.num_sequences - 1)
i = random.randint(0, len(self.take_paths) - 1)
self.take_paths.append(self.sequences[r])
self.sequences[r] = self.take_paths[
i
] # does the order matter? no, since the last path index was appended at the end of self.take_paths
self.take_paths.pop(i)
return
def smutate(self, dict_equivalences, l):
"""
smutate meaning special mutate. Similar to mutate(), but here the sequence to be changed is selected at random, even if the sequence
was k-distinct from every other sequence. Also, the sequences may repeat themselves, in contrast to nmutate() and mutate() where the mutation
always made sure not to repeat paths.
"""
index_eq = len(self.fitness(dict_equivalences)[-1])
if index_eq:
r = random.randint(0, self.num_sequences - 1)
self.sequences[r] = random.randint(0, l - 1)
return
def translate(self, paths):
"""
Method to translate path from the 3-digit term format to the Alphabetical format(EN...EEN).
"""
itoc = {0:"E",1:"N"}
translated = []
for sequence in self.sequences:
translated.append(translate(sequence, "to_A", paths))
for index,list_of_terms in enumerate(translated):
for index2, term in enumerate(list_of_terms):
translated[index][index2] = itoc[term]
self.translated = translated
return translated
def draw(self):
"""
Method to visualize the lattice and paths.
"""
from drawing_paths import draw_path, draw_lattice
paths = generate_all_paths(self.m, self.n)
draw_lattice(self.m, self.n)
o = 40 / self.num_sequences
i = -0.5 * self.num_sequences
for j in range(len(self.sequences)):
draw_path(
translate(paths[self.sequences[j]], "to_A"), self.m, self.n, o * i, j
)
i += 1
def __repr__(self):
return f"Genome(num_genes = {self.num_sequences}, m = {self.m}, n = {self.n}, k= {self.k})"