Genome.py

# python3 Genome.py
from lp_utils import translate, generate_all_paths
import random
#from drawing_paths import draw_path, draw_lattice


class Genome:
    """
    Genome(or Chromosome) class. This is the blueprint of an individual and mainly consists of a set of paths. Given that all the
    paths generated by generate_all_paths in lp_utils.py are in lexicographical order, every path has a specific index. Hence to
    minimize memory usage and facilitate operations, the set of paths is just going to be the set of their respective indices.

    To improve evolution, we don't want individuals to have repeated paths. Hence, to avoid that, each individual has the set of all paths
    stored in take_paths. We pick the Sequences(paths) for that individual from the list take_paths and during mutation, we also ensure that
    only paths available in take_paths are taken for substiution.
    """

    def __init__(self, num_sequences, m, n, k, paths, len_paths, empty=False):
        """
        Constructor of individual(Genome or Chromosome).
        *paths is a paramter, but we don't need paths here though. However, the program was originally written this way.

        empty is a boolean parameter that tells the program when to generate generic individuals(with invalid/empty paths) to be used later or
        to create an actual individual with randommly generated valid paths.
        """
        self.m = m
        self.n = n
        self.k = k
        self.num_sequences = num_sequences
        self.sequences = []  # Initialize the list of sequences in the genome.
        # Generate a list of indices for the available paths.
        self.take_paths = [
            i for i in range(len_paths)
        ]  # list of all paths(indices actually)
        # self.poison()
        self.translated = None # To store the paths in human readable form
        self._generate_sequences(paths, len_paths, empty)

    def _generate_sequences(self, paths, len_paths, empty):
        """ "
        Private method to generate random sequences(paths) for an individual(or genome).
        """
        # Generate the sequences in the genome based on the value of the empty parameter.
        if not empty:  # Generate valid paths
            for i in range(self.num_sequences):
                r = random.randint(
                    0, len_paths - i - 1
                )  # Choose a random index from the list of available paths.

                self.sequences.append(
                    self.take_paths[r]
                )  # Add the new sequence to the genome, and remove the chosen path from the list of available paths.
                self.take_paths.pop(r)
        else:
            # If the empty parameter is True, generate empty sequences.
            for i in range(self.num_sequences):
                self.sequences.append(-1)

    def fitness(self, dict_equivalences, penalty_indexes=True):
        """
        Method to calculate the fitness of the genome:
           We compare each Sequence to every other sequence and return the number of Sequences that were k-distinct and the
           number of those that were k-equivalent.
        If the penalty_indexes argument is True, then we also store the indices of the Sequences that were k-equivalent.

        The comparison:
           dict_equivalences contains a dictionary where all the paths have already been compared to each other.
           Each key in the dictionary represents a path's index, and the values are the indices of the paths with which that
           key is k-equivalent.
        """

        penalty = 0  # To count the number of Sequence comparisons of the Genome object in which the sequences are k-equivalent.

        distinct = 0  # To count the number of Sequence comparisons of the Genome object in which the sequences are k-distinct.

        if (
            penalty_indexes
        ):  # If the penalty_indexes parameter is True, calculate the penalty and the penalty index.
            penalty_index = (
                []
            )  # To store the indices of Sequences that are k-equivalent.

            for i in range(self.num_sequences):
                for j in range(i + 1, self.num_sequences):
                    if (
                        self.sequences[j] in dict_equivalences[str(self.sequences[i])]
                    ):  # are k-equivalent
                        penalty += 1
                        penalty_index.append([i, j])
                    #else:
                        #distinct += 1
            if penalty == 0:
                # If the penalty is 0, return a large value and the empty penalty index.
                return (9999, penalty_index)

            # Return the number of k-distinct comparisons and the indexes of k-equivalent comparisons.
            #return (distinct, penalty_index)
            return (1/penalty, penalty_index)
        else:
            # If the penalty_indexes parameter is False, only calculate the penalty.
            for i in range(self.num_sequences):
                for j in range(i + 1, self.num_sequences):
                    if self.sequences[j] in dict_equivalences[str(self.sequences[i])]:
                        penalty += 1

                    #else:
                        #distinct += 1

            if penalty == 0:
                # If the penalty is 0, return a large value.
                return 9999

            #return distinct
            return 1/penalty

    def divert(self, other):
        """
        This method is not used throughout the program, it was initially meant to
        calculate how much one Genome differs from another.
        """

        same = 0
        count = 0
        len_seq = len(self.sequences)

        # Compare each sequence in the genome to each sequence in the other genome.
        for i in range(len_seq):
            for j in range(len_seq):
                # Check if the sequences at the given indices are the same.
                same += self.sequences[i].same_paths(other.sequences[j])

            if same == 0:
                # If the sequences are different, increment the count.
                count += 1
            else:
                # If the sequences are the same, reset the different variable.
                same = 0

        # Return the ratio of sequences that are different.
        return count / len_seq

    def show(self, paths):
        """
        Method to print the Sequences' paths of a genome to the console in order(with respect to their indices).
        """
        self.sequences.sort()
        for seq in self.sequences:
            for term in paths[seq]:
                print(term[0], term[1], term[2], sep=" ", end="   ")
            print(f"pi:{seq} \n")

    def mutate(self, dict_equivalences):
        """
        Mutate the genome by replacing one of its sequences with a new sequence. The take_paths list contains
        the available paths.
        Also, when we remove one path from the sequences list, we have to put it back among
        the available paths(take_paths).
        """

        index_eq = self.fitness(dict_equivalences)[
            -1
        ]  # Get the indices of k-equivalent sequences.
        i = random.randint(
            0, len(self.take_paths) - 1
        )  # Choose a random index from the list of available paths.

        if (
            index_eq
        ):  # If the penalty index is not empty, choose a random index from the penalty index.
            r = index_eq[random.randint(0, len(index_eq) - 1)][random.randint(0, 1)]
            # The following order of commands is critical.
            self.take_paths.append(self.sequences[r])
            self.sequences[r] = self.take_paths[
                i
            ]  # Replace the sequence at the chosen index with a new sequence using the chosen path.
            self.take_paths.pop(i)

        return

    def nmutate(self, dict_equivalences):
        """
        nmutate meaning normal mutate. Similar to mutate(), but here the sequence to be changed is selected at random, even if the sequence
        was k-distinct from every other sequence.
        """

        index_eq = len(
            self.fitness(dict_equivalences)[-1]
        )  # this list will just be used to check if mutation is needed. If empty, then Genome is perfect.

        if index_eq:
            r = random.randint(0, self.num_sequences - 1)
            i = random.randint(0, len(self.take_paths) - 1)

            self.take_paths.append(self.sequences[r])
            self.sequences[r] = self.take_paths[
                i
            ]  # does the order matter? no, since the last path index was appended at the end of self.take_paths
            self.take_paths.pop(i)
        return

    def smutate(self, dict_equivalences, l):
        """
        smutate meaning special mutate. Similar to mutate(), but here the sequence to be changed is selected at random, even if the sequence
        was k-distinct from every other sequence. Also, the sequences may repeat themselves, in contrast to nmutate() and mutate() where the mutation
        always made sure not to repeat paths.
        """
        index_eq = len(self.fitness(dict_equivalences)[-1])
        if index_eq:
            r = random.randint(0, self.num_sequences - 1)
            self.sequences[r] = random.randint(0, l - 1)

        return

    def translate(self, paths):
        """
        Method to translate path from the 3-digit term format to the Alphabetical format(EN...EEN).
        """
        itoc = {0:"E",1:"N"}
        translated = []
        for sequence in self.sequences:
            translated.append(translate(sequence, "to_A", paths))
        for index,list_of_terms in enumerate(translated):
            for index2, term in enumerate(list_of_terms):
                translated[index][index2] = itoc[term]
        self.translated = translated
        return translated

    def draw(self):
        """
        Method to visualize the lattice and paths.
        """
        from drawing_paths import draw_path, draw_lattice
        paths = generate_all_paths(self.m, self.n)
        draw_lattice(self.m, self.n)
        o = 40 / self.num_sequences
        i = -0.5 * self.num_sequences
        for j in range(len(self.sequences)):
            draw_path(
                translate(paths[self.sequences[j]], "to_A"), self.m, self.n, o * i, j
            )
            i += 1
    def __repr__(self):
        return f"Genome(num_genes = {self.num_sequences}, m = {self.m}, n = {self.n}, k= {self.k})"