compile_changes.py

"""
This is the main analysis script of the project
Works on time series data generated by cluster_dynamics.py as well as the final lattice to output:
1) Cluster growth/decay probabilities (probability that a cluster of size s will grow/decay, as a function of s)
*_cluster_growth_probabilities.txt

2) Mean change, residue, and number of changes undergone by clusters of all sizes
*_cluster_ds.txt

3) Distribution of residues associated with clusters of all sizes
*_residue_info.txt

4) Number of growth, decay, merge and split processes undergone by clusters of all sizes
*_cluster_processes.txt

5) Number of times every possible change dS occurred
*_changes.txt

6) Cluster distribution of final lattice
*_cluster_distribution.txt

7) Final density of all ensembles
*_densities.txt

All these files are dumped in the "outputs" folder of the root directory
"""


from matplotlib import pyplot as plt
from multiprocessing import Pool, set_start_method
from numpy import histogram, zeros
from os import makedirs, path
from pickle import dump
from tqdm import tqdm

from depth_first_clustering import depth_first_clustering
from utils import load_automaton_data


def analyze_data(model_name, simulation_index):
    """
    All file I/O and data collection is done here
    This function is multiprocessed
    """

    grown_clusters = []
    decayed_clusters = []
    changes_list = []

    data = load_automaton_data(model_name, simulation_index) 
    info = data["info"]
    cluster_data = data["cluster_data"]
    final_lattice = data["final_lattice"]
    final_density = data["density_data"][-1]

    length = len(final_lattice)
    cluster_ds = [[] for _ in range(length * length)]

    if simulation_index == 0:
        print("Analyzing data ...")
        iterator = tqdm(cluster_data)
    else:
        iterator = cluster_data

    for update in iterator:
        if update is None:
            change = 0
            changes_list.append(change)

        elif update["type"] == "growth":
            change = 1
            changes_list.append(change)
            cluster_ds[update["size"]].append(change)
            grown_clusters.append(update["size"])

        elif update["type"] == "decay":
            change = -1
            changes_list.append(change)
            cluster_ds[update["size"]].append(change)
            decayed_clusters.append(update["size"])

        elif update["type"] == "appearance":
            change = 1
            changes_list.append(change)
            cluster_ds[0].append(change)
            grown_clusters.append(0)

        elif update["type"] == "disappearance":
            change = -1
            changes_list.append(change)
            cluster_ds[1].append(change)
            decayed_clusters.append(1)

        elif update["type"] == "merge":
            initial_sizes, final_size = update["initial_sizes"], update["final_size"]
            change = int(final_size - max(initial_sizes))
            changes_list.append(change)
            cluster_ds[int(max(initial_sizes))].append(change)

            for initial_size in initial_sizes:
                grown_clusters.append(int(initial_size))

        elif update["type"] == "split":
            initial_size, final_sizes = update["initial_size"], update["final_sizes"]
            change = int(max(final_sizes) - int(initial_size))
            changes_list.append(change)
            cluster_ds[initial_size].append(change)
            decayed_clusters.append(initial_size)

    analysed_data = [grown_clusters, decayed_clusters, changes_list, cluster_ds, final_lattice, final_density]
    return analysed_data


def compile_changes(model_name, simulation_indices, plot_name='data', calc_residue=True):
    grown_clusters = []
    decayed_clusters = []

    changes_list = []

    final_lattices = []
    final_densities = []

    with Pool(len(simulation_indices)) as pool:
        data = list(pool.starmap(analyze_data, [(model_name, simulation_index) for simulation_index in simulation_indices]))

    length = len(data[0][4])
    cluster_ds = [[] for _ in range(length * length)]

    print("Combining lists...")
    for analysed_data in tqdm(data):
        grown_clusters += analysed_data[0]
        decayed_clusters += analysed_data[1]
        changes_list += analysed_data[2]
        final_lattices.append(analysed_data[4].copy())
        final_densities.append(analysed_data[5])

    for i in range(len(cluster_ds)):
        for analysed_data in data:
            cluster_ds[i] += analysed_data[3][i]

    print("Computing histogram")
    start = 2
    sizes = list(range(start, min(500, min(len(grown_clusters), len(decayed_clusters)) - 10)))
    changes = list(range(int(min(changes_list)), int(max(changes_list)) + 1))
    growth_sizes_histogram = histogram(grown_clusters, bins=sizes)[0]
    decay_sizes_histogram = histogram(decayed_clusters, bins=sizes)[0]
    changes_histogram = histogram(changes_list, bins=changes)[0]
    sizes.pop()
    changes.pop()

    print("Computing probabilities")
    growth_probabilities, decay_probabilities = [], []
    for size in tqdm(sizes):
        total_events = growth_sizes_histogram[size - start] + decay_sizes_histogram[size - start]

        if total_events != 0:
            growth_probabilities.append(growth_sizes_histogram[size - 2] / total_events)
            decay_probabilities.append(decay_sizes_histogram[size - 2] / total_events)
        else:
            growth_probabilities.append(0)
            decay_probabilities.append(0)

    folder_path = path.join(path.dirname(__file__), "outputs")
    makedirs(folder_path, exist_ok=True)

    print("Computing final cluster distribution")
    lattice_length = len(final_lattices[0])
    cluster_distribution = zeros((lattice_length * lattice_length + 1))

    for lattice in final_lattices:
        cluster_distribution += depth_first_clustering(lattice, periodic=True, trim=False)

    max_index = -1
    for i in range(len(cluster_distribution) - 1, -1, -1):
        if cluster_distribution[i] != 0:
            max_index = i
            break

    cluster_distribution = cluster_distribution[:max_index + 1]

    print("Saving cluster growth probabilities ...")
    fp = open(path.join(folder_path, plot_name + '_cluster_growth_probabilities.txt'), "w")
    output_string = ""
    for size in sizes:
        output_string += f"{size} {growth_probabilities[size - start]}\n"
    fp.write(output_string)
    fp.close()

    # duct-taping the multiple changes issue
    min_positive_value = float("inf")
    for value in changes_histogram:
        if value > 0 and value < min_positive_value:
            min_positive_value = value
    
    if min_positive_value == float("inf"):
        min_positive_value = 1

    multiple_true = True
    for value in changes_histogram:
        if value % min_positive_value != 0:
            multiple_true = False
            break

    if multiple_true:
        changes_histogram = changes_histogram / min_positive_value

    print("Saving mean, mean_sq and number of changes undergone by each cluster ...")
    fp = open(path.join(folder_path, plot_name + '_cluster_ds.txt'), "w")
    output_string = ""
    residue_info = []

    for i in tqdm(range(len(cluster_ds))):
        if len(cluster_ds[i]) == 0:
            output_string += f"{i} 0 0 0\n"
            continue

        mean = sum(cluster_ds[i]) / len(cluster_ds[i])
        mean_sq = sum([(value - mean) ** 2 for value in cluster_ds[i]]) / len(cluster_ds[i])
        output_string += f"{i} {mean} {mean_sq} {len(cluster_ds[i])}\n"

        if calc_residue and i > 0 and len(cluster_ds[i]) > 1000 and (i in [10, 20, 30, 40, 50, 70, 90, 100] or i % 500 == 0):
            residue = [int(value - mean) for value in cluster_ds[i]]
            min_bin = min(residue) - 1
            max_bin = max(residue) + 1

            freq, bins = histogram(residue, bins=[i for i in range(min_bin, max_bin + 1)])
            residue_info.append({
                "size": i,
                "min_bin": min_bin,
                "max_bin": max_bin,
                "freq": freq
            })
        
    fp.write(output_string)
    fp.close()

    if calc_residue:
        print("Saving residue information ...")
        fp = open(path.join(folder_path, plot_name + '_residue_info.txt'), "w")
        output_string = ""
        for info in residue_info:
            output_string += f"{info['size']} : {info['min_bin']}, {info['max_bin']} : {', '.join([str(val) for val in info['freq']])}\n"
        fp.write(output_string)
        fp.close()

    print("Saving number of each process undergone by each cluster ...")
    fp = open(path.join(folder_path, plot_name + '_cluster_processes.txt'), "w")
    output_string = ""
    for i in tqdm(range(len(cluster_ds))):
        num_growth = 0
        num_decay = 0
        num_merge = 0
        num_split = 0

        for value in cluster_ds[i]:
            if value == 1:
                num_growth += 1
            elif value == -1:
                num_decay += 1
            elif value > 1:
                num_merge += 1
            elif value < 1:
                num_split += 1

        output_string += f"{i} {num_growth} {num_decay} {num_merge} {num_split}\n"
    fp.write(output_string)
    fp.close()

    print("Saving cluster change values ...")
    fp = open(path.join(folder_path, plot_name + '_changes.txt'), 'w')
    output_string = ""
    for change in changes:
        output_string += f"{change} {int(changes_histogram[change - min(changes)])}\n"
    fp.write(output_string)
    fp.close()

    print("Saving cluster distribution ...")
    fp = open(path.join(folder_path, plot_name + '_cluster_distribution.txt'), 'w')
    output_string = ""
    for i, num in enumerate(cluster_distribution):
        output_string += f"{i} {int(num)}\n"
    fp.write(output_string)
    fp.close()

    print("Saving final densities ...")
    fp = open(path.join(folder_path, plot_name + '_densities.txt'), "w")
    output_string = ""
    for i, density in enumerate(final_densities):
        output_string += f"{i} {density}\n"
    fp.write(output_string)
    fp.close()

    print("Saving final lattices ...")
    fp = open(path.join(folder_path, plot_name + '_final_lattices.pkl'), "wb")
    dump(final_lattices, fp)


if __name__ == '__main__':
    compile_changes("tricritical", [0], plot_name="0p73")