From 76001f47f761c68c3aee51b53f5d82404d812cbb Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Thu, 27 Jun 2024 14:41:22 +0100
Subject: [PATCH] added bootstrapped xi indicator function for full dataset

---
 .../multiclosure_inconsistent.py              |  57 ++++++++-
 .../multiclosure_inconsistent_output.py       |  34 ++++++
 .../src/validphys/closuretest/multiclosure.py | 114 ++++++++++--------
 3 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 939e4659de..aa9a42f38c 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -19,6 +19,7 @@
 from validphys.closuretest.multiclosure import (
     bootstrapped_internal_multiclosure_dataset_loader,
     bootstrapped_internal_multiclosure_data_loader,
+    standard_indicator_function,
 )
 
 from reportengine import collect
@@ -459,7 +460,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
 
     # find basis that diagonalise covmat pca
     eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca)
-    
+
     if pca_loader.n_comp == 1:
         delta_bias = pca_loader.pc_basis * delta_bias
         std_deviations = []
@@ -480,6 +481,60 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
 )
 
 
+def bootstrapped_principal_components_normalized_delta_data(
+    bootstrapped_internal_multiclosure_data_loader_pca,
+):
+    """
+    Compute the normalized deltas for each bootstrap sample.
+
+    Parameters
+    ----------
+    bootstrapped_internal_multiclosure_data_loader_pca: list
+        list of tuples containing the results of multiclosure fits after pca regularization
+
+    Returns
+    -------
+    list
+        list of tuples containing the normalized deltas and the number of principal components.
+        Each tuple corresponds to a bootstrap sample.
+    """
+    normalised_deltas = []
+    for boot_imdl_pca in bootstrapped_internal_multiclosure_data_loader_pca:
+        normalised_deltas.append(principal_components_normalized_delta_data(boot_imdl_pca))
+    return normalised_deltas
+
+
+def bootstrapped_indicator_function_data(
+    bootstrapped_principal_components_normalized_delta_data, nsigma=1
+):
+    """
+    Compute the indicator function for each bootstrap sample.
+
+    Parameters
+    ----------
+    bootstrapped_principal_components_normalized_delta_data: list
+        list of tuples containing the normalized deltas and the number of principal components.
+        Each tuple corresponds to a bootstrap sample.
+
+    nsigma: int, default is 1
+
+    Returns
+    -------
+    2-D tuple:
+        list
+            list of length N_boot and entrances are arrays of dim Npca x Nfits containing the indicator function for each bootstrap sample.
+
+        float
+            average number of degrees of freedom
+    """
+    indicator_list = []
+    ndof_list = []
+    for boot, ndof in bootstrapped_principal_components_normalized_delta_data:
+        indicator_list.append(standard_indicator_function(boot, nsigma))
+        ndof_list.append(ndof)
+    return indicator_list, np.mean(np.asarray(ndof_list))
+
+
 def bootstrapped_principal_components_bias_variance_dataset(
     bootstrapped_internal_multiclosure_dataset_loader_pca, dataset
 ):
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 08d82e5ca9..74bf7dbe7d 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -521,3 +521,37 @@ def delta_histogram(principal_components_normalized_delta_data, title, label_his
     ax.plot(x, y, label="Standard gaussian")
     ax.legend()
     return fig
+
+
+@table
+def table_xi_indicator_function_data(bootstrapped_indicator_function_data):
+    """
+    Computes the bootstrap average and std of the indicator function for the data.
+
+    Parameters
+    ----------
+    bootstrapped_indicator_function_data: tuple
+
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the average and std of the indicator function for the data.
+    """
+    indicator_list, mean_dof = bootstrapped_indicator_function_data
+
+    # average over data and fits within the bootstrap samples
+    mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list])
+
+    # take bootstrap expectation and variance
+    mean_xi = np.mean(mean_boot_vals)
+    std_xi = np.std(mean_boot_vals)
+
+    records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)]
+
+    df = pd.DataFrame.from_records(
+        records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi")
+    )
+
+    df.columns = ["mean_dof", "mean_xi", "std_xi"]
+    return df
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index a9158b8c42..ed1190e84a 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -7,6 +7,7 @@
 ``multiclosure_output.py``
 
 """
+
 import numpy as np
 import scipy.linalg as la
 import scipy.special as special
@@ -116,9 +117,7 @@ def internal_multiclosure_data_loader(
 
 @check_multifit_replicas
 def fits_dataset_bias_variance(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """For a single dataset, calculate the bias and variance for each fit
     and return tuple (bias, variance, n_data), where bias and variance are
@@ -149,22 +148,22 @@ def fits_dataset_bias_variance(
         variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
     return biases, np.asarray(variances), len(law_th)
 
+
 @check_multifit_replicas
 def fits_normed_dataset_central_delta(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20):
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
+):
     """
     For each fit calculate the difference between central expectation value and true val. Normalize this
     value by the variance of the differences between replicas and central expectation value (different
-    for each fit but expected to vary only a little). Each observable central exp value is 
+    for each fit but expected to vary only a little). Each observable central exp value is
     expected to be gaussianly distributed around the true value set by the fakepdf
     """
     closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
     # The dimentions here are (fit, data point, replica)
     reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
     # One could mask here some reps in order to avoid redundancy of information
-    #TODO
+    # TODO
 
     n_data = len(law_th)
     n_fits = np.shape(reps)[0]
@@ -172,21 +171,19 @@ def fits_normed_dataset_central_delta(
     # There are n_fits pdf_covariances
     # flag to see whether to eliminate dataset
     for i in range(n_fits):
-        bias_diffs = np.mean(reps[i], axis = 1) - law_th.central_value
+        bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value
         # bias diffs in the for loop should have dim = (n_obs)
-        sigmas = np.sqrt(np.var(reps[i], axis = 1))
+        sigmas = np.sqrt(np.var(reps[i], axis=1))
         # var diffs has shape (n_obs,reps)
-        bias_diffs = bias_diffs/sigmas
-        
+        bias_diffs = bias_diffs / sigmas
+
         deltas.append(bias_diffs.tolist())
         # biases.shape = (n_fits, n_obs_cut/uncut)
         # variances.shape = (n_fits, n_obs_cut/uncut, reps)
     return np.asarray(deltas)
 
 
-fits_datasets_bias_variance = collect(
-    "fits_dataset_bias_variance", ("data",)
-)
+fits_datasets_bias_variance = collect("fits_dataset_bias_variance", ("data",))
 
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
@@ -200,9 +197,7 @@ def expected_dataset_bias_variance(fits_dataset_bias_variance):
 
 @check_multifit_replicas
 def fits_data_bias_variance(
-    internal_multiclosure_data_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """Like `fits_dataset_bias_variance` but for all data"""
     return fits_dataset_bias_variance(
@@ -350,11 +345,7 @@ def n_replica_samples(fits_pdf, _internal_max_reps=None, _internal_min_reps=20):
     _internal_max_reps.
     """
     return list(
-        range(
-            _internal_min_reps,
-            _internal_max_reps + SAMPLING_INTERVAL,
-            SAMPLING_INTERVAL,
-        )
+        range(_internal_min_reps, _internal_max_reps + SAMPLING_INTERVAL, SAMPLING_INTERVAL)
     )
 
 
@@ -370,13 +361,7 @@ def __init__(self, data):
 
 
 def _bootstrap_multiclosure_fits(
-    internal_multiclosure_dataset_loader,
-    rng,
-    n_fit_max,
-    n_fit,
-    n_rep_max,
-    n_rep,
-    use_repeats,
+    internal_multiclosure_dataset_loader, rng, n_fit_max, n_fit, n_rep_max, n_rep, use_repeats
 ):
     """Perform a single bootstrap resample of the multiclosure fits and return
     a proxy of the base internal object used by relevant estimator actions
@@ -907,9 +892,7 @@ def total_bootstrap_xi(experiments_bootstrap_xi):
 
 
 def dataset_fits_bias_replicas_variance_samples(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """For a single dataset, calculate the samples of chi2-quantities which
     are used to calculate the bias and variance for each fit. The output of this
@@ -950,14 +933,10 @@ def dataset_fits_bias_replicas_variance_samples(
 
 
 def dataset_inputs_fits_bias_replicas_variance_samples(
-    internal_multiclosure_data_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     return dataset_fits_bias_replicas_variance_samples(
-        internal_multiclosure_data_loader,
-        _internal_max_reps=None,
-        _internal_min_reps=20,
+        internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
     )
 
 
@@ -966,9 +945,13 @@ def dataset_inputs_fits_bias_replicas_variance_samples(
 )
 
 
-def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
-                        _internal_max_reps=None,
-                        _internal_min_reps=20):
+def xq2_dataset_map(
+    commondata,
+    cuts,
+    internal_multiclosure_dataset_loader,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
+):
     """
     Load in a dictionary all the specs of a dataset meaning:
     - ds name
@@ -980,17 +963,46 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
     xq2_map_obj = xq2map_with_cuts(commondata, cuts)
     coords = xq2_map_obj[2]
     central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
-    #std_devs = np.std(central_deltas, axis = 0)
-    std_devs = np.sqrt(np.mean(central_deltas**2, axis = 0))
-    means = np.mean(central_deltas, axis = 0)
-    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
+    # std_devs = np.std(central_deltas, axis = 0)
+    std_devs = np.sqrt(np.mean(central_deltas**2, axis=0))
+    means = np.mean(central_deltas, axis=0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
 
     # for case of DY observables we have 2 (x,Q) for each experimental point
     if coords[0].shape[0] != std_devs.shape[0]:
-        std_devs = np.concatenate((std_devs,std_devs))
-        means = np.concatenate((means,means))
-        xi = np.concatenate((xi,xi))
-    return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi}
+        std_devs = np.concatenate((std_devs, std_devs))
+        means = np.concatenate((means, means))
+        xi = np.concatenate((xi, xi))
+    return {
+        'x_coords': coords[0],
+        'Q_coords': coords[1],
+        'std_devs': std_devs,
+        'name': commondata.name,
+        'process': commondata.process_type,
+        'means': means,
+        'xi': xi,
+    }
+
 
+xq2_data_map = collect("xq2_dataset_map", ("data",))
 
-xq2_data_map = collect("xq2_dataset_map",("data",))
\ No newline at end of file
+
+def standard_indicator_function(standard_variable, nsigma=1):
+    """
+    Calculate the indicator function for a standardised variable.
+
+    Parameters
+    ----------
+    standard_variable: np.array
+        array of variables that have been standardised: (x - mu)/sigma
+
+    nsigma: float
+        number of standard deviations to consider
+
+    Returns
+    -------
+    np.array
+        array of ones and zeros. If 1 then the variable is within nsigma standard deviations
+        from the mean, otherwise it is 0.
+    """
+    return np.array(abs(standard_variable) < nsigma, dtype=int)