From 9f2f9d49a9ee66e04b50a2d84718f29a84bc25f7 Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 15:55:15 +0100
Subject: [PATCH 01/61] Init analysis

---
 validphys2/src/validphys/kinematics.py | 46 ++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 28c0217ffa..a9af6132ef 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -96,6 +96,30 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True):
 
 all_kinlimits = collect(kinlimits, ('dataset_inputs',))
 
+def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
+                        _internal_max_reps=None,
+                        _internal_min_reps=20):
+    """
+    Load in a dictionary all the specs of a dataset meaning:
+    - ds name
+    - ds coords
+    - standard deviation (in multiclosure)
+    - mean (in multiclosure again)
+    - (x,Q^2) coords
+    """
+    xq2_map_obj = xq2map_with_cuts(commondata, cuts)
+    coords = xq2_map_obj[2]
+    central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
+    std_devs = np.std(central_deltas, axis = 0)
+    means = np.mean(central_deltas, axis = 0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
+    #import ipdb; ipdb.set_trace()
+    # for case of DY observables we have 2 (x,Q) for each experimental point
+    if coords[0].shape[0] != std_devs.shape[0]:
+        std_devs = np.concatenate((std_devs,std_devs))
+        means = np.concatenate((means,means))
+        xi = np.concatenate((xi,xi))
+    return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi}
 
 @table
 def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True):
@@ -185,6 +209,28 @@ def xq2map_with_cuts(commondata, cuts, group_name=None):
 dataset_inputs_by_groups_xq2map = collect(
     xq2map_with_cuts, ('group_dataset_inputs_by_metadata', 'data_input')
 )
+xq2_data_map = collect("xq2_dataset_map",("data",))
+
+@figuregen
+def xq2_data_prcs_maps(xq2_data_map):
+    import matplotlib.pyplot as plt
+    import matplotlib.colors
+    from matplotlib.colors import ListedColormap
+    keys = ["std_devs","xi"]
+    for elem in xq2_data_map:
+        for k in keys:
+            fig, ax = plotutils.subplots()
+            im = ax.scatter(elem['x_coords'],elem['Q_coords'],
+                                c=(np.asarray(elem[k])), 
+                                cmap='viridis',
+                                label = elem['name'])
+            fig.colorbar(im,label=k)
+            ax.set_xscale('log')  # Set x-axis to log scale
+            ax.set_yscale('log')  # Set y-axis to log scale
+            ax.set_xlabel('x')
+            ax.set_ylabel('Q2')
+            ax.set_title(elem["name"]+" "+k)
+            yield fig
 
 
 def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False):

From be93df4bee36b6f07e54f7946f821f0af9aab332 Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 16:15:05 +0100
Subject: [PATCH 02/61] Add other functions

---
 .../multiclosure_inconsistent.py              | 362 ++++++++++++++++++
 .../multiclosure_inconsistent_output.py       |  54 +++
 .../src/validphys/closuretest/multiclosure.py |  35 ++
 validphys2/src/validphys/kinematics.py        |  12 +-
 4 files changed, 462 insertions(+), 1 deletion(-)
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
 create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
new file mode 100644
index 0000000000..4cb12f2d30
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -0,0 +1,362 @@
+"""
+closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+
+Module containing all of the statistical estimators which are
+averaged across multiple inconsistent fits. The actions
+in this module are used to produce results which are plotted in
+``multiclosure_inconsistent_output.py``
+
+"""
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn import preprocessing
+
+from validphys import covmats
+from validphys.calcutils import calc_chi2
+from validphys.results import ThPredictionsResult
+
+from reportengine import collect
+
+
+""" To load several multiclosure fits. Useful for inconsistent closure test analysis """
+multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",))
+
+multi_dataset_fits_bias_replicas_variance_samples = collect(
+    "dataset_fits_bias_replicas_variance_samples", ("dataspecs",)
+)
+
+multi_fits_bootstrap_dataset_bias_variance = collect(
+    "fits_bootstrap_dataset_bias_variance", ("dataspecs",)
+)
+
+multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",))
+
+multi_dataset_fits_bias_variance_samples_pca = collect(
+    "dataset_fits_bias_variance_samples_pca", ("dataspecs",)
+)
+
+
+def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93):
+    """
+    Compute the principal components of theory predictions replica matrix
+    (Ndat x Nrep feature matrix).
+
+    Parameters
+    ----------
+    dataset: (DataSetSpec, DataGroupSpec)
+        dataset for which the theory predictions and t0 covariance matrix
+        will be loaded. Note that due to the structure of `validphys` this
+        function can be overloaded to accept a DataGroupSpec.
+
+    fits_pdf: list
+        list of PDF objects produced from performing multiple closure tests
+        fits. Each fit should have a different filterseed but the same
+        underlying law used to generate the pseudodata.
+
+    variancepdf: validphys.core.PDF
+            PDF object used to estimate the variance of the fits.
+
+    explained_variance_ratio: float, default is 0.93
+
+    Returns
+    -------
+    tuple
+        2D tuple:
+        - matrix of the principal components (PCs) of shape (N_pc, N_dat)
+        - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
+
+    """
+    # fits_dataset_predictions = [
+    #     ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf
+    # ]
+
+    # dimensions here are (Nfits, Ndat, Nrep)
+    # reps = np.asarray([th.error_members for th in fits_dataset_predictions])
+
+    # reshape so as to get PCs from all the samples
+    # reps = reps.reshape(reps.shape[1],-1)
+
+    # get replicas from variance fit, used to estimate variance
+    reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
+
+    # rescale feature matrix
+    reps_scaled = reps  # preprocessing.scale(reps)
+
+    # choose number of principal components (PCs) based on explained variance ratio
+    n_comp = 1
+    for _ in range(reps.shape[0]):
+        pca = PCA(n_comp).fit(reps_scaled.T)
+        if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
+            break
+        n_comp += 1
+
+    # project feature matrix onto PCs
+    pc_reps = pca.components_ @ reps
+
+    return pca.components_, pc_reps, n_comp
+
+
+def principal_components_bias_variance_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    TODO
+    """
+
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <=1:
+        return None, None, n_comp
+        
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    # compute bias diff and project it onto space spanned by PCs
+    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
+    # shape here is (Npc, Nfits)
+    delta_bias = pc_basis @ delta_bias
+
+    # compute biases, shape of biases is (Nfits)
+    biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+
+    return biases, np.asarray(variances), n_comp
+
+
+principal_components_bias_variance_datasets = collect(
+    "principal_components_bias_variance_dataset", ("data",)
+)
+
+
+def principal_components_variance_distribution_dataset(
+    internal_multiclosure_dataset_loader, principal_components_dataset
+):
+    """
+    TODO
+    """
+
+    closures_th, _, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return None, n_comp
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    covmat_pdf = np.cov(pc_reps)
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append([calc_chi2(sqrt_covmat_pdf, diffs)])
+
+    return np.asarray(variances), n_comp
+
+
+principal_components_variance_distribution_datasets = collect(
+    "principal_components_variance_distribution_dataset", ("data",)
+)
+
+
+def compute_num_components(covariance_matrix, threshold=0.99):
+    """
+    Compute the number of principal components to keep based on a desired explained variance threshold.
+
+    Parameters
+    ----------
+    covariance_matrix : np.ndarray, 2-D array
+
+    threshold : (float): Desired explained variance threshold (between 0 and 1).
+
+    Returns
+    -------
+    int
+        num_components: Number of principal components to keep.
+
+    """
+    eig_val, eig_vec = np.linalg.eig(covariance_matrix)
+    idx = eig_val.argsort()[::-1]
+    eig_val = eig_val[idx]
+    eig_vec = eig_vec[:, idx]
+
+    cumulative_sum = np.cumsum(eig_val)
+    total_sum = np.sum(eig_val)
+    num_components = np.argmin(np.abs(cumulative_sum / total_sum - threshold))
+
+    return num_components
+
+
+def pca_covmat(X, num_components):
+    """
+    given data X of shape (n,p), reduce its dimension to
+    (n,num_components) and return the covariance matrix
+    of the reduced data matrix.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+    """
+    pca = PCA(num_components)
+    X_reduced = pca.fit_transform(X)
+    covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1)
+    return covariance
+
+
+def calc_chi2_pca(pdf_cov, diff, num_components):
+    """
+    Computes the chi2 between pdf_cov and diff by first projecting
+    them into num_components PCs.
+
+    Parameters
+    ----------
+    pdf_cov: np.ndarray
+
+    diff: np.ndarray
+
+    num_components: int
+
+    Returns
+    -------
+    float or np.ndarray depending on input
+
+    """
+    # Diagonalize the matrix
+    L, W = np.linalg.eig(pdf_cov)
+
+    # Sort the eigenvalues and eigenvectors from largest to smallest
+    idx = L.argsort()[::-1]
+    L = L[idx]
+    W = W[:, idx]
+
+    # Keep only the n_components largest eigenvectors
+    Wtilde = W[:, :num_components]
+
+    # Transform initial covariance matrix
+    pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real
+
+    # transform data
+    diff_pca = diff.T @ Wtilde
+
+    try:
+        sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca)
+    except:
+        raise ValueError(
+            f"PCA Covariance Matrix cannot be Cholesky decomposed, perhaps less than {num_components} PC should be kept"
+        )
+
+    return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T))
+
+
+def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    similar to `dataset_fits_bias_replicas_variance_samples`.
+
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_eig: number of eigenvalues kept in PCA, i.e.,
+          ndata in the new, lower dimensional, space.
+    Note that we return Nfits values of the variance so that computing the
+    Bias - Variance ratio is straightforward.
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take mean across replicas - since we might have changed no. of reps
+    centrals = reps.mean(axis=2)
+
+    # compute the PDF covariance matrix of the central samples
+    if centrals.shape[0] <= 1:
+        raise ValueError(f"Need more than one fit to compute the 'Bias' PDF covariance Matrix")
+
+    pdf_cov_bias = np.cov(centrals.T)
+
+    # find number of (ordered) eigenvalues that explain 99% of the total variance (total sum of eigenvalues)
+    n_eig = compute_num_components(pdf_cov_bias, threshold)
+
+    # compute bias from PCs
+    diffs_bias = centrals.T - law_th.central_value[:, np.newaxis]
+    biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig)
+
+    # compute variances from PCs
+    variances = []
+
+    # loop over fits to compute variances
+    for i in range(reps.shape[0]):
+        diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+        pdf_cov_var = np.cov(reps[i, :, :])
+
+        variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig)))
+
+    return biases, np.asarray(variances), n_eig
+
+
+def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """
+    returns the bias and variance expected values as well as the number of
+    principal components.
+    """
+    biases, variances, n_eig = dataset_fits_bias_variance_samples_pca
+    return np.mean(biases), np.mean(variances), n_eig
+
+
+expected_datasets_fits_bias_variance_samples_pca = collect(
+    "expected_dataset_fits_bias_variance_samples_pca", ("data",)
+)
+
+
+def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
+    """ """
+    biases, variances, _ = dataset_fits_bias_variance_samples_pca
+    sqrt_ratios = np.sqrt(biases / variances)
+    return sqrt_ratios
+
+
+def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99):
+    """
+    returns parameters of multi gaussian distribution of replicas
+    and central replicas
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    # The dimensions here are (fit, data point, replica)
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    # take mean across replicas - since we might have changed no. of reps
+    centrals = reps.mean(axis=2)
+
+    centrals_covmat = np.cov(centrals.T)
+    centrals_covmat = pca_covmat(
+        centrals, num_components=compute_num_components(centrals_covmat, threshold)
+    )
+    mean_centrals = np.mean(centrals, axis=0)
+
+    replicas_covmat = 0
+    for i in range(reps.shape[0]):
+        replicas_covmat = np.cov(reps[i, :, :])
+        replicas_covmat += pca_covmat(
+            reps[i, :, :].T, num_components=compute_num_components(replicas_covmat, threshold)
+        )
+    replicas_covmat /= reps.shape[0]
+    mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1)
+
+    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
\ No newline at end of file
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
new file mode 100644
index 0000000000..2f057510ab
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+from reportengine.table import table
+
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
+
+@table
+def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    TODO
+    """
+    records = []
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+        
+        try:
+            bias = np.mean(biases)
+            variance = np.mean(variances)
+            rbv = bias / variance
+            sqrt_rbv = np.sqrt(bias / variance)
+            records.append(
+            dict(
+                dataset=str(ds),
+                dof=n_comp,
+                bias=bias,
+                variance=variance,
+                ratio=rbv,
+                ratio_sqrt=sqrt_rbv,
+            )
+        )
+        
+        except:
+            records.append(
+            dict(
+                dataset=str(ds),
+                dof=n_comp,
+                bias=bias,
+                variance=variance,
+                ratio=np.nan,
+                ratio_sqrt=np.nan,
+            ))
+    
+        
+            
+
+    df = pd.DataFrame.from_records(
+            records,
+            index="dataset",
+            columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
+        )
+    df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
\ No newline at end of file
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 5b545569e2..64d26c4d69 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -147,6 +147,41 @@ def fits_dataset_bias_variance(
         variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
     return biases, np.asarray(variances), len(law_th)
 
+@check_multifit_replicas
+def fits_normed_dataset_central_delta(
+    internal_multiclosure_dataset_loader,
+    _internal_max_reps=None,
+    _internal_min_reps=20):
+    """
+    For each fit calculate the difference between central expectation value and true val. Normalize this
+    value by the variance of the differences between replicas and central expectation value (different
+    for each fit but expected to vary only a little). Each observable central exp value is 
+    expected to be gaussianly distributed around the true value set by the fakepdf
+    """
+    closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
+    # The dimentions here are (fit, data point, replica)
+    #import ipdb; ipdb.set_trace()
+    reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
+    # One could mask here some reps in order to avoid redundancy of information
+    #TODO
+
+    n_data = len(law_th)
+    n_fits = np.shape(reps)[0]
+    deltas = []
+    # There are n_fits pdf_covariances
+    # flag to see whether to eliminate dataset
+    for i in range(n_fits):
+        bias_diffs = np.mean(reps[i], axis = 1) - law_th.central_value
+        # bias diffs in the for loop should have dim = (n_obs)
+        sigmas = np.sqrt(np.var(reps[i], axis = 1))
+        # var diffs has shape (n_obs,reps)
+        bias_diffs = bias_diffs/sigmas
+        
+        deltas.append(bias_diffs.tolist())
+        # biases.shape = (n_fits, n_obs_cut/uncut)
+        # variances.shape = (n_fits, n_obs_cut/uncut, reps)
+    #import ipdb; ipdb.set_trace()
+    return np.asarray(deltas)
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
     """For a given dataset calculate the expected bias and variance across fits
diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index a9af6132ef..1a5e08cbed 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -12,9 +12,20 @@
 from reportengine import collect
 from reportengine.checks import check_positive
 from reportengine.table import table
+
 from validphys.core import CutsPolicy
 from validphys.plotoptions import core as plotoptions_core
 
+from validphys import plotutils
+from validphys import plotoptions
+from validphys.core import CutsPolicy
+from validphys.closuretest import fits_normed_dataset_central_delta
+from validphys.closuretest import dataset_xi
+from validphys.closuretest import dataset_replica_and_central_diff
+
+from reportengine.figure import figuregen
+
+
 log = logging.getLogger(__name__)
 
 
@@ -113,7 +124,6 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
     std_devs = np.std(central_deltas, axis = 0)
     means = np.mean(central_deltas, axis = 0)
     xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
-    #import ipdb; ipdb.set_trace()
     # for case of DY observables we have 2 (x,Q) for each experimental point
     if coords[0].shape[0] != std_devs.shape[0]:
         std_devs = np.concatenate((std_devs,std_devs))

From c041049e503b18e2d6475c40a1264b44fa2514b3 Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 16:33:18 +0100
Subject: [PATCH 03/61] Add other functions again

---
 .../multiclosure_inconsistent_output.py       | 292 +++++++++++++++++-
 1 file changed, 290 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 2f057510ab..33310361d1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -1,11 +1,87 @@
-import numpy as np
+"""
+multiclosure_inconsistent_output
+
+Module containing the actions which produce some output in validphys
+reports i.e figures or tables for (inconsistent) multiclosure 
+estimators in the space of data
+
+"""
 import pandas as pd
+import numpy as np
 from scipy import stats
 
+from reportengine.figure import figure, figuregen
 from reportengine.table import table
 
+from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
 
+
+@figuregen
+def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset):
+    """
+    TODO
+    """
+    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
+        biases, variances, n_comp = pc_bias_var_dataset
+        
+        try:
+            sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances))
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+            pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue
+            fig, ax = plotutils.subplots()
+            ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}")
+            ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+            ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}")
+
+            ax.legend()
+
+            yield fig
+        
+        except:
+            fig, ax = plotutils.subplots()
+            ax.plot([], [], label=f"Dataset: {str(ds)}")
+            ax.legend()
+            yield fig
+
+@figuregen
+def plot_variance_distribution_datasets(
+    principal_components_variance_distribution_datasets, each_dataset
+):
+    """
+    TODO
+    """
+
+    for pc_var_dataset, ds in zip(
+        principal_components_variance_distribution_datasets, each_dataset
+    ):
+        variances, n_comp = pc_var_dataset
+        try:
+            vals = np.linspace(0, 3 * n_comp, 100)
+            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
+            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
+
+            for i, var_fit in enumerate(variances):
+                pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue
+
+                fig, ax = plotutils.subplots()
+                ax.hist(
+                    var_fit[0],
+                    density=True,
+                    bins='auto',
+                    label=f"Variance {str(ds)}, p={pvalue_ks:.3f}",
+                )
+                ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
+                ax.set_title(f"Fit {i}")
+                ax.legend()
+
+                yield fig
+        except:
+            fig, ax = plotutils.subplots()
+            yield fig
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -51,4 +127,216 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
             columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
         )
     df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
-    return df
\ No newline at end of file
+    return df
+
+
+@table
+def datasets_bias_variance_pca_table(
+    expected_datasets_fits_bias_variance_samples_pca, each_dataset
+):
+    """For each dataset calculate the expected bias and expected variance computed by
+    first projecting the covariance matrix into a lower dimensional space trough PCA.
+
+    """
+    records = []
+    for ds, (bias, var, ndata) in zip(
+        each_dataset, expected_datasets_fits_bias_variance_samples_pca
+    ):
+        records.append(
+            dict(
+                dataset=str(ds),
+                ndata=ndata,
+                bias=bias,
+                variance=var,
+                ratio=bias / var,
+                ratio_sqrt=np.sqrt(bias / var),
+            )
+        )
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
+    df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"]
+    return df
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca):
+    """"""
+    sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca
+    fig, ax = plotutils.subplots()
+    ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio")
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    histogram of the distribution of the variances (k) defined as the
+    distance between central replica and replica (k) in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs (e.g. consistent and inconsistent)
+    fits are defined, than plot comparison of these.
+    """
+    fig, ax = plotutils.subplots()
+    for (_, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    like `plot_variance_distribution_multi`, but for variance computed with the covariance matrix
+    predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
+    """
+    histogram of the distribution of the biases (l) defined as the
+    distance between central replica and underlying law in units of the
+    experimental covariance matrix.
+    If more than one group of dataspecs (e.g. consistent and inconsistent)
+    fits are defined, than plot comparison of these.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias_dist, _, _), spec in zip(
+        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
+    ):
+        label = spec['speclabel']
+
+        ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    like `plot_bias_distribution_multi`, but for variance computed with the covariance matrix
+    predicted by the PDFs (and reduced with PCA).
+    """
+    return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
+
+
+@figure
+def plot_sqrt_ratio_distribution_pca(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
+    """
+    histogram of the distribution of the sqrt ratio of bias and variance computed with
+    the estimated "PDF" covariance matrix (reduced with PCA).
+    If more than one group of dataspecs (e.g. consistent and inconsistent)
+    fits are defined, than plot comparison of these.
+    """
+    fig, ax = plotutils.subplots()
+    ratios = []
+    labels = []
+    for (bias_dist, variance_dist, _), spec in zip(
+        multi_dataset_fits_bias_variance_samples_pca, dataspecs
+    ):
+        labels.append(spec['speclabel'])
+        ratios.append(bias_dist / variance_dist)
+
+    ax.hist(ratios, bins='auto', density=True, label=labels)
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+    """
+    histogram of the distribution of the biases obtained by bootstrapping with
+    `fits_bootstrap_dataset_bias_variance` over the existing fits.
+
+    If more than one group of dataspecs (e.g. consistent and inconsistent)
+    fits are defined, than plot comparison of these.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        label = spec['speclabel']
+
+        ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+
+@figure
+def plot_multi_ratio_bias_variance_distribution_bootstrap(
+    multi_fits_bootstrap_dataset_bias_variance, dataspecs
+):
+    """
+    histogram of the ratio bias variance distribution obtained by bootstrapping bias
+    and variance with `fits_bootstrap_dataset_bias_variance`.
+
+    If more than one group of dataspecs (e.g. consistent and inconsistent)
+    fits are defined, than plot comparison of these.
+    """
+    fig, ax = plotutils.subplots()
+    for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
+        ratio = bias / variance
+        label = spec['speclabel']
+
+        ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label)
+
+    ax.legend()
+    return fig
+
+@figuregen
+def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
+    """
+    TODO
+    """
+
+    evr_range = np.linspace(0.90, 0.995, 10) 
+    
+
+    for ds in each_dataset:
+        l2_cond = []
+        dof = []
+
+        for evr in evr_range:
+            _, pc_reps, n_comp = principal_components_dataset(ds, fits_pdf, variancepdf, explained_variance_ratio=evr)
+            
+            covmat_pdf = np.cov(pc_reps)
+            
+            # compute condition number
+            l2_cond.append(np.linalg.cond(covmat_pdf))
+            dof.append(n_comp)
+            
+
+
+        fig, ax1 = plotutils.subplots(figsize=(15,4))
+        ax1.plot(evr_range, l2_cond, "b-o", label="condition number")
+        ax1.set_title(f"Dataset: {str(ds)}")
+        ax1.set_xlabel("EVR")
+        ax1.set_ylabel("Covariance Matrix Condition Number", color="b")
+        ax1.tick_params('y', color="b")
+
+        ax2 = ax1.twinx()
+
+        # Plot the second dataset on the right y-axis
+        ax2.plot(evr_range, dof, 'r-o', label="dof")
+        ax2.set_ylabel('Degrees of freedom', color="r")
+        ax2.tick_params('y', color="r")
+        # ax1.legend()
+        # ax2.legend()
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax2.get_legend_handles_labels()
+        lines = lines1 + lines2
+        labels = labels1 + labels2
+        ax1.legend(lines, labels, loc='upper left')
+
+
+        yield fig
\ No newline at end of file

From b6abb5309462d79511de66dffbbd18720e212b9b Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 15:55:15 +0100
Subject: [PATCH 04/61] Init analysis

---
 validphys2/src/validphys/kinematics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 1a5e08cbed..6be188cec8 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -124,6 +124,7 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
     std_devs = np.std(central_deltas, axis = 0)
     means = np.mean(central_deltas, axis = 0)
     xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
+
     # for case of DY observables we have 2 (x,Q) for each experimental point
     if coords[0].shape[0] != std_devs.shape[0]:
         std_devs = np.concatenate((std_devs,std_devs))

From c580a9a2bf2012bdf1f085164165a2f2d65ede96 Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 16:15:05 +0100
Subject: [PATCH 05/61] Add other functions

---
 .../multiclosure_inconsistent_output.py                     | 5 ++++-
 validphys2/src/validphys/kinematics.py                      | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 33310361d1..b6b2a02d3f 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -82,6 +82,7 @@ def plot_variance_distribution_datasets(
             fig, ax = plotutils.subplots()
             yield fig
 
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -127,6 +128,7 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
             columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
         )
     df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
+
     return df
 
 
@@ -339,4 +341,5 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
         ax1.legend(lines, labels, loc='upper left')
 
 
-        yield fig
\ No newline at end of file
+        yield fig
+
diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 6be188cec8..d4f1ef4f2f 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -13,9 +13,12 @@
 from reportengine.checks import check_positive
 from reportengine.table import table
 
+
 from validphys.core import CutsPolicy
 from validphys.plotoptions import core as plotoptions_core
 
+
+
 from validphys import plotutils
 from validphys import plotoptions
 from validphys.core import CutsPolicy
@@ -26,6 +29,9 @@
 from reportengine.figure import figuregen
 
 
+
+
+
 log = logging.getLogger(__name__)
 
 

From d3c99b207d67921265765ba48cfcb277c5bf570f Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 20 Feb 2024 16:33:18 +0100
Subject: [PATCH 06/61] Add other functions again

---
 .../multiclosure_inconsistent_output.py                         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index b6b2a02d3f..e2f523afa7 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -83,6 +83,7 @@ def plot_variance_distribution_datasets(
             yield fig
 
 
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -342,4 +343,3 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
 
 
         yield fig
-

From 82a9ff8cbbcfe5485163738a3d4849499283d528 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:19:48 +0200
Subject: [PATCH 07/61] Remove ipdbs and make sure multiclosure_inconsistent
 work

---
 validphys2/src/validphys/closuretest/__init__.py     | 2 ++
 validphys2/src/validphys/closuretest/multiclosure.py | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/__init__.py b/validphys2/src/validphys/closuretest/__init__.py
index 02950f15d6..38bc1228c5 100644
--- a/validphys2/src/validphys/closuretest/__init__.py
+++ b/validphys2/src/validphys/closuretest/__init__.py
@@ -11,3 +11,5 @@
 from validphys.closuretest.multiclosure_pdf_output import *
 from validphys.closuretest.multiclosure_preprocessing import *
 from validphys.closuretest.multiclosure_pseudodata import *
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import *
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent_output import *
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 64d26c4d69..ff71e3caf0 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -160,7 +160,6 @@ def fits_normed_dataset_central_delta(
     """
     closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
     # The dimentions here are (fit, data point, replica)
-    #import ipdb; ipdb.set_trace()
     reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
     # One could mask here some reps in order to avoid redundancy of information
     #TODO
@@ -180,7 +179,6 @@ def fits_normed_dataset_central_delta(
         deltas.append(bias_diffs.tolist())
         # biases.shape = (n_fits, n_obs_cut/uncut)
         # variances.shape = (n_fits, n_obs_cut/uncut, reps)
-    #import ipdb; ipdb.set_trace()
     return np.asarray(deltas)
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):

From 8d635b4b037fdb9f0d6dfa3fdf2fbb1ff3e0bb2e Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Thu, 22 Feb 2024 11:39:43 +0100
Subject: [PATCH 08/61] Add multiclosure comparefits

---
 pyproject.toml                                |   1 +
 .../__init__.py                               |   3 +
 .../comparefits_template.md                   | 121 +++++++++
 .../data.md                                   |   5 +
 .../exponents.md                              |  15 ++
 .../lumi.md                                   |   9 +
 .../multiclosure_analysis.yaml                | 244 ++++++++++++++++++
 .../multiclosure_template.md                  |  14 +
 .../pca_template.md                           |   2 +
 .../pdf.md                                    |  24 ++
 .../single_point_template.md                  |   2 +
 .../src/validphys/scripts/vp_multiclosure.py  | 159 ++++++++++++
 12 files changed, 599 insertions(+)
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/data.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md
 create mode 100644 validphys2/src/validphys/scripts/vp_multiclosure.py

diff --git a/pyproject.toml b/pyproject.toml
index cb3809b892..71fc4ce320 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ wiki-upload = "validphys.scripts.wiki_upload:main"
 vp-get = "validphys.scripts.vp_get:main"
 vp-comparefits = "validphys.scripts.vp_comparefits:main"
 vp-fitrename = "validphys.scripts.vp_fitrename:main"
+vp-multiclosure = "validphys.scripts.vp_multiclosure:main"
 vp-checktheory = "validphys.scripts.vp_checktheory:main"
 vp-pdfrename = "validphys.scripts.vp_pdfrename:main"
 vp-pdffromreplicas = "validphys.scripts.vp_pdffromreplicas:main"
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py
new file mode 100644
index 0000000000..9a64b5152a
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py
@@ -0,0 +1,3 @@
+import pathlib
+
+template_path = pathlib.Path(__file__).with_name('multiclosure_analysis.yaml')
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md
new file mode 100644
index 0000000000..bafb0f5fa5
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md
@@ -0,0 +1,121 @@
+%NNPDF report comparing {@ current fit_id @} and {@ reference fit_id @}
+  
+Summary
+-------
+  
+We are comparing:
+  
+  - {@ current fit @} (`{@ current fit_id @}`): {@ current description @}
+  - {@ reference fit @} (`{@ reference fit_id @}`): {@ reference description @}
+  
+  
+{@ summarise_fits @}
+  
+  
+t0 losses
+---------
+{@ dataspecs::t0_info t0_chi2_info_table @}
+  
+Theory covariance summary
+-------------------------
+{@summarise_theory_covmat_fits@}
+  
+Dataset properties
+------------------
+{@current fit_datasets_properties_table@}
+  
+Distances
+---------
+{@with Scales@}
+### {@Scaletitle@}
+{@with Normalize::Basespecs::PDFscalespecs::Distspecs@}
+#### {@Basistitle@}, {@Xscaletitle@}
+{@plot_pdfdistances@}
+{@plot_pdfvardistances@}
+{@endwith@}
+{@endwith@}
+  
+PDF arc-lengths
+---------------
+{@Basespecs plot_arc_lengths@}
+  
+Sum rules
+---------
+{@with pdfs@}
+### {@pdf@}
+  
+#### Known sum rules
+  
+{@sum_rules_table@}
+  
+#### Unknown sum rules
+  
+{@unknown_sum_rules_table@}
+  
+{@endwith@}
+  
+PDF plots
+---------
+{@with Scales@}
+[Plots at {@Scaletitle@}]({@pdf_report report@})
+{@endwith@}
+  
+Luminosities
+------------
+{@with Energies@}
+[Plots at {@Energytitle@}]({@lumi_report report@})
+{@endwith@}
+  
+Effective exponents
+-------------------
+[Detailed information]({@exponents_report report@})
+  
+Training lengths
+----------------
+{@fits plot_training_length@}
+  
+Training-validation
+-------------------
+{@fits plot_training_validation@}
+  
+{@with DataGroups@}
+$\chi^2$ by {@processed_metadata_group@}
+----------------------------------------
+{@plot_fits_groups_data_chi2@}
+{@endwith@}
+  
+  
+$\chi^2$ by dataset
+-------------------
+### Plot
+{@plot_fits_datasets_chi2@}
+### Table
+{@ProcessGroup fits_chi2_table(show_total=true)@}
+  
+  
+{@with DataGroups@}
+$\phi$ by {@processed_metadata_group@}
+--------------------------------------
+{@plot_fits_groups_data_phi@}
+{@endwith@}
+  
+Dataset plots
+-------------
+{@with matched_datasets_from_dataspecs@}
+[Plots for {@dataset_name@}]({@dataset_report report@})
+{@endwith@}
+  
+Positivity
+----------
+{@with matched_positivity_from_dataspecs@}
+{@plot_dataspecs_positivity@}
+{@endwith@}
+  
+Dataset differences and cuts
+----------------------------
+{@print_dataset_differences@}
+{@print_different_cuts@}
+  
+Code versions
+-------------
+{@fits_version_table@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md
new file mode 100644
index 0000000000..c99e62f3d4
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md
@@ -0,0 +1,5 @@
+% Data-theory comparison for {@dataset_name@}
+# Absolute
+{@plot_fancy_dataspecs@}
+# Normalized
+{@Datanorm plot_fancy_dataspecs@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md
new file mode 100644
index 0000000000..9736e70775
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md
@@ -0,0 +1,15 @@
+%NNPDF report comparing {@ current fit @} and {@ reference fit @}
+
+# Effective preprocessing information
+
+## Plots
+### alpha exponent
+{@current::basisfromfit plot_alpha_eff@}
+### beta exponent
+{@current::basisfromfit plot_beta_eff@}
+
+## Tables
+{@with fits@}
+### Next effective exponents for {@fit@}
+{@effective_exponents_table@}
+{@endwith@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md
new file mode 100644
index 0000000000..a077a4a21f
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md
@@ -0,0 +1,9 @@
+%NNPDF report comparing {@ current fit @} and {@ reference fit @}
+
+# Luminosity plots
+
+{@with Normalize@}
+{@with lumi_channels@}
+{@plot_lumi1d@}
+{@endwith@}
+{@endwith@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
new file mode 100644
index 0000000000..dd4a2b92a5
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -0,0 +1,244 @@
+meta:
+  title: Multiclosure report
+  author: AB
+  keywords: [inconsistencies, multiclosure]
+
+#COMPAREFITS SETTINGS
+
+positivity:
+  from_: fit
+
+description:
+  from_: fit
+
+dataset_inputs:
+  from_: fit
+  
+t0_info:
+  - use_t0: True
+    datacuts:
+      from_: fit
+    t0pdfset:
+      from_: datacuts 
+compare_settings:
+  current:
+    fit: {id: "240210_mnc_dis_ict_lam02"}
+    pdf: {id: "240210_mnc_dis_ict_lam02", label: "Current Fit"}
+    theory:
+      from_: fit
+    theoryid:
+      from_: theory
+    speclabel: "Current Fit"
+  
+  reference:
+    fit: {id: "240210_mnc_dis_ict_lam04"}
+    pdf: {id: "240210_mnc_dis_ict_lam04", label: "Reference Fit" }
+    theory:
+      from_: fit
+    theoryid:
+      from_: theory
+    speclabel: "Reference Fit"
+    
+  pdfs:
+    - from_: current
+    - from_: reference
+
+  fits:
+    - from_: current
+    - from_: reference
+
+  use_cuts: "fromfit"
+  use_weights_in_covmat: False
+
+  Q: 1.651
+
+  Scales:
+    - Q: 1.651
+      Scaletitle: "Q = 1.65 GeV"
+    - Q: 100
+      Scaletitle: "Q = 100 GeV"
+
+  PDFnormalize:
+    - Normtitle: Absolute
+
+    - normalize_to: 2
+      Normtitle: Ratio
+
+  Basespecs:
+    - basis: CCBAR_ASYMM_FLAVOUR
+      Basistitle: Flavour basis
+    - basis: CCBAR_ASYMM
+      Basistitle: Evolution basis
+
+  PDFscalespecs:
+    - xscale: log
+      Xscaletitle: Log
+    - xscale: linear
+      Xscaletitle: Linear
+
+  Energies:
+    - sqrts: 13000
+      Energytitle: "13 TeV"
+
+  lumi_channels:
+    - gg
+    - gq
+    - qq
+    - qqbar
+    - uubar
+    - ddbar
+    - udbar
+    - dubar
+
+  Distspecs:
+    - ymin: 0
+      ymax: 20
+
+  pos_use_kin: True
+
+  dataset_report:
+    meta: Null
+    template: data.md
+
+  pdf_report:
+    meta: Null
+    template: pdf.md
+
+  exponents_report:
+    meta: Null
+    template: exponents.md
+
+  lumi_report:
+    meta: Null
+    template: lumi.md
+  
+  dataspecs:
+    - theoryid:
+        from_: current
+      pdf:
+        from_: current
+      fit:
+        from_: current
+      speclabel:
+        from_: current
+
+    - theoryid:
+        from_: reference
+      pdf:
+        from_: reference
+      fit:
+        from_: reference
+      speclabel:
+        from_: reference
+        
+  Normalize:
+    normalize_to: 2
+
+  Datanorm:
+    normalize_to: data
+
+  DataGroups:
+    - metadata_group: nnpdf31_process
+    - metadata_group: experiment
+
+  ProcessGroup:
+      metadata_group: nnpdf31_process
+
+##################################################################
+#OTHER ANALYSIS SETTINGS
+
+lambdavalues:
+  - label: "LAMBDA02"
+    fit: 230124_dis_ict_lam02_fs_122996
+    dataset_inputs:
+      from_: fit
+    theory:
+      from_: fit
+    theoryid:
+      from_: theory
+    use_t0: True
+    t0pdfset: 210223-mw-000_fakepdf
+    use_cuts: "internal"
+    explained_variance_ratio: 0.99
+    variancepdf: 230708_mnc_dis_pt1_ct_1000
+    fits:
+      - 230124_dis_ict_lam02_fs_122996
+      - 230124_dis_ict_lam02_fs_152326
+      - 230124_dis_ict_lam02_fs_196144
+      - 230124_dis_ict_lam02_fs_200988
+      - 230124_dis_ict_lam02_fs_257767
+      - 230124_dis_ict_lam02_fs_31144
+      - 230124_dis_ict_lam02_fs_374984
+      - 230124_dis_ict_lam02_fs_383440
+      - 230124_dis_ict_lam02_fs_394255
+      - 230124_dis_ict_lam02_fs_431574
+      - 230124_dis_ict_lam02_fs_498979
+      - 230124_dis_ict_lam02_fs_504344
+      - 230124_dis_ict_lam02_fs_52722
+      - 230124_dis_ict_lam02_fs_54647
+      - 230124_dis_ict_lam02_fs_562489
+      - 230124_dis_ict_lam02_fs_57171
+      - 230124_dis_ict_lam02_fs_661665
+      - 230124_dis_ict_lam02_fs_683404
+      - 230124_dis_ict_lam02_fs_752096
+      - 230124_dis_ict_lam02_fs_760929
+      - 230124_dis_ict_lam02_fs_786382
+      - 230124_dis_ict_lam02_fs_834248
+      - 230124_dis_ict_lam02_fs_868248
+      - 230124_dis_ict_lam02_fs_897760
+      - 230124_dis_ict_lam02_fs_916690
+      - 230124_dis_ict_lam02_fs_966390
+  - label: "LAMBDA04"
+    fit: 250124_dis_ict_lam04_fs_102661
+    dataset_inputs:
+      from_: fit
+    theory:
+      from_: fit
+    theoryid:
+      from_: theory
+    use_t0: True
+    t0pdfset: 210223-mw-000_fakepdf
+    use_cuts: "internal"
+    explained_variance_ratio: 0.99
+    variancepdf: 230708_mnc_dis_pt1_ct_1000
+    fits:
+      - 250124_dis_ict_lam04_fs_102661
+      - 250124_dis_ict_lam04_fs_166247
+      - 250124_dis_ict_lam04_fs_194869
+      - 250124_dis_ict_lam04_fs_235399
+      - 250124_dis_ict_lam04_fs_259799
+      - 250124_dis_ict_lam04_fs_280552
+      - 250124_dis_ict_lam04_fs_362871
+      - 250124_dis_ict_lam04_fs_38346
+      - 250124_dis_ict_lam04_fs_392751
+      - 250124_dis_ict_lam04_fs_394227
+      - 250124_dis_ict_lam04_fs_473619
+      - 250124_dis_ict_lam04_fs_475803
+      - 250124_dis_ict_lam04_fs_5238
+      - 250124_dis_ict_lam04_fs_544453
+      - 250124_dis_ict_lam04_fs_547187
+      - 250124_dis_ict_lam04_fs_561201
+      - 250124_dis_ict_lam04_fs_630491
+      - 250124_dis_ict_lam04_fs_691489
+      - 250124_dis_ict_lam04_fs_717426
+      - 250124_dis_ict_lam04_fs_833288
+      - 250124_dis_ict_lam04_fs_85029
+      - 250124_dis_ict_lam04_fs_856511
+      - 250124_dis_ict_lam04_fs_895365
+      - 250124_dis_ict_lam04_fs_902027
+      - 250124_dis_ict_lam04_fs_911964
+      - 250124_dis_ict_lam04_fs_99031
+    
+
+template: multiclosure_template.md
+comparefits_report:
+  meta: Null
+  template: comparefits_template.md
+single_point_report:
+  meta: Null
+  template: single_point_template.md
+pca_report:
+  meta: Null
+  template: pca_template.md
+actions_:
+  - report(main=true)
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
new file mode 100644
index 0000000000..7ad062bd09
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
@@ -0,0 +1,14 @@
+{@ with compare_settings @}
+[Comparefits report]({@comparefits_report report@})
+{@ endwith @}
+
+Single data point analysis
+--------------------------
+{@with lambdavalues@}
+[Lambda value of {@label@}]({@single_point_report report@})
+{@endwith@}
+Bias Variance Ratio Table
+-------------------------
+{@with lambdavalues@}
+[Lambda value of {@label@}]({@pca_report report@})
+{@endwith@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
new file mode 100644
index 0000000000..6269098614
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -0,0 +1,2 @@
+# PCA analysis
+{@table_bias_variance_datasets@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md
new file mode 100644
index 0000000000..54cdc4bd87
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md
@@ -0,0 +1,24 @@
+%NNPDF report comparing {@ current fit @} and {@ reference fit @}
+
+# PDF plots
+
+## PDF comparison
+{@with PDFnormalize@}
+### {@Normtitle@}
+{@with Basespecs@}
+#### {@Basistitle@}
+{@with PDFscalespecs@}
+##### {@Xscaletitle@}
+{@plot_pdfs@}
+{@endwith@}
+{@endwith@}
+{@endwith@}
+
+## PDF replicas
+{@with Basespecs@}
+#### {@Basistitle@}
+{@with PDFscalespecs@}
+##### {@Xscaletitle@}
+{@plot_pdfreplicas@}
+{@endwith@}
+{@endwith@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md
new file mode 100644
index 0000000000..8a8e5e8808
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md
@@ -0,0 +1,2 @@
+# Single data point analysis
+{@xq2_data_prcs_maps@}
diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py
new file mode 100644
index 0000000000..5d1aa7013a
--- /dev/null
+++ b/validphys2/src/validphys/scripts/vp_multiclosure.py
@@ -0,0 +1,159 @@
+import sys
+import os
+import logging
+
+#TODO: Look into making these lazy imports
+import prompt_toolkit
+from prompt_toolkit.completion import WordCompleter
+
+from reportengine.compat import yaml
+from reportengine.colors import t
+
+from validphys.app import App
+from validphys.loader import RemoteLoader
+from validphys import compareinconsistentclosuretemplates
+from validphys.promptutils import confirm, KeywordsWithCache
+
+log = logging.getLogger(__name__)
+
+CURRENT_FIT_LABEL_DEFAULT = "Current Fit"
+REFERENCE_FIT_LABEL_DEFAULT = "Reference Fit"
+
+
+
+class CompareFitApp(App):
+    def add_positional_arguments(self, parser):
+        parser.add_argument(
+            'current_fit',
+            default=None,
+            nargs='?',
+            help="The fit to produce the report for.",
+        )
+        parser.add_argument(
+            'reference_fit',
+            default=None,
+            nargs='?',
+            help="The fit to compare with.")
+        # Group together mandatory arguments that are not positional
+        mandatory = parser.add_argument_group("mandatory", "Mandatory command line arguments")
+        mandatory.add_argument(
+            '--title', help="The title that will be indexed with the report.")
+        mandatory.add_argument('--author', help="The author of the report.")
+        mandatory.add_argument(
+            '--keywords', nargs='+', help="keywords to index the report with.")
+        parser.add_argument(
+            '--current_fit_label',
+            nargs='?',
+            default=CURRENT_FIT_LABEL_DEFAULT,
+            help="The label for the fit that the report is being produced for.",
+        )
+        parser.add_argument(
+            '--reference_fit_label',
+            nargs='?',
+            default=REFERENCE_FIT_LABEL_DEFAULT,
+            help="The label for the fit that is being compared to.")
+        parser.add_argument(
+            '--lambdasettings_path',
+            help="The path to the yaml file containing the settings for the lambdavalues report.")
+        
+        parser.set_defaults()
+
+    def try_complete_args(self):
+        args = self.args
+        argnames = (
+            'current_fit', 'reference_fit', 'title', 'author', 'keywords')
+        optionalnames = (
+            'current_fit_label', 'reference_fit_label')
+        badargs = [argname for argname in argnames if not args[argname]]
+        bad = badargs
+        if bad:
+            sys.exit(f"The following arguments are required: {bad}")
+        texts = '\n'.join(
+            f'    {argname.replace("_", " ").capitalize()}: {args[argname]}'
+            for argname in [*argnames, *optionalnames])
+        log.info(f"Starting NNPDF fit comparison:\n{texts}")
+
+    def get_commandline_arguments(self, cmdline=None):
+        args = super().get_commandline_arguments(cmdline)
+        # This is needed because the environment wants to know how to resolve
+        # the relative paths to find the templates. Best to have the template
+        # look as much as possible as a runcard passed from the command line
+        args['config_yml'] = compareinconsistentclosuretemplates.template_path
+        return args
+
+    def complete_mapping(self):
+        args = self.args
+        autosettings = {}
+        shortsettings = {}
+        autosettings['meta'] = {
+            'title': args['title'],
+            'author': args['author'],
+            'keywords': args['keywords']
+        }
+        currentmap = {'id': args['current_fit'], 'label': args['current_fit_label']}
+        shortsettings['current'] = {
+            'fit': currentmap,
+            'pdf': currentmap,
+            'theory': {
+                'from_': 'fit'
+            },
+            'theoryid': {
+                'from_': 'theory'
+            },
+            'speclabel': args['current_fit_label']
+        }
+        refmap = {'id': args['reference_fit'], 'label': args['reference_fit_label']}
+        shortsettings['reference'] = {
+            'fit': refmap,
+            'pdf': refmap,
+            'theory': {
+                'from_': 'fit'
+            },
+            'theoryid': {
+                'from_': 'theory'
+            },
+            'speclabel': args['reference_fit_label']
+        }
+        autosettings["compare_settings"] = shortsettings
+        return autosettings
+
+    def complete_lambdavaluesmapping(self):
+        args = self.args
+        # opening conf file
+        with open(args['lambdasettings_path']) as f:
+            settings = yaml.safe_load(f)
+        autosettings = {}
+        list_lambdas = []
+        for lambdas in settings:
+            lambdasetting = {}
+            for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
+                lambdasetting[set] = lambdas[set]
+            list_lambdas.append(lambdasetting)
+        autosettings["lambdavalues"] = list_lambdas
+        return autosettings
+            
+    def get_config(self):
+        self.try_complete_args()
+        #No error handling here because this is our internal file
+        with open(self.args['config_yml']) as f:
+            #TODO: Ideally this would load round trip but needs
+            #to be fixed in reportengine.
+            c = yaml.safe_load(f)
+        complete_mapping = self.complete_mapping()
+        complete_mapping.update(self.complete_lambdavaluesmapping())
+        c["meta"] = complete_mapping["meta"]
+        for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]):
+            for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
+                lambdas[set] = lambdas_mapping[set]
+        for set in ["current", "reference"]:
+            c["compare_settings"][set] = complete_mapping["compare_settings"][set]
+        return self.config_class(c, environment=self.environment)
+
+
+def main():
+    a = CompareFitApp()
+    a.main()
+
+
+if __name__ == '__main__':
+    main()

From 0b6acae10b1901b5203ecca5cc57e3199328eb80 Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Mon, 26 Feb 2024 10:48:48 +0100
Subject: [PATCH 09/61] Add sklearn to pyproject

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 71fc4ce320..06b53483ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,6 +97,7 @@ fiatlux = {version = "*", optional = true}
 # without lhapdf
 pdfflow = {version = "^1.2.1", optional = true}
 lhapdf-management = {version = "^0.5", optional = true}
+scikit-learn = "^1.4.1"
 
 # Optional dependencies
 [tool.poetry.extras]

From e6936db384d87b94271aa634b499329ec31bb7cd Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:21:03 +0200
Subject: [PATCH 10/61] Scikit added to conda

---
 conda-recipe/meta.yaml | 1 +
 pyproject.toml         | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 9d1b6376e3..8c6b1dd12c 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -44,6 +44,7 @@ requirements:
         - sphinx_rtd_theme >0.5
         - sphinxcontrib-bibtex
         - ruamel.yaml <0.18
+        - scikit-learn = "^1.4.1"
 
 test:
     requires:
diff --git a/pyproject.toml b/pyproject.toml
index 06b53483ba..71fc4ce320 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,7 +97,6 @@ fiatlux = {version = "*", optional = true}
 # without lhapdf
 pdfflow = {version = "^1.2.1", optional = true}
 lhapdf-management = {version = "^0.5", optional = true}
-scikit-learn = "^1.4.1"
 
 # Optional dependencies
 [tool.poetry.extras]

From baa99a3502465954ddb51cea5a4d058060db8ddd Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 11:51:27 +0000
Subject: [PATCH 11/61] added PCA bias variance example to validphys2/examples

---
 .../examples/pca_bias_variance_report.yaml    | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 validphys2/examples/pca_bias_variance_report.yaml

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
new file mode 100644
index 0000000000..5c355a8092
--- /dev/null
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -0,0 +1,44 @@
+meta:
+  title: PCA report for bias and variance of multiclosure fits.
+  author: Mark N. Costantini
+  keywords: [multiclosure, PCA, bias, variance]
+
+
+dataset_inputs:
+  - {dataset: HERA_NC_225GEV_EP-SIGMARED, variant: legacy}
+  - {dataset: HERA_NC_251GEV_EP-SIGMARED, variant: legacy}
+  - {dataset: HERA_NC_300GEV_EP-SIGMARED, variant: legacy}
+  - {dataset: HERA_NC_318GEV_EP-SIGMARED, variant: legacy}
+  - {dataset: HERA_CC_318GEV_EM-SIGMARED, variant: legacy}
+  - {dataset: HERA_CC_318GEV_EP-SIGMARED, variant: legacy}
+
+
+# variance pdf is used to compute the variance of fits
+# the assumption is that fits have the same variance (needs to be checked)
+variancepdf: 230124_dis_ict_lam02_fs_122996
+
+
+theoryid: 200
+use_cuts: internal
+use_t0: True
+t0pdfset: 210223-mw-000_fakepdf
+
+
+explained_variance_ratio: 0.99
+
+fits:
+  - 230124_dis_ict_lam02_fs_122996
+  - 230124_dis_ict_lam02_fs_196144
+  - 230124_dis_ict_lam02_fs_257767
+
+
+template_text: |
+
+  PCA report for bias and variance of multiclosure fits.
+  ------------------------------------------------------
+
+  {@table_bias_variance_datasets@}
+
+
+actions_:
+  - report(main=true)
\ No newline at end of file

From a7399641dc32edb275132d32b8062c3119d0c664 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 14:44:14 +0000
Subject: [PATCH 12/61] l2 condition number plot

---
 .../multiclosure_inconsistent.py              |  6 +-
 .../multiclosure_inconsistent_output.py       | 59 +++++++++++++------
 2 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 4cb12f2d30..899e50252b 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -110,9 +110,9 @@ def principal_components_bias_variance_dataset(
 
     pc_basis, pc_reps, n_comp = principal_components_dataset
 
-    if n_comp <=1:
+    if n_comp <= 1:
         return None, None, n_comp
-        
+
     # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
     covmat_pdf = np.cov(pc_reps)
     sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
@@ -359,4 +359,4 @@ def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, thres
     replicas_covmat /= reps.shape[0]
     mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1)
 
-    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
\ No newline at end of file
+    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index e2f523afa7..d15de1c3ae 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -297,14 +297,38 @@ def plot_multi_ratio_bias_variance_distribution_bootstrap(
     return fig
 
 @figuregen
-def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
-    """
-    TODO
+def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20):
     """
+    Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
+    The plot gives an idea of the stability of the covariance matrix as a function of the 
+    exaplained variance ratio and hence the number of principal components used to reduce the dimensionality.
 
-    evr_range = np.linspace(0.90, 0.995, 10) 
-    
+    The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this
+    threshold number (and the derived explained variance ratio) should be chosen so that
+
+    relative error in output (inverse covmat) <= relative error in input (covmat) * condition number
 
+
+    Parameters
+    ----------
+    each_dataset : list
+        List of datasets
+    
+    fits_pdf: list
+        list of validphys.core.PDF objects
+    
+    variancepdf: validphys.core.PDF
+        PDF object for the variance
+    
+    Yields
+    ------
+    fig
+        Figure object
+    """
+    
+    # Explained variance ratio range
+    evr_range = np.linspace(evr_min, evr_max, evr_n)
+    
     for ds in each_dataset:
         l2_cond = []
         dof = []
@@ -318,28 +342,27 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf):
             l2_cond.append(np.linalg.cond(covmat_pdf))
             dof.append(n_comp)
             
-
-
-        fig, ax1 = plotutils.subplots(figsize=(15,4))
-        ax1.plot(evr_range, l2_cond, "b-o", label="condition number")
+        fig, ax1 = plotutils.subplots()
+        ax1.plot(evr_range, l2_cond, label="L2 Condition Number")
         ax1.set_title(f"Dataset: {str(ds)}")
-        ax1.set_xlabel("EVR")
-        ax1.set_ylabel("Covariance Matrix Condition Number", color="b")
-        ax1.tick_params('y', color="b")
+        ax1.set_xlabel("Explained Variance Ratio")
+        ax1.set_ylabel("Covariance Matrix Condition Number")
+        ax1.tick_params('y')
+
+        # Draw horizontal line for threshold L2 condition number
+        ax1.axhline(y=100, color='g', linestyle='--', label="Threshold L2 condition number")
 
         ax2 = ax1.twinx()
 
         # Plot the second dataset on the right y-axis
-        ax2.plot(evr_range, dof, 'r-o', label="dof")
-        ax2.set_ylabel('Degrees of freedom', color="r")
-        ax2.tick_params('y', color="r")
-        # ax1.legend()
-        # ax2.legend()
+        ax2.plot(evr_range, dof, color="r", label="Degrees of freedom")
+        ax2.set_ylabel('Degrees of freedom')
+        ax2.tick_params('y')
+        
         lines1, labels1 = ax1.get_legend_handles_labels()
         lines2, labels2 = ax2.get_legend_handles_labels()
         lines = lines1 + lines2
         labels = labels1 + labels2
         ax1.legend(lines, labels, loc='upper left')
 
-
         yield fig

From 72ad203123d351dc68113d2e4ab1247ab9bc2da4 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 14:44:28 +0000
Subject: [PATCH 13/61] added l2 condition number plot

---
 validphys2/examples/pca_bias_variance_report.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 5c355a8092..e2f66f44fb 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -37,8 +37,12 @@ template_text: |
   PCA report for bias and variance of multiclosure fits.
   ------------------------------------------------------
 
+  ## Table of bias and variance
   {@table_bias_variance_datasets@}
 
+  ## L2 condition number
+  {@plot_l2_condition_number@}
+
 
 actions_:
   - report(main=true)
\ No newline at end of file

From 190911431cfad953b7f07e1d7c4226481b8e7c9d Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 15:17:09 +0000
Subject: [PATCH 14/61] removed unused functions pca_covmat,
 dataset_fits_gaussian_parameters

---
 .../multiclosure_inconsistent.py              | 49 -------------------
 .../multiclosure_inconsistent_output.py       |  2 +-
 2 files changed, 1 insertion(+), 50 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 899e50252b..66a3f15112 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -198,24 +198,6 @@ def compute_num_components(covariance_matrix, threshold=0.99):
     return num_components
 
 
-def pca_covmat(X, num_components):
-    """
-    given data X of shape (n,p), reduce its dimension to
-    (n,num_components) and return the covariance matrix
-    of the reduced data matrix.
-
-    Parameters
-    ----------
-
-    Returns
-    -------
-    """
-    pca = PCA(num_components)
-    X_reduced = pca.fit_transform(X)
-    covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1)
-    return covariance
-
-
 def calc_chi2_pca(pdf_cov, diff, num_components):
     """
     Computes the chi2 between pdf_cov and diff by first projecting
@@ -329,34 +311,3 @@ def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samp
     biases, variances, _ = dataset_fits_bias_variance_samples_pca
     sqrt_ratios = np.sqrt(biases / variances)
     return sqrt_ratios
-
-
-def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99):
-    """
-    returns parameters of multi gaussian distribution of replicas
-    and central replicas
-    """
-    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
-
-    # The dimensions here are (fit, data point, replica)
-    reps = np.asarray([th.error_members for th in closures_th])
-
-    # take mean across replicas - since we might have changed no. of reps
-    centrals = reps.mean(axis=2)
-
-    centrals_covmat = np.cov(centrals.T)
-    centrals_covmat = pca_covmat(
-        centrals, num_components=compute_num_components(centrals_covmat, threshold)
-    )
-    mean_centrals = np.mean(centrals, axis=0)
-
-    replicas_covmat = 0
-    for i in range(reps.shape[0]):
-        replicas_covmat = np.cov(reps[i, :, :])
-        replicas_covmat += pca_covmat(
-            reps[i, :, :].T, num_components=compute_num_components(replicas_covmat, threshold)
-        )
-    replicas_covmat /= reps.shape[0]
-    mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1)
-
-    return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index d15de1c3ae..ac211d9af6 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -307,7 +307,7 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90,
     threshold number (and the derived explained variance ratio) should be chosen so that
 
     relative error in output (inverse covmat) <= relative error in input (covmat) * condition number
-
+    Note that in a closure test the relative error in the covariance matrix is very small and only numerical.
 
     Parameters
     ----------

From 66f293fdbaa464dbf3f3b28abe0d9850bce5d68f Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 15:18:46 +0000
Subject: [PATCH 15/61] updated multiclosure analysis with l2 plots

---
 .../compareinconsistentclosuretemplates/pca_template.md      | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
index 6269098614..2f308bd701 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -1,2 +1,7 @@
 # PCA analysis
+
+## Table of bias and variance
 {@table_bias_variance_datasets@}
+
+## L2 condition number
+{@plot_l2_condition_number@}

From 90c553a47d5bc3e1fad2ecd10651dd533d2ed112 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 15:32:03 +0000
Subject: [PATCH 16/61] variance pdf has to be taken from the same fits

---
 .../multiclosure_analysis.yaml                                | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index dd4a2b92a5..33c790f42a 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -160,7 +160,7 @@ lambdavalues:
     t0pdfset: 210223-mw-000_fakepdf
     use_cuts: "internal"
     explained_variance_ratio: 0.99
-    variancepdf: 230708_mnc_dis_pt1_ct_1000
+    variancepdf: 230124_dis_ict_lam02_fs_122996
     fits:
       - 230124_dis_ict_lam02_fs_122996
       - 230124_dis_ict_lam02_fs_152326
@@ -200,7 +200,7 @@ lambdavalues:
     t0pdfset: 210223-mw-000_fakepdf
     use_cuts: "internal"
     explained_variance_ratio: 0.99
-    variancepdf: 230708_mnc_dis_pt1_ct_1000
+    variancepdf: 250124_dis_ict_lam04_fs_102661
     fits:
       - 250124_dis_ict_lam04_fs_102661
       - 250124_dis_ict_lam04_fs_166247

From d24545130786197259d07d1803c026560d941198 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 16:03:15 +0000
Subject: [PATCH 17/61] use np.nan and set default EVR to 0.99

---
 .../multiclosure_inconsistent.py                | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 66a3f15112..9ee3e189e6 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -37,7 +37,7 @@
 )
 
 
-def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93):
+def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
     """
     Compute the principal components of theory predictions replica matrix
     (Ndat x Nrep feature matrix).
@@ -62,21 +62,12 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia
     Returns
     -------
     tuple
-        2D tuple:
+        3D tuple:
         - matrix of the principal components (PCs) of shape (N_pc, N_dat)
         - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
+        - N_pc: number of principal components kept
 
     """
-    # fits_dataset_predictions = [
-    #     ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf
-    # ]
-
-    # dimensions here are (Nfits, Ndat, Nrep)
-    # reps = np.asarray([th.error_members for th in fits_dataset_predictions])
-
-    # reshape so as to get PCs from all the samples
-    # reps = reps.reshape(reps.shape[1],-1)
-
     # get replicas from variance fit, used to estimate variance
     reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
 
@@ -111,7 +102,7 @@ def principal_components_bias_variance_dataset(
     pc_basis, pc_reps, n_comp = principal_components_dataset
 
     if n_comp <= 1:
-        return None, None, n_comp
+        return np.nan, np.nan, n_comp
 
     # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
     covmat_pdf = np.cov(pc_reps)

From d09018a87ab246fa2d400803b4b2e6b7a62e1d5b Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 16:04:43 +0000
Subject: [PATCH 18/61] cleaned bias variance table

---
 .../multiclosure_inconsistent_output.py       | 101 ++++++++++--------
 1 file changed, 57 insertions(+), 44 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index ac211d9af6..009dbb07e8 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -6,6 +6,7 @@
 estimators in the space of data
 
 """
+
 import pandas as pd
 import numpy as np
 from scipy import stats
@@ -14,7 +15,9 @@
 from reportengine.table import table
 
 from validphys import plotutils
-from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset
+from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
+    principal_components_dataset,
+)
 
 
 @figuregen
@@ -24,7 +27,7 @@ def plot_bias_distribution_datasets(principal_components_bias_variance_datasets,
     """
     for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
         biases, variances, n_comp = pc_bias_var_dataset
-        
+
         try:
             sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances))
             vals = np.linspace(0, 3 * n_comp, 100)
@@ -39,13 +42,14 @@ def plot_bias_distribution_datasets(principal_components_bias_variance_datasets,
             ax.legend()
 
             yield fig
-        
+
         except:
             fig, ax = plotutils.subplots()
             ax.plot([], [], label=f"Dataset: {str(ds)}")
             ax.legend()
             yield fig
 
+
 @figuregen
 def plot_variance_distribution_datasets(
     principal_components_variance_distribution_datasets, each_dataset
@@ -83,22 +87,35 @@ def plot_variance_distribution_datasets(
             yield fig
 
 
-
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
-    TODO
+    Compute the bias, variance, ratio and sqrt(ratio) for each dataset
+    and return a DataFrame with the results.
+
+    Parameters
+    ----------
+
+    principal_components_bias_variance_datasets: list
+        List of tuples containing the values of bias, variance and number of degrees of freedom
+
+    each_dataset: list
+        List of validphys.core.DataSetSpec
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
     """
     records = []
     for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
         biases, variances, n_comp = pc_bias_var_dataset
-        
-        try:
-            bias = np.mean(biases)
-            variance = np.mean(variances)
-            rbv = bias / variance
-            sqrt_rbv = np.sqrt(bias / variance)
-            records.append(
+
+        bias = np.mean(biases)
+        variance = np.mean(variances)
+        rbv = bias / variance
+        sqrt_rbv = np.sqrt(bias / variance)
+        records.append(
             dict(
                 dataset=str(ds),
                 dof=n_comp,
@@ -108,26 +125,12 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
                 ratio_sqrt=sqrt_rbv,
             )
         )
-        
-        except:
-            records.append(
-            dict(
-                dataset=str(ds),
-                dof=n_comp,
-                bias=bias,
-                variance=variance,
-                ratio=np.nan,
-                ratio_sqrt=np.nan,
-            ))
-    
-        
-            
 
     df = pd.DataFrame.from_records(
-            records,
-            index="dataset",
-            columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
-        )
+        records,
+        index="dataset",
+        columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
+    )
     df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
 
     return df
@@ -296,11 +299,14 @@ def plot_multi_ratio_bias_variance_distribution_bootstrap(
     ax.legend()
     return fig
 
+
 @figuregen
-def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20):
+def plot_l2_condition_number(
+    each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20
+):
     """
     Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
-    The plot gives an idea of the stability of the covariance matrix as a function of the 
+    The plot gives an idea of the stability of the covariance matrix as a function of the
     exaplained variance ratio and hence the number of principal components used to reduce the dimensionality.
 
     The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this
@@ -313,35 +319,42 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90,
     ----------
     each_dataset : list
         List of datasets
-    
+
     fits_pdf: list
         list of validphys.core.PDF objects
-    
+
     variancepdf: validphys.core.PDF
         PDF object for the variance
-    
+
     Yields
     ------
     fig
         Figure object
     """
-    
+
     # Explained variance ratio range
     evr_range = np.linspace(evr_min, evr_max, evr_n)
-    
+
     for ds in each_dataset:
         l2_cond = []
         dof = []
 
         for evr in evr_range:
-            _, pc_reps, n_comp = principal_components_dataset(ds, fits_pdf, variancepdf, explained_variance_ratio=evr)
-            
+            _, pc_reps, n_comp = principal_components_dataset(
+                ds, fits_pdf, variancepdf, explained_variance_ratio=evr
+            )
+
             covmat_pdf = np.cov(pc_reps)
-            
-            # compute condition number
-            l2_cond.append(np.linalg.cond(covmat_pdf))
+
+            # if the number of principal components is 1 then the covariance matrix is a scalar
+            # and the condition number is not computed
+            if n_comp <= 1:
+                l2_cond.append(np.nan)
+            else:
+                l2_cond.append(np.linalg.cond(covmat_pdf))
+
             dof.append(n_comp)
-            
+
         fig, ax1 = plotutils.subplots()
         ax1.plot(evr_range, l2_cond, label="L2 Condition Number")
         ax1.set_title(f"Dataset: {str(ds)}")
@@ -358,7 +371,7 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90,
         ax2.plot(evr_range, dof, color="r", label="Degrees of freedom")
         ax2.set_ylabel('Degrees of freedom')
         ax2.tick_params('y')
-        
+
         lines1, labels1 = ax1.get_legend_handles_labels()
         lines2, labels2 = ax2.get_legend_handles_labels()
         lines = lines1 + lines2

From d9b94b89ed422cbf427e5a6db358ff69d64ecbb8 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 16:13:06 +0000
Subject: [PATCH 19/61] added docs to func + removed sklearn preprocessing as
 unused

---
 .../multiclosure_inconsistent.py              | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 9ee3e189e6..89e4d554d1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 from sklearn.decomposition import PCA
-from sklearn import preprocessing
 
 from validphys import covmats
 from validphys.calcutils import calc_chi2
@@ -71,13 +70,13 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia
     # get replicas from variance fit, used to estimate variance
     reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
 
-    # rescale feature matrix
-    reps_scaled = reps  # preprocessing.scale(reps)
+    # something that could be tested: rescale feature matrix
+    # reps_scaled = reps.preprocessing.scale(reps)
 
     # choose number of principal components (PCs) based on explained variance ratio
     n_comp = 1
     for _ in range(reps.shape[0]):
-        pca = PCA(n_comp).fit(reps_scaled.T)
+        pca = PCA(n_comp).fit(reps.T)
         if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
             break
         n_comp += 1
@@ -92,7 +91,24 @@ def principal_components_bias_variance_dataset(
     internal_multiclosure_dataset_loader, principal_components_dataset
 ):
     """
-    TODO
+    Compute the bias and variance for one datasets
+    using the principal component reduced covariance matrix.
+    
+    Parameters
+    ----------
+    internal_multiclosure_dataset_loader : tuple
+        Tuple containing the results of multiclosure fits
+    
+    principal_components_dataset : tuple
+        3D tuple containing the principal components of the theory predictions    
+
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_comp: number of principal components kept
     """
 
     closures_th, law_th, _, _ = internal_multiclosure_dataset_loader

From acf3f91f3267fc7856a53fcd3892ca522c448791 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 5 Mar 2024 16:27:38 +0000
Subject: [PATCH 20/61] added single point dataset to example report

---
 validphys2/examples/pca_bias_variance_report.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index e2f66f44fb..9c621b20d2 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -5,6 +5,7 @@ meta:
 
 
 dataset_inputs:
+  - {dataset: ATLAS_SINGLETOP_7TEV_TCHANNEL-XSEC, variant: legacy}
   - {dataset: HERA_NC_225GEV_EP-SIGMARED, variant: legacy}
   - {dataset: HERA_NC_251GEV_EP-SIGMARED, variant: legacy}
   - {dataset: HERA_NC_300GEV_EP-SIGMARED, variant: legacy}
@@ -45,4 +46,4 @@ template_text: |
 
 
 actions_:
-  - report(main=true)
\ No newline at end of file
+  - report(main=true)

From 2de09b88fcbfa9dbf82b90a30a5d796fadc8aa82 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 11 Mar 2024 16:48:59 +0000
Subject: [PATCH 21/61] added bootstrap of bias distribution

---
 .../examples/pca_bias_variance_report.yaml    | 14 ++-
 .../multiclosure_inconsistent.py              | 96 ++++++++++++++++++-
 .../multiclosure_inconsistent_output.py       | 72 +++++++++++++-
 .../multiclosure_analysis.yaml                | 11 ++-
 .../multiclosure_template.md                  |  4 +
 .../pca_template.md                           |  6 ++
 6 files changed, 191 insertions(+), 12 deletions(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 9c621b20d2..209ad06c58 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -28,10 +28,13 @@ t0pdfset: 210223-mw-000_fakepdf
 explained_variance_ratio: 0.99
 
 fits:
-  - 230124_dis_ict_lam02_fs_122996
-  - 230124_dis_ict_lam02_fs_196144
-  - 230124_dis_ict_lam02_fs_257767
-
+  - 250124_dis_ict_lam04_fs_833288
+  - 250124_dis_ict_lam04_fs_85029
+  - 250124_dis_ict_lam04_fs_856511
+  - 250124_dis_ict_lam04_fs_895365
+  - 250124_dis_ict_lam04_fs_902027
+  - 250124_dis_ict_lam04_fs_911964
+  - 250124_dis_ict_lam04_fs_99031
 
 template_text: |
 
@@ -43,7 +46,8 @@ template_text: |
 
   ## L2 condition number
   {@plot_l2_condition_number@}
-
+  
+  {@bootstrapped_bias_distribution@}
 
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 89e4d554d1..ffc351eea1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -93,14 +93,14 @@ def principal_components_bias_variance_dataset(
     """
     Compute the bias and variance for one datasets
     using the principal component reduced covariance matrix.
-    
+
     Parameters
     ----------
     internal_multiclosure_dataset_loader : tuple
         Tuple containing the results of multiclosure fits
-    
+
     principal_components_dataset : tuple
-        3D tuple containing the principal components of the theory predictions    
+        3D tuple containing the principal components of the theory predictions
 
     Returns
     -------
@@ -145,6 +145,96 @@ def principal_components_bias_variance_dataset(
 )
 
 
+def bootstrap_principal_components_bias_variance_dataset(
+    internal_multiclosure_dataset_loader,
+    principal_components_dataset,
+    n_bootstrap=100,
+    n_fits_bt=15,
+):
+    """
+    Compute the bias and variance for one datasets
+    using the principal component reduced covariance matrix.
+
+    Parameters
+    ----------
+    internal_multiclosure_dataset_loader : tuple
+        Tuple containing the results of multiclosure fits
+
+    principal_components_dataset : tuple
+        3D tuple containing the principal components of the theory predictions
+
+    n_bootstrap: int, default is 100
+        number of bootstrap
+
+    n_fits_bt: int, default is 15
+        number of fits to be randomly sampled in bootstrap
+
+
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_comp: number of principal components kept
+    """
+
+    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
+    pc_basis, pc_reps, n_comp = principal_components_dataset
+
+    if n_comp <= 1:
+        return np.nan, np.nan, n_comp
+
+    covmat_pdf = np.cov(pc_reps)
+
+    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+
+    # compute variance
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
+
+    reps = np.asarray([th.error_members for th in closures_th])
+
+    variances = []
+    for i in range(reps.shape[0]):
+        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+    variance = np.mean(variances)
+
+    # perform bootstrap on bias
+    bt_biases = []
+    for i in range(n_bootstrap):
+        rng = np.random.RandomState(seed=i)
+
+        # step1a: pick n_fits_bt closure tests at random (with replacement)
+        fit_idxs = rng.choice(len(closures_th), size=n_fits_bt, replace=True)
+        bt_closure_th = np.array(closures_th)[fit_idxs]
+
+        # step1b: from each closure test pick 100 replicas at random (with replacement)
+        rep_idxs = rng.choice(100, size=(n_fits_bt, 100), replace=True)
+        replicas = np.asarray(
+            [th.error_members[:, rep_idx] for th, rep_idx in zip(bt_closure_th, rep_idxs)]
+        )
+
+        # step2: compute delta bias on bootstrap sample
+        centrals = np.mean(replicas, axis=-1)
+        underlying = law_th.central_value
+
+        delta_bias = centrals - underlying[np.newaxis, :]
+        delta_bias = pc_basis @ delta_bias.T
+
+        bt_biases.append(np.mean(calc_chi2(sqrt_covmat_pdf, delta_bias)))
+
+    return np.asarray(bt_biases), variance, n_comp
+
+
+"""
+Bootstrapped bias for each dataset and fits with different inconsistency
+"""
+lambdavalues_bootstrap_principal_components_bias_variance_datasets = collect(
+    "bootstrap_principal_components_bias_variance_dataset", ("data", "lambdavalues")
+)
+
+
 def principal_components_variance_distribution_dataset(
     internal_multiclosure_dataset_loader, principal_components_dataset
 ):
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 009dbb07e8..130063c594 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -110,11 +110,17 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
     records = []
     for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
         biases, variances, n_comp = pc_bias_var_dataset
-
         bias = np.mean(biases)
         variance = np.mean(variances)
         rbv = bias / variance
+
+        # use gaussian uncertainty propagation
+        delta_rbv = np.sqrt(
+            ((1 / variance) * np.std(biases)) ** 2 + (bias / variance**2 * np.std(variances)) ** 2
+        )
         sqrt_rbv = np.sqrt(bias / variance)
+        delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv)
+
         records.append(
             dict(
                 dataset=str(ds),
@@ -122,19 +128,79 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
                 bias=bias,
                 variance=variance,
                 ratio=rbv,
+                error_ratio=delta_rbv,
                 ratio_sqrt=sqrt_rbv,
+                error_ratio_sqrt=delta_sqrt_rbv,
             )
         )
 
     df = pd.DataFrame.from_records(
         records,
         index="dataset",
-        columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"),
+        columns=(
+            "dataset",
+            "dof",
+            "bias",
+            "variance",
+            "ratio",
+            "error_ratio",
+            "ratio_sqrt",
+            "error_ratio_sqrt",
+        ),
     )
-    df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"]
+    df.columns = [
+        "dof",
+        "bias",
+        "variance",
+        "ratio",
+        "error ratio",
+        "sqrt(ratio)",
+        "error sqrt(ratio)",
+    ]
 
     return df
 
+@figuregen
+def lambdavalues_bootstrapped_bias_distribution(
+    lambdavalues_bootstrap_principal_components_bias_variance_datasets, each_dataset, lambdavalues
+):
+    """ """
+
+    for i, ds in enumerate(each_dataset):
+
+        n_lambda_values = len(
+            lambdavalues_bootstrap_principal_components_bias_variance_datasets
+        ) // len(each_dataset)
+
+        fig, ax = plotutils.subplots()
+
+        for j in range(n_lambda_values):
+
+            (biases, _, n_comp) = (
+                lambdavalues_bootstrap_principal_components_bias_variance_datasets[
+                    i + len(each_dataset) * j
+                ]
+            )
+
+            if n_comp == 1:
+                ax.set_title(f"dataset {str(ds)}")
+
+            else:
+                # adjust by expected value so that a comparison between identical datasets w. different
+                # inconsistency degree, and hence possibly different n_comp, is possible.
+                # a large deviation from zero indicates large inconsistency
+                biases -= n_comp
+                ax.hist(
+                    biases,
+                    alpha=0.5,
+                    density=True,
+                    label=f"{lambdavalues[j]['label']}, dof: {n_comp}",
+                )
+                ax.set_title(f"dataset {str(ds)}")
+                ax.legend()
+
+        yield fig
+
 
 @table
 def datasets_bias_variance_pca_table(
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index 33c790f42a..0964fc4cbc 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -146,7 +146,12 @@ compare_settings:
 
 ##################################################################
 #OTHER ANALYSIS SETTINGS
-
+fit: 230124_dis_ict_lam02_fs_122996
+theory:
+  from_: fit
+theoryid:
+  from_: theory
+use_cuts: "internal"
 lambdavalues:
   - label: "LAMBDA02"
     fit: 230124_dis_ict_lam02_fs_122996
@@ -240,5 +245,9 @@ single_point_report:
 pca_report:
   meta: Null
   template: pca_template.md
+multi_pca_report: 
+  meta: Null
+  template: multi_pca_template.md
+
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
index 7ad062bd09..3248c5422a 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
@@ -12,3 +12,7 @@ Bias Variance Ratio Table
 {@with lambdavalues@}
 [Lambda value of {@label@}]({@pca_report report@})
 {@endwith@}
+
+Bias Distribution
+-----------------
+{@multi_pca_report@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
index 2f308bd701..45170834ed 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -5,3 +5,9 @@
 
 ## L2 condition number
 {@plot_l2_condition_number@}
+
+## Bootstrapped bias distributions
+{@bootstrapped_bias_distribution@}
+
+## Fit comparison bootstrapped bias distributions
+{@lambdavalues_bootstrapped_bias_distribution@}
\ No newline at end of file

From 325ad8f7d9136c90881aceff258e9d5ab0726cc8 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 11 Mar 2024 18:14:36 +0000
Subject: [PATCH 22/61] added rbv plots as function of lambda

---
 .../multiclosure_inconsistent.py              | 90 -------------------
 .../multiclosure_inconsistent_output.py       | 67 +++++++-------
 .../multiclosure_analysis.yaml                |  5 +-
 .../multiclosure_template.md                  |  4 -
 .../pca_template.md                           |  9 +-
 5 files changed, 42 insertions(+), 133 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index ffc351eea1..c997e3eb29 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -145,96 +145,6 @@ def principal_components_bias_variance_dataset(
 )
 
 
-def bootstrap_principal_components_bias_variance_dataset(
-    internal_multiclosure_dataset_loader,
-    principal_components_dataset,
-    n_bootstrap=100,
-    n_fits_bt=15,
-):
-    """
-    Compute the bias and variance for one datasets
-    using the principal component reduced covariance matrix.
-
-    Parameters
-    ----------
-    internal_multiclosure_dataset_loader : tuple
-        Tuple containing the results of multiclosure fits
-
-    principal_components_dataset : tuple
-        3D tuple containing the principal components of the theory predictions
-
-    n_bootstrap: int, default is 100
-        number of bootstrap
-
-    n_fits_bt: int, default is 15
-        number of fits to be randomly sampled in bootstrap
-
-
-    Returns
-    -------
-    tuple
-        3D tuple:
-        - biases: 1-D array of shape (Nfits,)
-        - variances: 1-D array of shape (Nfits, )
-        - n_comp: number of principal components kept
-    """
-
-    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
-    pc_basis, pc_reps, n_comp = principal_components_dataset
-
-    if n_comp <= 1:
-        return np.nan, np.nan, n_comp
-
-    covmat_pdf = np.cov(pc_reps)
-
-    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
-
-    # compute variance
-    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
-
-    reps = np.asarray([th.error_members for th in closures_th])
-
-    variances = []
-    for i in range(reps.shape[0]):
-        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
-        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
-    variance = np.mean(variances)
-
-    # perform bootstrap on bias
-    bt_biases = []
-    for i in range(n_bootstrap):
-        rng = np.random.RandomState(seed=i)
-
-        # step1a: pick n_fits_bt closure tests at random (with replacement)
-        fit_idxs = rng.choice(len(closures_th), size=n_fits_bt, replace=True)
-        bt_closure_th = np.array(closures_th)[fit_idxs]
-
-        # step1b: from each closure test pick 100 replicas at random (with replacement)
-        rep_idxs = rng.choice(100, size=(n_fits_bt, 100), replace=True)
-        replicas = np.asarray(
-            [th.error_members[:, rep_idx] for th, rep_idx in zip(bt_closure_th, rep_idxs)]
-        )
-
-        # step2: compute delta bias on bootstrap sample
-        centrals = np.mean(replicas, axis=-1)
-        underlying = law_th.central_value
-
-        delta_bias = centrals - underlying[np.newaxis, :]
-        delta_bias = pc_basis @ delta_bias.T
-
-        bt_biases.append(np.mean(calc_chi2(sqrt_covmat_pdf, delta_bias)))
-
-    return np.asarray(bt_biases), variance, n_comp
-
-
-"""
-Bootstrapped bias for each dataset and fits with different inconsistency
-"""
-lambdavalues_bootstrap_principal_components_bias_variance_datasets = collect(
-    "bootstrap_principal_components_bias_variance_dataset", ("data", "lambdavalues")
-)
-
-
 def principal_components_variance_distribution_dataset(
     internal_multiclosure_dataset_loader, principal_components_dataset
 ):
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 130063c594..612c0cf0b4 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -13,6 +13,7 @@
 
 from reportengine.figure import figure, figuregen
 from reportengine.table import table
+from reportengine import collect
 
 from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
@@ -160,44 +161,50 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
 
     return df
 
+
+lambdavalues_table_bias_variance_datasets = collect(
+    "table_bias_variance_datasets", ("lambdavalues",)
+)
+
+
 @figuregen
-def lambdavalues_bootstrapped_bias_distribution(
-    lambdavalues_bootstrap_principal_components_bias_variance_datasets, each_dataset, lambdavalues
+def plot_lambdavalues_bias_variance_values(
+    lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset
 ):
-    """ """
+    """
+    Plot sqrt of ratio bias variance as a function of lambda for each dataset.
 
-    for i, ds in enumerate(each_dataset):
+    Parameters
+    ----------
+    lambdavalues_table_bias_variance_datasets: list
+        list of data frames computed as per table_bias_variance_datasets.
 
-        n_lambda_values = len(
-            lambdavalues_bootstrap_principal_components_bias_variance_datasets
-        ) // len(each_dataset)
+    lambdavalues: list
+        list specified in multiclosure_analysis.yaml
 
-        fig, ax = plotutils.subplots()
+    each_dataset: list
+        list of datasets
 
-        for j in range(n_lambda_values):
+    Yields
+    ------
+    figure
+    """
 
-            (biases, _, n_comp) = (
-                lambdavalues_bootstrap_principal_components_bias_variance_datasets[
-                    i + len(each_dataset) * j
-                ]
+    for ds in each_dataset:
+        fig, ax = plotutils.subplots()
+        for i, lambdavalue in enumerate(lambdavalues):
+            df = lambdavalues_table_bias_variance_datasets[i]
+            df = df[df.index == str(ds)]
+
+            ax.errorbar(
+                lambdavalue["lambda_value"],
+                df["sqrt(ratio)"].values,
+                yerr=df["error sqrt(ratio)"].values,
+                color="blue",
             )
-
-            if n_comp == 1:
-                ax.set_title(f"dataset {str(ds)}")
-
-            else:
-                # adjust by expected value so that a comparison between identical datasets w. different
-                # inconsistency degree, and hence possibly different n_comp, is possible.
-                # a large deviation from zero indicates large inconsistency
-                biases -= n_comp
-                ax.hist(
-                    biases,
-                    alpha=0.5,
-                    density=True,
-                    label=f"{lambdavalues[j]['label']}, dof: {n_comp}",
-                )
-                ax.set_title(f"dataset {str(ds)}")
-                ax.legend()
+            ax.set_ylabel(r"$R_{bv}$")
+            ax.set_xlabel(r"$\lambda$")
+        ax.set_title(f"Ratio bias variance: {str(ds)}")
 
         yield fig
 
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index 0964fc4cbc..65dfaf3cfd 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -154,6 +154,7 @@ theoryid:
 use_cuts: "internal"
 lambdavalues:
   - label: "LAMBDA02"
+    lambda_value: 0.2
     fit: 230124_dis_ict_lam02_fs_122996
     dataset_inputs:
       from_: fit
@@ -194,6 +195,7 @@ lambdavalues:
       - 230124_dis_ict_lam02_fs_916690
       - 230124_dis_ict_lam02_fs_966390
   - label: "LAMBDA04"
+    lambda_value: 0.4
     fit: 250124_dis_ict_lam04_fs_102661
     dataset_inputs:
       from_: fit
@@ -245,9 +247,6 @@ single_point_report:
 pca_report:
   meta: Null
   template: pca_template.md
-multi_pca_report: 
-  meta: Null
-  template: multi_pca_template.md
 
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
index 3248c5422a..7ad062bd09 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
@@ -12,7 +12,3 @@ Bias Variance Ratio Table
 {@with lambdavalues@}
 [Lambda value of {@label@}]({@pca_report report@})
 {@endwith@}
-
-Bias Distribution
------------------
-{@multi_pca_report@}
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
index 45170834ed..7fba0826e8 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -3,11 +3,8 @@
 ## Table of bias and variance
 {@table_bias_variance_datasets@}
 
+## Ratio bias variance vs Lambda
+{@plot_lambdavalues_bias_variance_values@}
+
 ## L2 condition number
 {@plot_l2_condition_number@}
-
-## Bootstrapped bias distributions
-{@bootstrapped_bias_distribution@}
-
-## Fit comparison bootstrapped bias distributions
-{@lambdavalues_bootstrapped_bias_distribution@}
\ No newline at end of file

From 95d608e6fe9b9d80ee194a898e4617ef7be4def2 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 12 Mar 2024 10:09:45 +0000
Subject: [PATCH 23/61] added ratio bias variance to multi closure report

---
 .../multiclosure_analysis.yaml                | 24 ++++++-------------
 .../multiclosure_template.md                  |  3 +++
 .../pca_template.md                           |  3 ---
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index 65dfaf3cfd..cf27fa9796 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -146,25 +146,19 @@ compare_settings:
 
 ##################################################################
 #OTHER ANALYSIS SETTINGS
+
 fit: 230124_dis_ict_lam02_fs_122996
 theory:
   from_: fit
 theoryid:
   from_: theory
-use_cuts: "internal"
+use_cuts: internal
+
 lambdavalues:
   - label: "LAMBDA02"
     lambda_value: 0.2
-    fit: 230124_dis_ict_lam02_fs_122996
-    dataset_inputs:
-      from_: fit
-    theory:
-      from_: fit
-    theoryid:
-      from_: theory
     use_t0: True
     t0pdfset: 210223-mw-000_fakepdf
-    use_cuts: "internal"
     explained_variance_ratio: 0.99
     variancepdf: 230124_dis_ict_lam02_fs_122996
     fits:
@@ -196,16 +190,8 @@ lambdavalues:
       - 230124_dis_ict_lam02_fs_966390
   - label: "LAMBDA04"
     lambda_value: 0.4
-    fit: 250124_dis_ict_lam04_fs_102661
-    dataset_inputs:
-      from_: fit
-    theory:
-      from_: fit
-    theoryid:
-      from_: theory
     use_t0: True
     t0pdfset: 210223-mw-000_fakepdf
-    use_cuts: "internal"
     explained_variance_ratio: 0.99
     variancepdf: 250124_dis_ict_lam04_fs_102661
     fits:
@@ -247,6 +233,10 @@ single_point_report:
 pca_report:
   meta: Null
   template: pca_template.md
+ratio_bias_variance_report:
+  meta: Null
+  template: ratio_biasvar_template.md
 
+ 
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
index 7ad062bd09..aff62f9182 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md
@@ -12,3 +12,6 @@ Bias Variance Ratio Table
 {@with lambdavalues@}
 [Lambda value of {@label@}]({@pca_report report@})
 {@endwith@}
+
+
+[Ratio bias variance vs Lambda]({@ratio_bias_variance_report report@})
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
index 7fba0826e8..2f308bd701 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -3,8 +3,5 @@
 ## Table of bias and variance
 {@table_bias_variance_datasets@}
 
-## Ratio bias variance vs Lambda
-{@plot_lambdavalues_bias_variance_values@}
-
 ## L2 condition number
 {@plot_l2_condition_number@}

From 070e3f6a9beb0e30438468588c2fe30a9bf5d476 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 12 Mar 2024 10:27:35 +0000
Subject: [PATCH 24/61] forgot ratio_bias_variance template

---
 .../ratio_biasvar_template.md                                   | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md

diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md
new file mode 100644
index 0000000000..d4eda2c575
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md
@@ -0,0 +1,2 @@
+## Ratio Bias Variance vs Lambda
+{@plot_lambdavalues_bias_variance_values@}

From 70792926097aa3e7af00ef31d906589506814444 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 12 Mar 2024 21:19:48 +0000
Subject: [PATCH 25/61] fmt = 'o' for ratio bias variance plot

---
 .../inconsistent_closuretest/multiclosure_inconsistent_output.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 612c0cf0b4..337dc6a3e1 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -201,6 +201,7 @@ def plot_lambdavalues_bias_variance_values(
                 df["sqrt(ratio)"].values,
                 yerr=df["error sqrt(ratio)"].values,
                 color="blue",
+                fmt='o',
             )
             ax.set_ylabel(r"$R_{bv}$")
             ax.set_xlabel(r"$\lambda$")

From 49d0484960fd3f900152b3e4a786a78f29dc3711 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Fri, 15 Mar 2024 12:07:43 +0000
Subject: [PATCH 26/61] added functions for computing Rbv using old definition
 + runcard with table

---
 .../src/validphys/closuretest/multiclosure.py |  6 ++
 .../closuretest/multiclosure_output.py        | 59 +++++++++++++++++++
 .../runcards/old_rbv_definition.yaml          | 56 ++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml

diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index ff71e3caf0..a086d97d91 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -181,6 +181,12 @@ def fits_normed_dataset_central_delta(
         # variances.shape = (n_fits, n_obs_cut/uncut, reps)
     return np.asarray(deltas)
 
+
+fits_datasets_bias_variance = collect(
+    "fits_dataset_bias_variance", ("data",)
+)
+
+
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
     """For a given dataset calculate the expected bias and variance across fits
     then return tuple (expected bias, expected variance, n_data)
diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py
index d7be585cd7..a9becee214 100644
--- a/validphys2/src/validphys/closuretest/multiclosure_output.py
+++ b/validphys2/src/validphys/closuretest/multiclosure_output.py
@@ -60,6 +60,65 @@ def plot_total_fits_bias_variance(fits_total_bias_variance):
     """
     return plot_dataset_fits_bias_variance(fits_total_bias_variance, "all data")
 
+@table
+def table_datasets_bias_variance_fits(fits_datasets_bias_variance, each_dataset):
+    """
+    Table with ratio bias variance value and associated uncertainty
+    computed with simple gaussian error propagation for each dataset.
+    """
+    records = []
+    # compute expectation and uncertainty on bias variance ratio
+    for ds, (bias, var, ndata) in zip(each_dataset, fits_datasets_bias_variance):
+        mean_bias = np.mean(bias)
+        mean_var = np.mean(var)
+        rbv = mean_bias / mean_var
+
+        # compute uncertainy on rbv
+        delta_rbv = np.sqrt(
+            ((1 / mean_var) * np.std(bias)) ** 2 + (mean_bias / mean_var**2 * np.std(var)) ** 2
+        )
+        sqrt_rbv = np.sqrt(rbv)
+        delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv)
+
+        records.append(
+            dict(
+                dataset=str(ds),
+                ndata=ndata,
+                bias=mean_bias,
+                variance=mean_var,
+                ratio=rbv,
+                error_ratio=delta_rbv,
+                ratio_sqrt=sqrt_rbv,
+                error_ratio_sqrt=delta_sqrt_rbv,
+            )
+        )
+    
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=(
+            "dataset",
+            "ndata",
+            "bias",
+            "variance",
+            "ratio",
+            "error_ratio",
+            "ratio_sqrt",
+            "error_ratio_sqrt",
+        ),
+    )
+    df.columns = [
+        "ndata",
+        "bias",
+        "variance",
+        "ratio",
+        "error ratio",
+        "sqrt(ratio)",
+        "error sqrt(ratio)",
+    ]
+
+    return df
+
 
 @table
 def datasets_bias_variance_ratio(datasets_expected_bias_variance, each_dataset):
diff --git a/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml b/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml
new file mode 100644
index 0000000000..15de93bfdf
--- /dev/null
+++ b/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml
@@ -0,0 +1,56 @@
+meta:
+  title: multiclosure report using old definition of Rbv 
+  author: Mark N. Costantini
+  keywords: [multiclosure, python, consistent]
+
+fits: 
+  - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295
+  - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750
+  - 25_5_2023_19_50_27_dis_pt1_mnc_commit_4d5d473c_filterseed_754299
+  - 25_5_2023_19_51_19_dis_pt1_mnc_commit_4d5d473c_filterseed_361864
+  - 25_5_2023_19_52_54_dis_pt1_mnc_commit_4d5d473c_filterseed_516909
+  - 25_5_2023_19_52_7_dis_pt1_mnc_commit_4d5d473c_filterseed_539528
+  - 25_5_2023_19_53_41_dis_pt1_mnc_commit_4d5d473c_filterseed_307907
+  - 25_5_2023_19_54_28_dis_pt1_mnc_commit_4d5d473c_filterseed_279829
+  - 25_5_2023_19_55_19_dis_pt1_mnc_commit_4d5d473c_filterseed_162790
+  - 25_5_2023_19_56_54_dis_pt1_mnc_commit_4d5d473c_filterseed_6801
+  - 25_5_2023_19_56_7_dis_pt1_mnc_commit_4d5d473c_filterseed_625729
+  - 25_5_2023_19_57_42_dis_pt1_mnc_commit_4d5d473c_filterseed_327255
+  - 25_5_2023_19_58_39_dis_pt1_mnc_commit_4d5d473c_filterseed_156246
+  - 25_5_2023_19_59_29_dis_pt1_mnc_commit_4d5d473c_filterseed_385085
+  - 25_5_2023_20_0_23_dis_pt1_mnc_commit_4d5d473c_filterseed_780441
+  - 25_5_2023_20_1_12_dis_pt1_mnc_commit_4d5d473c_filterseed_155517
+  - 25_5_2023_20_2_0_dis_pt1_mnc_commit_4d5d473c_filterseed_663052
+  - 25_5_2023_20_2_47_dis_pt1_mnc_commit_4d5d473c_filterseed_387320
+  - 25_5_2023_20_3_34_dis_pt1_mnc_commit_4d5d473c_filterseed_277058
+  - 25_5_2023_20_4_24_dis_pt1_mnc_commit_4d5d473c_filterseed_347679
+  - 25_5_2023_20_5_11_dis_pt1_mnc_commit_4d5d473c_filterseed_455195
+  - 25_5_2023_20_6_1_dis_pt1_mnc_commit_4d5d473c_filterseed_191318
+  - 25_5_2023_20_6_48_dis_pt1_mnc_commit_4d5d473c_filterseed_527779
+  - 25_5_2023_20_7_35_dis_pt1_mnc_commit_4d5d473c_filterseed_974809
+
+# the fits have the same datasets
+fit: 25_5_2023_20_7_35_dis_pt1_mnc_commit_4d5d473c_filterseed_974809 
+
+theoryid: 200
+
+use_cuts: internal
+
+dataset_inputs: 
+  from_: fit
+
+t0pdfset: NNPDF40_nnlo_as_01180
+use_t0: True
+
+template_text:  |
+  Report for multiclosure fit using old (2111.05787) definition of Rbv
+  ---------------------------------------------------------------------
+
+  Table
+  =====
+
+  {@table_datasets_bias_variance_fits@}
+  
+
+actions_:
+  - report(main=True)

From 9d5f63ebd98dfdcb4c3bd38d91786bf8081b4a92 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Sun, 17 Mar 2024 16:50:58 +0000
Subject: [PATCH 27/61] Piazza pulita 1: removed old plotting functions that
 were never used

---
 .../multiclosure_inconsistent.py              | 160 -----------------
 .../multiclosure_inconsistent_output.py       | 166 +-----------------
 2 files changed, 1 insertion(+), 325 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index c997e3eb29..60e2b53a68 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -18,23 +18,6 @@
 from reportengine import collect
 
 
-""" To load several multiclosure fits. Useful for inconsistent closure test analysis """
-multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",))
-
-multi_dataset_fits_bias_replicas_variance_samples = collect(
-    "dataset_fits_bias_replicas_variance_samples", ("dataspecs",)
-)
-
-multi_fits_bootstrap_dataset_bias_variance = collect(
-    "fits_bootstrap_dataset_bias_variance", ("dataspecs",)
-)
-
-multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",))
-
-multi_dataset_fits_bias_variance_samples_pca = collect(
-    "dataset_fits_bias_variance_samples_pca", ("dataspecs",)
-)
-
 
 def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
     """
@@ -175,146 +158,3 @@ def principal_components_variance_distribution_dataset(
 principal_components_variance_distribution_datasets = collect(
     "principal_components_variance_distribution_dataset", ("data",)
 )
-
-
-def compute_num_components(covariance_matrix, threshold=0.99):
-    """
-    Compute the number of principal components to keep based on a desired explained variance threshold.
-
-    Parameters
-    ----------
-    covariance_matrix : np.ndarray, 2-D array
-
-    threshold : (float): Desired explained variance threshold (between 0 and 1).
-
-    Returns
-    -------
-    int
-        num_components: Number of principal components to keep.
-
-    """
-    eig_val, eig_vec = np.linalg.eig(covariance_matrix)
-    idx = eig_val.argsort()[::-1]
-    eig_val = eig_val[idx]
-    eig_vec = eig_vec[:, idx]
-
-    cumulative_sum = np.cumsum(eig_val)
-    total_sum = np.sum(eig_val)
-    num_components = np.argmin(np.abs(cumulative_sum / total_sum - threshold))
-
-    return num_components
-
-
-def calc_chi2_pca(pdf_cov, diff, num_components):
-    """
-    Computes the chi2 between pdf_cov and diff by first projecting
-    them into num_components PCs.
-
-    Parameters
-    ----------
-    pdf_cov: np.ndarray
-
-    diff: np.ndarray
-
-    num_components: int
-
-    Returns
-    -------
-    float or np.ndarray depending on input
-
-    """
-    # Diagonalize the matrix
-    L, W = np.linalg.eig(pdf_cov)
-
-    # Sort the eigenvalues and eigenvectors from largest to smallest
-    idx = L.argsort()[::-1]
-    L = L[idx]
-    W = W[:, idx]
-
-    # Keep only the n_components largest eigenvectors
-    Wtilde = W[:, :num_components]
-
-    # Transform initial covariance matrix
-    pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real
-
-    # transform data
-    diff_pca = diff.T @ Wtilde
-
-    try:
-        sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca)
-    except:
-        raise ValueError(
-            f"PCA Covariance Matrix cannot be Cholesky decomposed, perhaps less than {num_components} PC should be kept"
-        )
-
-    return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T))
-
-
-def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99):
-    """
-    similar to `dataset_fits_bias_replicas_variance_samples`.
-
-    Returns
-    -------
-    tuple
-        3D tuple:
-        - biases: 1-D array of shape (Nfits,)
-        - variances: 1-D array of shape (Nfits, )
-        - n_eig: number of eigenvalues kept in PCA, i.e.,
-          ndata in the new, lower dimensional, space.
-    Note that we return Nfits values of the variance so that computing the
-    Bias - Variance ratio is straightforward.
-    """
-    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
-
-    # The dimensions here are (fit, data point, replica)
-    reps = np.asarray([th.error_members for th in closures_th])
-
-    # take mean across replicas - since we might have changed no. of reps
-    centrals = reps.mean(axis=2)
-
-    # compute the PDF covariance matrix of the central samples
-    if centrals.shape[0] <= 1:
-        raise ValueError(f"Need more than one fit to compute the 'Bias' PDF covariance Matrix")
-
-    pdf_cov_bias = np.cov(centrals.T)
-
-    # find number of (ordered) eigenvalues that explain 99% of the total variance (total sum of eigenvalues)
-    n_eig = compute_num_components(pdf_cov_bias, threshold)
-
-    # compute bias from PCs
-    diffs_bias = centrals.T - law_th.central_value[:, np.newaxis]
-    biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig)
-
-    # compute variances from PCs
-    variances = []
-
-    # loop over fits to compute variances
-    for i in range(reps.shape[0]):
-        diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
-        pdf_cov_var = np.cov(reps[i, :, :])
-
-        variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig)))
-
-    return biases, np.asarray(variances), n_eig
-
-
-def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
-    """
-    returns the bias and variance expected values as well as the number of
-    principal components.
-    """
-    biases, variances, n_eig = dataset_fits_bias_variance_samples_pca
-    return np.mean(biases), np.mean(variances), n_eig
-
-
-expected_datasets_fits_bias_variance_samples_pca = collect(
-    "expected_dataset_fits_bias_variance_samples_pca", ("data",)
-)
-
-
-def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca):
-    """ """
-    biases, variances, _ = dataset_fits_bias_variance_samples_pca
-    sqrt_ratios = np.sqrt(biases / variances)
-    return sqrt_ratios
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 337dc6a3e1..5fe25d4eef 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -11,7 +11,7 @@
 import numpy as np
 from scipy import stats
 
-from reportengine.figure import figure, figuregen
+from reportengine.figure import figuregen
 from reportengine.table import table
 from reportengine import collect
 
@@ -210,170 +210,6 @@ def plot_lambdavalues_bias_variance_values(
         yield fig
 
 
-@table
-def datasets_bias_variance_pca_table(
-    expected_datasets_fits_bias_variance_samples_pca, each_dataset
-):
-    """For each dataset calculate the expected bias and expected variance computed by
-    first projecting the covariance matrix into a lower dimensional space trough PCA.
-
-    """
-    records = []
-    for ds, (bias, var, ndata) in zip(
-        each_dataset, expected_datasets_fits_bias_variance_samples_pca
-    ):
-        records.append(
-            dict(
-                dataset=str(ds),
-                ndata=ndata,
-                bias=bias,
-                variance=var,
-                ratio=bias / var,
-                ratio_sqrt=np.sqrt(bias / var),
-            )
-        )
-    df = pd.DataFrame.from_records(
-        records,
-        index="dataset",
-        columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"),
-    )
-    df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"]
-    return df
-
-
-@figure
-def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca):
-    """"""
-    sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca
-    fig, ax = plotutils.subplots()
-    ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio")
-    ax.legend()
-    return fig
-
-
-@figure
-def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
-    """
-    histogram of the distribution of the variances (k) defined as the
-    distance between central replica and replica (k) in units of the
-    experimental covariance matrix.
-    If more than one group of dataspecs (e.g. consistent and inconsistent)
-    fits are defined, than plot comparison of these.
-    """
-    fig, ax = plotutils.subplots()
-    for (_, variance_dist, _), spec in zip(
-        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
-    ):
-        label = spec['speclabel']
-
-        ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label)
-
-    ax.legend()
-    return fig
-
-
-@figure
-def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
-    """
-    like `plot_variance_distribution_multi`, but for variance computed with the covariance matrix
-    predicted by the PDFs (and reduced with PCA).
-    """
-    return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
-
-
-@figure
-def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs):
-    """
-    histogram of the distribution of the biases (l) defined as the
-    distance between central replica and underlying law in units of the
-    experimental covariance matrix.
-    If more than one group of dataspecs (e.g. consistent and inconsistent)
-    fits are defined, than plot comparison of these.
-    """
-    fig, ax = plotutils.subplots()
-    for (bias_dist, _, _), spec in zip(
-        multi_dataset_fits_bias_replicas_variance_samples, dataspecs
-    ):
-        label = spec['speclabel']
-
-        ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label)
-
-    ax.legend()
-    return fig
-
-
-@figure
-def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
-    """
-    like `plot_bias_distribution_multi`, but for variance computed with the covariance matrix
-    predicted by the PDFs (and reduced with PCA).
-    """
-    return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs)
-
-
-@figure
-def plot_sqrt_ratio_distribution_pca(multi_dataset_fits_bias_variance_samples_pca, dataspecs):
-    """
-    histogram of the distribution of the sqrt ratio of bias and variance computed with
-    the estimated "PDF" covariance matrix (reduced with PCA).
-    If more than one group of dataspecs (e.g. consistent and inconsistent)
-    fits are defined, than plot comparison of these.
-    """
-    fig, ax = plotutils.subplots()
-    ratios = []
-    labels = []
-    for (bias_dist, variance_dist, _), spec in zip(
-        multi_dataset_fits_bias_variance_samples_pca, dataspecs
-    ):
-        labels.append(spec['speclabel'])
-        ratios.append(bias_dist / variance_dist)
-
-    ax.hist(ratios, bins='auto', density=True, label=labels)
-    ax.legend()
-    return fig
-
-
-@figure
-def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
-    """
-    histogram of the distribution of the biases obtained by bootstrapping with
-    `fits_bootstrap_dataset_bias_variance` over the existing fits.
-
-    If more than one group of dataspecs (e.g. consistent and inconsistent)
-    fits are defined, than plot comparison of these.
-    """
-    fig, ax = plotutils.subplots()
-    for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
-        label = spec['speclabel']
-
-        ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label)
-
-    ax.legend()
-    return fig
-
-
-@figure
-def plot_multi_ratio_bias_variance_distribution_bootstrap(
-    multi_fits_bootstrap_dataset_bias_variance, dataspecs
-):
-    """
-    histogram of the ratio bias variance distribution obtained by bootstrapping bias
-    and variance with `fits_bootstrap_dataset_bias_variance`.
-
-    If more than one group of dataspecs (e.g. consistent and inconsistent)
-    fits are defined, than plot comparison of these.
-    """
-    fig, ax = plotutils.subplots()
-    for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs):
-        ratio = bias / variance
-        label = spec['speclabel']
-
-        ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label)
-
-    ax.legend()
-    return fig
-
-
 @figuregen
 def plot_l2_condition_number(
     each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20

From 3644cb8a143105ea359b16dcf9b78d4b5ec8dfb9 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Sun, 17 Mar 2024 16:56:37 +0000
Subject: [PATCH 28/61] Piazza pulita 2: removed
 principal_components_variance_distribution_dataset as we are not using it

---
 .../multiclosure_inconsistent.py              | 32 ---------
 .../multiclosure_inconsistent_output.py       | 68 -------------------
 2 files changed, 100 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 60e2b53a68..ba04f5d09a 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -126,35 +126,3 @@ def principal_components_bias_variance_dataset(
 principal_components_bias_variance_datasets = collect(
     "principal_components_bias_variance_dataset", ("data",)
 )
-
-
-def principal_components_variance_distribution_dataset(
-    internal_multiclosure_dataset_loader, principal_components_dataset
-):
-    """
-    TODO
-    """
-
-    closures_th, _, _, _ = internal_multiclosure_dataset_loader
-
-    reps = np.asarray([th.error_members for th in closures_th])
-
-    pc_basis, pc_reps, n_comp = principal_components_dataset
-
-    if n_comp <= 1:
-        return None, n_comp
-    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
-    covmat_pdf = np.cov(pc_reps)
-    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
-
-    variances = []
-    for i in range(reps.shape[0]):
-        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
-        variances.append([calc_chi2(sqrt_covmat_pdf, diffs)])
-
-    return np.asarray(variances), n_comp
-
-
-principal_components_variance_distribution_datasets = collect(
-    "principal_components_variance_distribution_dataset", ("data",)
-)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 5fe25d4eef..24bddc48ba 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -9,7 +9,6 @@
 
 import pandas as pd
 import numpy as np
-from scipy import stats
 
 from reportengine.figure import figuregen
 from reportengine.table import table
@@ -21,73 +20,6 @@
 )
 
 
-@figuregen
-def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset):
-    """
-    TODO
-    """
-    for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
-        biases, variances, n_comp = pc_bias_var_dataset
-
-        try:
-            sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances))
-            vals = np.linspace(0, 3 * n_comp, 100)
-            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
-            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
-            pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue
-            fig, ax = plotutils.subplots()
-            ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}")
-            ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
-            ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}")
-
-            ax.legend()
-
-            yield fig
-
-        except:
-            fig, ax = plotutils.subplots()
-            ax.plot([], [], label=f"Dataset: {str(ds)}")
-            ax.legend()
-            yield fig
-
-
-@figuregen
-def plot_variance_distribution_datasets(
-    principal_components_variance_distribution_datasets, each_dataset
-):
-    """
-    TODO
-    """
-
-    for pc_var_dataset, ds in zip(
-        principal_components_variance_distribution_datasets, each_dataset
-    ):
-        variances, n_comp = pc_var_dataset
-        try:
-            vals = np.linspace(0, 3 * n_comp, 100)
-            chi2_pdf = stats.chi2.pdf(vals, df=n_comp)
-            chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp)
-
-            for i, var_fit in enumerate(variances):
-                pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue
-
-                fig, ax = plotutils.subplots()
-                ax.hist(
-                    var_fit[0],
-                    density=True,
-                    bins='auto',
-                    label=f"Variance {str(ds)}, p={pvalue_ks:.3f}",
-                )
-                ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}")
-                ax.set_title(f"Fit {i}")
-                ax.legend()
-
-                yield fig
-        except:
-            fig, ax = plotutils.subplots()
-            yield fig
-
-
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """

From 462e63d3a2e88ded2118eedd51f90d7e07a33313 Mon Sep 17 00:00:00 2001
From: Giovanni De Crescenzo <giodecre@gmail.com>
Date: Sun, 17 Mar 2024 19:05:10 +0100
Subject: [PATCH 29/61] moved single data colorbar plots to
 multiclosure.py/_output.py

---
 .../src/validphys/closuretest/multiclosure.py | 31 ++++++++++
 .../closuretest/multiclosure_output.py        | 18 ++++++
 validphys2/src/validphys/kinematics.py        | 58 +------------------
 3 files changed, 50 insertions(+), 57 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index a086d97d91..496bf22772 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -24,6 +24,8 @@
 )
 from validphys.results import ThPredictionsResult
 
+from validphys.kinematics import xq2map_with_cuts
+
 # bootstrap seed default
 DEFAULT_SEED = 9689372
 # stepsize in fits/replicas to use for finite size bootstraps
@@ -870,3 +872,32 @@ def dataset_inputs_fits_bias_replicas_variance_samples(
 experiments_fits_bias_replicas_variance_samples = collect(
     "dataset_inputs_fits_bias_replicas_variance_samples", ("group_dataset_inputs_by_experiment",)
 )
+
+
+def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
+                        _internal_max_reps=None,
+                        _internal_min_reps=20):
+    """
+    Load in a dictionary all the specs of a dataset meaning:
+    - ds name
+    - ds coords
+    - standard deviation (in multiclosure)
+    - mean (in multiclosure again)
+    - (x,Q^2) coords
+    """
+    xq2_map_obj = xq2map_with_cuts(commondata, cuts)
+    coords = xq2_map_obj[2]
+    central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
+    std_devs = np.std(central_deltas, axis = 0)
+    means = np.mean(central_deltas, axis = 0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
+
+    # for case of DY observables we have 2 (x,Q) for each experimental point
+    if coords[0].shape[0] != std_devs.shape[0]:
+        std_devs = np.concatenate((std_devs,std_devs))
+        means = np.concatenate((means,means))
+        xi = np.concatenate((xi,xi))
+    return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi}
+
+
+xq2_data_map = collect("xq2_dataset_map",("data",))
\ No newline at end of file
diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py
index a9becee214..54a92579bc 100644
--- a/validphys2/src/validphys/closuretest/multiclosure_output.py
+++ b/validphys2/src/validphys/closuretest/multiclosure_output.py
@@ -853,3 +853,21 @@ def plot_bias_variance_distributions(
     ax.legend()
     ax.set_title("Total bias and variance distributions.")
     yield fig
+
+@figuregen
+def xq2_data_prcs_maps(xq2_data_map):
+    keys = ["std_devs","xi"]
+    for elem in xq2_data_map:
+        for k in keys:
+            fig, ax = plotutils.subplots()
+            im = ax.scatter(elem['x_coords'],elem['Q_coords'],
+                                c=(np.asarray(elem[k])), 
+                                cmap='viridis',
+                                label = elem['name'])
+            fig.colorbar(im,label=k)
+            ax.set_xscale('log')  # Set x-axis to log scale
+            ax.set_yscale('log')  # Set y-axis to log scale
+            ax.set_xlabel('x')
+            ax.set_ylabel('Q2')
+            ax.set_title(elem["name"]+" "+k)
+            yield fig
diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index d4f1ef4f2f..554dd401b5 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -19,16 +19,6 @@
 
 
 
-from validphys import plotutils
-from validphys import plotoptions
-from validphys.core import CutsPolicy
-from validphys.closuretest import fits_normed_dataset_central_delta
-from validphys.closuretest import dataset_xi
-from validphys.closuretest import dataset_replica_and_central_diff
-
-from reportengine.figure import figuregen
-
-
 
 
 
@@ -113,31 +103,6 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True):
 
 all_kinlimits = collect(kinlimits, ('dataset_inputs',))
 
-def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
-                        _internal_max_reps=None,
-                        _internal_min_reps=20):
-    """
-    Load in a dictionary all the specs of a dataset meaning:
-    - ds name
-    - ds coords
-    - standard deviation (in multiclosure)
-    - mean (in multiclosure again)
-    - (x,Q^2) coords
-    """
-    xq2_map_obj = xq2map_with_cuts(commondata, cuts)
-    coords = xq2_map_obj[2]
-    central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
-    std_devs = np.std(central_deltas, axis = 0)
-    means = np.mean(central_deltas, axis = 0)
-    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
-
-    # for case of DY observables we have 2 (x,Q) for each experimental point
-    if coords[0].shape[0] != std_devs.shape[0]:
-        std_devs = np.concatenate((std_devs,std_devs))
-        means = np.concatenate((means,means))
-        xi = np.concatenate((xi,xi))
-    return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi}
-
 @table
 def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True):
     """Return a table with the kinematic limits for the datasets given as input
@@ -226,28 +191,7 @@ def xq2map_with_cuts(commondata, cuts, group_name=None):
 dataset_inputs_by_groups_xq2map = collect(
     xq2map_with_cuts, ('group_dataset_inputs_by_metadata', 'data_input')
 )
-xq2_data_map = collect("xq2_dataset_map",("data",))
-
-@figuregen
-def xq2_data_prcs_maps(xq2_data_map):
-    import matplotlib.pyplot as plt
-    import matplotlib.colors
-    from matplotlib.colors import ListedColormap
-    keys = ["std_devs","xi"]
-    for elem in xq2_data_map:
-        for k in keys:
-            fig, ax = plotutils.subplots()
-            im = ax.scatter(elem['x_coords'],elem['Q_coords'],
-                                c=(np.asarray(elem[k])), 
-                                cmap='viridis',
-                                label = elem['name'])
-            fig.colorbar(im,label=k)
-            ax.set_xscale('log')  # Set x-axis to log scale
-            ax.set_yscale('log')  # Set y-axis to log scale
-            ax.set_xlabel('x')
-            ax.set_ylabel('Q2')
-            ax.set_title(elem["name"]+" "+k)
-            yield fig
+
 
 
 def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False):

From 7d696aff5e0d91189a74e8439a491783bb52402b Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Thu, 11 Apr 2024 23:42:34 +0200
Subject: [PATCH 30/61] scikit-dep >=1.4.1 in pyproject file

---
 conda-recipe/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 8c6b1dd12c..a50e540f40 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -44,7 +44,7 @@ requirements:
         - sphinx_rtd_theme >0.5
         - sphinxcontrib-bibtex
         - ruamel.yaml <0.18
-        - scikit-learn = "^1.4.1"
+        - scikit-learn >=1.4.1
 
 test:
     requires:

From 37b835c27173d13e52f0c6eb207d7ae5ae1ff57e Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 22 Apr 2024 17:50:14 +0100
Subject: [PATCH 31/61] use mean theory covmat for PCA -> gives more stable
 values of variance

---
 .../multiclosure_inconsistent.py              | 42 +++++++++++++++----
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index ba04f5d09a..5f29882c98 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -18,7 +18,6 @@
 from reportengine import collect
 
 
-
 def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
     """
     Compute the principal components of theory predictions replica matrix
@@ -71,7 +70,7 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia
 
 
 def principal_components_bias_variance_dataset(
-    internal_multiclosure_dataset_loader, principal_components_dataset
+    internal_multiclosure_dataset_loader, explained_variance_ratio=0.99
 ):
     """
     Compute the bias and variance for one datasets
@@ -98,26 +97,55 @@ def principal_components_bias_variance_dataset(
 
     reps = np.asarray([th.error_members for th in closures_th])
 
-    pc_basis, pc_reps, n_comp = principal_components_dataset
+    # compute the covariance matrix of the theory predictions for each fit
+    _covmats = [np.cov(rep, rowvar=True) for rep in reps]
+
+    # compute the mean covariance matrix
+    _covmat_mean = np.mean(_covmats, axis=0)
+
+    # diagonalize the mean covariance matrix and only keep the principal components
+    # that explain the required variance
+
+    if _covmat_mean.shape == ():
+        return np.nan, np.nan, 1
+
+    eighvals, eigvecs = np.linalg.eigh(_covmat_mean)
+    idx = np.argsort(eighvals)[::-1]
+    # sort eigenvalues from largest to smallest
+    eigvecs = eigvecs[:, idx]
+    eighvals = eighvals[idx]
+    eighvals_norm = eighvals / eighvals.sum()
+
+    # choose components to keep based on EVR
+    n_comp = 1
+    for _ in range(eighvals.shape[0]):
+        if np.sum(eighvals_norm[:n_comp]) >= explained_variance_ratio:
+            break
+        n_comp += 1
+
+    # get the principal components
+    pc_basis = eigvecs[:, :n_comp]
+
+    # compute the (PCA) regularized covariance matrix
+    covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis
 
     if n_comp <= 1:
         return np.nan, np.nan, n_comp
 
-    # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc)
-    covmat_pdf = np.cov(pc_reps)
+    # compute sqrt of pdf covariance matrix
     sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
 
     # compute bias diff and project it onto space spanned by PCs
     delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
     # shape here is (Npc, Nfits)
-    delta_bias = pc_basis @ delta_bias
+    delta_bias = pc_basis.T @ delta_bias
 
     # compute biases, shape of biases is (Nfits)
     biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
 
     variances = []
     for i in range(reps.shape[0]):
-        diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
+        diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
         variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
 
     return biases, np.asarray(variances), n_comp

From 8b4d5c520f24b1e03ccbeb3d508db39c36cda464 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:21:28 +0200
Subject: [PATCH 32/61] rewrote functions using new variance definition (covmat
 computed by averaging over covmats of all fits) and removed sklearn
 dependency

---
 .../examples/pca_bias_variance_report.yaml    |   1 -
 .../multiclosure_inconsistent.py              | 182 +++++++++++-------
 .../multiclosure_inconsistent_output.py       |  32 +--
 3 files changed, 129 insertions(+), 86 deletions(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 209ad06c58..1a9a506770 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -47,7 +47,6 @@ template_text: |
   ## L2 condition number
   {@plot_l2_condition_number@}
   
-  {@bootstrapped_bias_distribution@}
 
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 5f29882c98..f8820328b0 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -9,7 +9,7 @@
 """
 
 import numpy as np
-from sklearn.decomposition import PCA
+import dataclasses
 
 from validphys import covmats
 from validphys.calcutils import calc_chi2
@@ -18,87 +18,68 @@
 from reportengine import collect
 
 
-def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
+@dataclasses.dataclass(frozen=True)
+class PCAInternalMulticlosureLoader:
     """
-    Compute the principal components of theory predictions replica matrix
-    (Ndat x Nrep feature matrix).
-
     Parameters
     ----------
-    dataset: (DataSetSpec, DataGroupSpec)
-        dataset for which the theory predictions and t0 covariance matrix
-        will be loaded. Note that due to the structure of `validphys` this
-        function can be overloaded to accept a DataGroupSpec.
+    closures_th: list
+        list containing validphys.results.ThPredictionsResult objects
+        for each fit
 
-    fits_pdf: list
-        list of PDF objects produced from performing multiple closure tests
-        fits. Each fit should have a different filterseed but the same
-        underlying law used to generate the pseudodata.
+    law_th: ThPredictionsResult object
+        underlying law theory predictions
 
-    variancepdf: validphys.core.PDF
-            PDF object used to estimate the variance of the fits.
+    pc_basis: np.array
+        basis of principal components
 
-    explained_variance_ratio: float, default is 0.93
+    n_comp: int
+        number of principal components kept after regularisation
 
-    Returns
-    -------
-    tuple
-        3D tuple:
-        - matrix of the principal components (PCs) of shape (N_pc, N_dat)
-        - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
-        - N_pc: number of principal components kept
+    covmat_pca: np.array
+        regularised covariance matrix computed from replicas
+        of theory predictions
 
+    sqrt_covmat_pca: np.array
+        cholesky decomposed covariance matrix
     """
-    # get replicas from variance fit, used to estimate variance
-    reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
-
-    # something that could be tested: rescale feature matrix
-    # reps_scaled = reps.preprocessing.scale(reps)
 
-    # choose number of principal components (PCs) based on explained variance ratio
-    n_comp = 1
-    for _ in range(reps.shape[0]):
-        pca = PCA(n_comp).fit(reps.T)
-        if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
-            break
-        n_comp += 1
+    closures_th: list
+    law_th: ThPredictionsResult
+    pc_basis: np.array
+    n_comp: int
+    covmat_pca: np.array
+    sqrt_covmat_pca: np.array
 
-    # project feature matrix onto PCs
-    pc_reps = pca.components_ @ reps
 
-    return pca.components_, pc_reps, n_comp
-
-
-def principal_components_bias_variance_dataset(
+def internal_multiclosure_dataset_loader_pca(
     internal_multiclosure_dataset_loader, explained_variance_ratio=0.99
 ):
     """
-    Compute the bias and variance for one datasets
-    using the principal component reduced covariance matrix.
+    Similar to multiclosure.internal_multiclosure_dataset_loader but returns
+    PCA regularised covariance matrix, where the covariance matrix has been computed
+    from the replicas of the theory predictions.
 
     Parameters
     ----------
-    internal_multiclosure_dataset_loader : tuple
-        Tuple containing the results of multiclosure fits
+    internal_multiclosure_dataset_loader: tuple
+        closure fits theory predictions,
+        underlying law theory predictions,
+        covariance matrix,
+        sqrt covariance matrix
 
-    principal_components_dataset : tuple
-        3D tuple containing the principal components of the theory predictions
+    explained_variance_ratio: float, default is 0.99
 
     Returns
     -------
-    tuple
-        3D tuple:
-        - biases: 1-D array of shape (Nfits,)
-        - variances: 1-D array of shape (Nfits, )
-        - n_comp: number of principal components kept
+    PCAInternalMulticlosureLoader
     """
-
     closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
 
     reps = np.asarray([th.error_members for th in closures_th])
 
     # compute the covariance matrix of the theory predictions for each fit
-    _covmats = [np.cov(rep, rowvar=True) for rep in reps]
+    _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
 
     # compute the mean covariance matrix
     _covmat_mean = np.mean(_covmats, axis=0)
@@ -107,7 +88,14 @@ def principal_components_bias_variance_dataset(
     # that explain the required variance
 
     if _covmat_mean.shape == ():
-        return np.nan, np.nan, 1
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=1,
+            n_comp=1,
+            covmat_pca=_covmat_mean,
+            sqrt_covmat_pca=np.sqrt(_covmat_mean),
+        )
 
     eighvals, eigvecs = np.linalg.eigh(_covmat_mean)
     idx = np.argsort(eighvals)[::-1]
@@ -127,28 +115,80 @@ def principal_components_bias_variance_dataset(
     pc_basis = eigvecs[:, :n_comp]
 
     # compute the (PCA) regularized covariance matrix
-    covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis
-
-    if n_comp <= 1:
-        return np.nan, np.nan, n_comp
+    covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis
+
+    if n_comp == 1:
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=pc_basis,
+            n_comp=1,
+            covmat_pca=covmat_pca,
+            sqrt_covmat_pca=np.sqrt(covmat_pca),
+        )
 
     # compute sqrt of pdf covariance matrix
-    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+    sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca)
 
-    # compute bias diff and project it onto space spanned by PCs
-    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
-    # shape here is (Npc, Nfits)
-    delta_bias = pc_basis.T @ delta_bias
+    return PCAInternalMulticlosureLoader(
+        closures_th=closures_th,
+        law_th=law_th,
+        pc_basis=pc_basis,
+        n_comp=n_comp,
+        covmat_pca=covmat_pca,
+        sqrt_covmat_pca=sqrt_covmat_pca,
+    )
+
+
+def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
+    """
+    Compute the bias and variance for one datasets
+    using the principal component reduced covariance matrix.
 
-    # compute biases, shape of biases is (Nfits)
-    biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
+    Parameters
+    ----------
+    internal_multiclosure_dataset_loader : tuple
+        Tuple containing the results of multiclosure fits
 
-    variances = []
-    for i in range(reps.shape[0]):
-        diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
-        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+    explained_variance_ratio : float, default is 0.99
+        3D tuple containing the principal components of the theory predictions
 
-    return biases, np.asarray(variances), n_comp
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_comp: number of principal components kept
+    """
+
+    pca_loader = internal_multiclosure_dataset_loader_pca
+
+    reps = np.asarray([th.error_members for th in pca_loader.closures_th])
+
+    # compute bias diff and project it onto space spanned by PCs
+    delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
+
+    if pca_loader.n_comp == 1:
+        delta_bias = pca_loader.pc_basis * delta_bias
+        biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis * (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))
+    else:
+        delta_bias = pca_loader.pc_basis.T @ delta_bias
+        biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias)
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis.T @ (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs)))
+
+    return biases, np.asarray(variances), pca_loader.n_comp
 
 
 principal_components_bias_variance_datasets = collect(
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 24bddc48ba..0b50f7afdc 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -16,7 +16,7 @@
 
 from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
-    principal_components_dataset,
+    internal_multiclosure_dataset_loader_pca,
 )
 
 
@@ -142,9 +142,14 @@ def plot_lambdavalues_bias_variance_values(
         yield fig
 
 
+internal_multiclosure_data_collected_loader = collect(
+    "internal_multiclosure_dataset_loader", ("data",)
+)
+
+
 @figuregen
 def plot_l2_condition_number(
-    each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20
+    each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20
 ):
     """
     Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
@@ -162,11 +167,9 @@ def plot_l2_condition_number(
     each_dataset : list
         List of datasets
 
-    fits_pdf: list
-        list of validphys.core.PDF objects
+    internal_multiclosure_data_loader: list
+        list of internal_multiclosure_dataset_loader objects
 
-    variancepdf: validphys.core.PDF
-        PDF object for the variance
 
     Yields
     ------
@@ -177,25 +180,26 @@ def plot_l2_condition_number(
     # Explained variance ratio range
     evr_range = np.linspace(evr_min, evr_max, evr_n)
 
-    for ds in each_dataset:
+    for internal_multiclosure_dataset_loader, ds in zip(
+        internal_multiclosure_data_collected_loader, each_dataset
+    ):
         l2_cond = []
         dof = []
 
         for evr in evr_range:
-            _, pc_reps, n_comp = principal_components_dataset(
-                ds, fits_pdf, variancepdf, explained_variance_ratio=evr
-            )
 
-            covmat_pdf = np.cov(pc_reps)
+            pca_loader = internal_multiclosure_dataset_loader_pca(
+                internal_multiclosure_dataset_loader, explained_variance_ratio=evr
+            )
 
             # if the number of principal components is 1 then the covariance matrix is a scalar
             # and the condition number is not computed
-            if n_comp <= 1:
+            if pca_loader.n_comp == 1:
                 l2_cond.append(np.nan)
             else:
-                l2_cond.append(np.linalg.cond(covmat_pdf))
+                l2_cond.append(np.linalg.cond(pca_loader.covmat_pca))
 
-            dof.append(n_comp)
+            dof.append(pca_loader.n_comp)
 
         fig, ax1 = plotutils.subplots()
         ax1.plot(evr_range, l2_cond, label="L2 Condition Number")

From 5606f3e362ee1216d364cc2aa2eacba8b07b637e Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 23 Apr 2024 16:54:34 +0100
Subject: [PATCH 33/61] removed sklearn dep from conda recipe meta file

---
 conda-recipe/meta.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index a50e540f40..9d1b6376e3 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -44,7 +44,6 @@ requirements:
         - sphinx_rtd_theme >0.5
         - sphinxcontrib-bibtex
         - ruamel.yaml <0.18
-        - scikit-learn >=1.4.1
 
 test:
     requires:

From ee8bf0983d3bc4eacef2a21728f3f7ad09e77a46 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:22:22 +0200
Subject: [PATCH 34/61] removed variancepdf as unused

---
 .../examples/pca_bias_variance_report.yaml    | 20 +++++++++----------
 validphys2/src/validphys/config.py            |  1 +
 .../src/validphys/scripts/vp_multiclosure.py  |  4 ++--
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 1a9a506770..428a684754 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -14,10 +14,6 @@ dataset_inputs:
   - {dataset: HERA_CC_318GEV_EP-SIGMARED, variant: legacy}
 
 
-# variance pdf is used to compute the variance of fits
-# the assumption is that fits have the same variance (needs to be checked)
-variancepdf: 230124_dis_ict_lam02_fs_122996
-
 
 theoryid: 200
 use_cuts: internal
@@ -28,13 +24,15 @@ t0pdfset: 210223-mw-000_fakepdf
 explained_variance_ratio: 0.99
 
 fits:
-  - 250124_dis_ict_lam04_fs_833288
-  - 250124_dis_ict_lam04_fs_85029
-  - 250124_dis_ict_lam04_fs_856511
-  - 250124_dis_ict_lam04_fs_895365
-  - 250124_dis_ict_lam04_fs_902027
-  - 250124_dis_ict_lam04_fs_911964
-  - 250124_dis_ict_lam04_fs_99031
+  - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295
+  - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750
+  - 25_5_2023_19_50_27_dis_pt1_mnc_commit_4d5d473c_filterseed_754299
+  - 25_5_2023_19_51_19_dis_pt1_mnc_commit_4d5d473c_filterseed_361864
+  - 25_5_2023_19_52_54_dis_pt1_mnc_commit_4d5d473c_filterseed_516909
+  - 25_5_2023_19_52_7_dis_pt1_mnc_commit_4d5d473c_filterseed_539528
+  - 25_5_2023_19_53_41_dis_pt1_mnc_commit_4d5d473c_filterseed_307907
+  - 25_5_2023_19_54_28_dis_pt1_mnc_commit_4d5d473c_filterseed_279829
+
 
 template_text: |
 
diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index bdd94e213c..4cd55e6561 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -1065,6 +1065,7 @@ def produce_reweight_all_datasets(self, experiments):
                 ret.append({"reweighting_experiments": [single_exp], "dataset_input": dsinput})
         return ret
 
+
     def produce_pdf_id(self, pdf) -> str:
         """Return a string containing the PDF's LHAPDF ID"""
         return pdf.name
diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py
index 5d1aa7013a..e87a0003d4 100644
--- a/validphys2/src/validphys/scripts/vp_multiclosure.py
+++ b/validphys2/src/validphys/scripts/vp_multiclosure.py
@@ -126,7 +126,7 @@ def complete_lambdavaluesmapping(self):
         list_lambdas = []
         for lambdas in settings:
             lambdasetting = {}
-            for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
+            for set in ["t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
                 lambdasetting[set] = lambdas[set]
             list_lambdas.append(lambdasetting)
         autosettings["lambdavalues"] = list_lambdas
@@ -143,7 +143,7 @@ def get_config(self):
         complete_mapping.update(self.complete_lambdavaluesmapping())
         c["meta"] = complete_mapping["meta"]
         for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]):
-            for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
+            for set in ["t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]:
                 lambdas[set] = lambdas_mapping[set]
         for set in ["current", "reference"]:
             c["compare_settings"][set] = complete_mapping["compare_settings"][set]

From 7144ac9292e9821bcb41970e5222b572c141147b Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:22:44 +0200
Subject: [PATCH 35/61] added check_multifit_replicas check

---
 .../multiclosure_inconsistent.py                  | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index f8820328b0..ec2971033b 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -14,6 +14,7 @@
 from validphys import covmats
 from validphys.calcutils import calc_chi2
 from validphys.results import ThPredictionsResult
+from validphys.closuretest.closure_checks import check_multifit_replicas
 
 from reportengine import collect
 
@@ -52,8 +53,12 @@ class PCAInternalMulticlosureLoader:
     sqrt_covmat_pca: np.array
 
 
+@check_multifit_replicas
 def internal_multiclosure_dataset_loader_pca(
-    internal_multiclosure_dataset_loader, explained_variance_ratio=0.99
+    internal_multiclosure_dataset_loader,
+    explained_variance_ratio=0.99,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
 ):
     """
     Similar to multiclosure.internal_multiclosure_dataset_loader but returns
@@ -70,6 +75,14 @@ def internal_multiclosure_dataset_loader_pca(
 
     explained_variance_ratio: float, default is 0.99
 
+    _internal_max_reps: int, default is None
+        Maximum number of replicas used in the fits
+        this is needed to check that the number of replicas is the same for all fits
+
+    _internal_min_reps: int, default is 20
+        Minimum number of replicas used in the fits
+        this is needed to check that the number of replicas is the same for all fits
+
     Returns
     -------
     PCAInternalMulticlosureLoader

From ce5ff2dc88a847388a6b3f641a62abb5c2b8b801 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 12:17:46 +0200
Subject: [PATCH 36/61] use plotting dataset labels for rbv vs lambda titles

---
 .../multiclosure_inconsistent_output.py                        | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 0b50f7afdc..0a35a3e299 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -137,7 +137,8 @@ def plot_lambdavalues_bias_variance_values(
             )
             ax.set_ylabel(r"$R_{bv}$")
             ax.set_xlabel(r"$\lambda$")
-        ax.set_title(f"Ratio bias variance: {str(ds)}")
+
+        ax.set_title(f"{ds.commondata.metadata.plotting.dataset_label}")
 
         yield fig
 

From 328a8926662e383cdd2cba0e5f6255d5a0bda5e4 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Mon, 10 Jun 2024 16:13:50 +0200
Subject: [PATCH 37/61] added hlines for rbv = 1

---
 .../inconsistent_closuretest/multiclosure_inconsistent_output.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 0a35a3e299..9d923def5d 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -135,6 +135,7 @@ def plot_lambdavalues_bias_variance_values(
                 color="blue",
                 fmt='o',
             )
+            ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--")
             ax.set_ylabel(r"$R_{bv}$")
             ax.set_xlabel(r"$\lambda$")
 

From 1f4eacc00a910f94ef51ef7799b6e501a125d4f1 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 11 Jun 2024 17:59:22 +0200
Subject: [PATCH 38/61] added bootstrapped_internal_multiclosure_dataset_loader
 for tuple of bootstrapped multiclosuretests

---
 .../src/validphys/closuretest/multiclosure.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 496bf22772..b4681cc988 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -413,6 +413,67 @@ def _bootstrap_multiclosure_fits(
     return (boot_ths, *input_tuple)
 
 
+def bootstrapped_internal_multiclosure_dataset_loader(
+    internal_multiclosure_dataset_loader, n_fit_max, n_fit, n_rep_max, n_rep, n_boot_multiclosure, rng_seed_mct_boot, use_repeats=True, 
+):
+    """
+    Returns a tuple of internal_multiclosure_dataset_loader objects
+    each of which is a bootstrap resample of the original dataset
+
+    Parameters
+    ----------
+    internal_multiclosure_dataset_loader: tuple
+        closure fits theory predictions,
+        underlying law theory predictions,
+        covariance matrix,
+        sqrt covariance matrix
+
+    n_fit_max: int
+        maximum number of fits, should be smaller or equal to number of 
+        multiclosure fits
+
+    n_fit: int
+        number of fits to draw for each resample
+    
+    n_rep_max: int
+        maximum number of replicas, should be smaller or equal to number of 
+        replicas in each fit
+    
+    n_rep: int
+        number of replicas to draw for each resample
+    
+    n_boot_multiclosure: int
+        number of bootstrap resamples to perform
+
+    rng_seed_mct_boot: int
+        seed for random number generator
+
+    use_repeats: bool, default is True
+        whether to allow repeated fits and replicas in each resample
+
+    Returns
+    -------
+    resampled_multiclosure: tuple of shape (n_boot_multiclosure,)
+        tuple of internal_multiclosure_dataset_loader objects each of which
+        is a bootstrap resample of the original dataset
+
+    """
+    rng = np.random.RandomState(seed=rng_seed_mct_boot)
+    return tuple([
+        _bootstrap_multiclosure_fits(
+            internal_multiclosure_dataset_loader,
+            rng=rng,
+            n_fit_max=n_fit_max,
+            n_fit=n_fit,
+            n_rep_max=n_rep_max,
+            n_rep=n_rep,
+            use_repeats=use_repeats,
+        )
+        for _ in range(n_boot_multiclosure)
+    ]
+    )
+
+
 def bias_variance_resampling_dataset(
     internal_multiclosure_dataset_loader,
     n_fit_samples,

From 8689a186ae0ee6ff0d05e9a48fd3dbdd5a974557 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 11 Jun 2024 17:59:45 +0200
Subject: [PATCH 39/61] bootstrap of PCA regularised multiclosure tests

---
 .../multiclosure_inconsistent.py              | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index ec2971033b..126b71d8b3 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -15,6 +15,7 @@
 from validphys.calcutils import calc_chi2
 from validphys.results import ThPredictionsResult
 from validphys.closuretest.closure_checks import check_multifit_replicas
+from validphys.closuretest.multiclosure import bootstrapped_internal_multiclosure_dataset_loader
 
 from reportengine import collect
 
@@ -153,6 +154,47 @@ def internal_multiclosure_dataset_loader_pca(
     )
 
 
+def bootstrapped_internal_multiclosure_dataset_loader_pca(
+    internal_multiclosure_dataset_loader,
+    n_fit_max,
+    n_fit,
+    n_rep_max,
+    n_rep,
+    n_boot_multiclosure,
+    rng_seed_mct_boot,
+    use_repeats=True,
+    explained_variance_ratio=0.99,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
+):
+    """
+    Similar to multiclosure.bootstrapped_internal_multiclosure_dataset_loader but returns
+    PCA regularised covariance matrix, where the covariance matrix has been computed
+    from the replicas of the theory predictions.
+    """
+
+    # get bootstrapped internal multiclosure dataset loader
+    bootstrap_imdl = bootstrapped_internal_multiclosure_dataset_loader(
+        internal_multiclosure_dataset_loader,
+        n_fit_max=n_fit_max,
+        n_fit=n_fit,
+        n_rep_max=n_rep_max,
+        n_rep=n_rep,
+        n_boot_multiclosure=n_boot_multiclosure,
+        rng_seed_mct_boot=rng_seed_mct_boot,
+        use_repeats=use_repeats,
+    )
+
+    # PCA regularise all the bootstrapped internal multiclosure dataset loaders
+    bootstrap_imdl_pca = [
+        internal_multiclosure_dataset_loader_pca(
+            imdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps
+        )
+        for imdl in bootstrap_imdl
+    ]
+    return tuple(bootstrap_imdl_pca)
+
+
 def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
     """
     Compute the bias and variance for one datasets

From 2690bc97211ece8fc6b08aa24f371c4b3a431b72 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 11 Jun 2024 18:10:40 +0200
Subject: [PATCH 40/61] bootstrap for internal_data_loadeer objecgs

---
 .../multiclosure_inconsistent.py              | 31 +++++++++
 .../src/validphys/closuretest/multiclosure.py | 67 ++++++++++++++-----
 2 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 126b71d8b3..3a8d057bc8 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -195,6 +195,37 @@ def bootstrapped_internal_multiclosure_dataset_loader_pca(
     return tuple(bootstrap_imdl_pca)
 
 
+def bootstrapped_internal_multiclosure_data_loader_pca(
+    internal_multiclosure_data_loader,
+    n_fit_max,
+    n_fit,
+    n_rep_max,
+    n_rep,
+    n_boot_multiclosure,
+    rng_seed_mct_boot,
+    use_repeats=True,
+    explained_variance_ratio=0.99,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
+):
+    """
+    Same as bootstrapped_internal_multiclosure_dataset_loader_pca but for all the data.
+    """
+    return bootstrapped_internal_multiclosure_dataset_loader_pca(
+        internal_multiclosure_data_loader,
+        n_fit_max,
+        n_fit,
+        n_rep_max,
+        n_rep,
+        n_boot_multiclosure,
+        rng_seed_mct_boot,
+        use_repeats,
+        explained_variance_ratio,
+        _internal_max_reps,
+        _internal_min_reps,
+    )
+
+
 def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
     """
     Compute the bias and variance for one datasets
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index b4681cc988..28b87baaea 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -414,7 +414,14 @@ def _bootstrap_multiclosure_fits(
 
 
 def bootstrapped_internal_multiclosure_dataset_loader(
-    internal_multiclosure_dataset_loader, n_fit_max, n_fit, n_rep_max, n_rep, n_boot_multiclosure, rng_seed_mct_boot, use_repeats=True, 
+    internal_multiclosure_dataset_loader,
+    n_fit_max,
+    n_fit,
+    n_rep_max,
+    n_rep,
+    n_boot_multiclosure,
+    rng_seed_mct_boot,
+    use_repeats=True,
 ):
     """
     Returns a tuple of internal_multiclosure_dataset_loader objects
@@ -429,19 +436,19 @@ def bootstrapped_internal_multiclosure_dataset_loader(
         sqrt covariance matrix
 
     n_fit_max: int
-        maximum number of fits, should be smaller or equal to number of 
+        maximum number of fits, should be smaller or equal to number of
         multiclosure fits
 
     n_fit: int
         number of fits to draw for each resample
-    
+
     n_rep_max: int
-        maximum number of replicas, should be smaller or equal to number of 
+        maximum number of replicas, should be smaller or equal to number of
         replicas in each fit
-    
+
     n_rep: int
         number of replicas to draw for each resample
-    
+
     n_boot_multiclosure: int
         number of bootstrap resamples to perform
 
@@ -459,18 +466,42 @@ def bootstrapped_internal_multiclosure_dataset_loader(
 
     """
     rng = np.random.RandomState(seed=rng_seed_mct_boot)
-    return tuple([
-        _bootstrap_multiclosure_fits(
-            internal_multiclosure_dataset_loader,
-            rng=rng,
-            n_fit_max=n_fit_max,
-            n_fit=n_fit,
-            n_rep_max=n_rep_max,
-            n_rep=n_rep,
-            use_repeats=use_repeats,
-        )
-        for _ in range(n_boot_multiclosure)
-    ]
+    return tuple(
+        [
+            _bootstrap_multiclosure_fits(
+                internal_multiclosure_dataset_loader,
+                rng=rng,
+                n_fit_max=n_fit_max,
+                n_fit=n_fit,
+                n_rep_max=n_rep_max,
+                n_rep=n_rep,
+                use_repeats=use_repeats,
+            )
+            for _ in range(n_boot_multiclosure)
+        ]
+    )
+
+
+def bootstrapped_internal_multiclosure_data_loader(
+    internal_multiclosure_data_loader,
+    n_fit_max,
+    n_fit,
+    n_rep_max,
+    n_rep,
+    n_boot_multiclosure,
+    rng_seed_mct_boot,
+    use_repeats=True,
+):
+    """Like bootstrapped_internal_multiclosure_dataset_loader except for all data"""
+    return bootstrapped_internal_multiclosure_dataset_loader(
+        internal_multiclosure_data_loader,
+        n_fit_max,
+        n_fit,
+        n_rep_max,
+        n_rep,
+        n_boot_multiclosure,
+        rng_seed_mct_boot,
+        use_repeats,
     )
 
 

From cc77ed9ea2a9be3c2a34163796e58ae15152a4fe Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 12 Jun 2024 09:57:34 +0200
Subject: [PATCH 41/61] added
 bootstrapped_principal_components_bias_variance_dataset

---
 .../multiclosure_inconsistent.py              | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 3a8d057bc8..42c3d6423a 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import dataclasses
+import pandas as pd
 
 from validphys import covmats
 from validphys.calcutils import calc_chi2
@@ -280,3 +281,33 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa
 principal_components_bias_variance_datasets = collect(
     "principal_components_bias_variance_dataset", ("data",)
 )
+
+
+def bootstrapped_principal_components_bias_variance_dataset(
+    bootstrapped_internal_multiclosure_dataset_loader_pca, dataset
+):
+    """
+    Computes Bias and Variance for each bootstrap sample.
+    Returns a DataFrame with the results.
+    """
+    boot_bias_var_samples = []
+    for i, boot_imdl_pca in enumerate(bootstrapped_internal_multiclosure_dataset_loader_pca):
+        bias, var, n_comp = principal_components_bias_variance_dataset(boot_imdl_pca)
+        boot_bias_var_samples.append(
+            {
+                "bias": np.mean(bias),
+                "variance": np.mean(var),
+                "n_comp": n_comp,
+                "dataset": str(dataset),
+                "bootstrap_index": i,
+            }
+        )
+
+    df = pd.DataFrame.from_records(
+        boot_bias_var_samples,
+        index="bootstrap_index",
+        columns=("bootstrap_index", "dataset", "n_comp", "bias", "variance"),
+    )
+
+    df.columns = ["dataset", "n_comp", "bias", "variance"]
+    return df

From 493e769da61e1ba7d0d4d02adefb28d74040f4b8 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 12 Jun 2024 14:35:28 +0200
Subject: [PATCH 42/61] added bootstrap table to report

---
 .../multiclosure_inconsistent.py              |  5 ++
 .../multiclosure_inconsistent_output.py       | 63 +++++++++++++++++++
 .../multiclosure_analysis.yaml                |  9 +++
 .../pca_template.md                           |  3 +
 4 files changed, 80 insertions(+)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 42c3d6423a..3728b3d7cb 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -311,3 +311,8 @@ def bootstrapped_principal_components_bias_variance_dataset(
 
     df.columns = ["dataset", "n_comp", "bias", "variance"]
     return df
+
+
+bootstrapped_principal_components_bias_variance_data = collect(
+    "bootstrapped_principal_components_bias_variance_dataset", ("data",)
+)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 9d923def5d..eef98cdca5 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -99,6 +99,69 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
 )
 
 
+@table
+def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_bias_variance_data):
+    """
+    Compute the bias, variance, ratio and sqrt(ratio) for each dataset
+    and return a DataFrame with the results.
+    Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation
+    of the bootstrap uncertainty on bias and variance.
+    """
+    records = []
+    for boot_ds in bootstrapped_principal_components_bias_variance_data:
+        df = boot_ds
+        mean_bias = df["bias"].mean()
+        mean_variance = df["variance"].mean()
+        mean_ratio = mean_bias / mean_variance
+        bootstrap_unc_bias = df["bias"].std()
+
+        # gaussian error propagation for the ratio of the means uncertainty
+        # only consider bias as source of uncertainty for the ratio (variance is almost constant)
+        bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2)
+        sqrt_ratio = np.sqrt(mean_ratio)
+        # gaussian error propagation for the sqrt of the ratio
+        bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio)
+
+        records.append(
+            dict(
+                dataset=df["dataset"].iloc[0],
+                mean_dof=df.n_comp.mean(),
+                bias=mean_bias,
+                variance=mean_variance,
+                ratio=mean_ratio,
+                error_ratio=bootstrap_unc_ratio,
+                ratio_sqrt=sqrt_ratio,
+                error_ratio_sqrt=bootstrap_unc_sqrt_ratio,
+            )
+        )
+
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=(
+            "dataset",
+            "mean_dof",
+            "bias",
+            "variance",
+            "ratio",
+            "error_ratio",
+            "ratio_sqrt",
+            "error_ratio_sqrt",
+        ),
+    )
+    df.columns = [
+        "mean_dof",
+        "bias",
+        "variance",
+        "ratio",
+        "error_ratio",
+        "ratio_sqrt",
+        "error_ratio_sqrt",
+    ]
+
+    return df
+
+
 @figuregen
 def plot_lambdavalues_bias_variance_values(
     lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index cf27fa9796..e46f686c99 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -154,6 +154,15 @@ theoryid:
   from_: theory
 use_cuts: internal
 
+# bootstrap settings
+n_fit_max: 2
+n_fit: 10
+
+n_rep_max: 100
+n_rep: 60
+n_boot_multiclosure: 60
+rng_seed_mct_boot: 1234
+
 lambdavalues:
   - label: "LAMBDA02"
     lambda_value: 0.2
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
index 2f308bd701..f873f9053f 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md
@@ -3,5 +3,8 @@
 ## Table of bias and variance
 {@table_bias_variance_datasets@}
 
+## Bootstrap Table of bias and variance
+{@bootstrapped_table_bias_variance_datasets@}
+
 ## L2 condition number
 {@plot_l2_condition_number@}

From fb4fb4e73b529ad7f3c14f26de50ace111cbbb4a Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 12 Jun 2024 14:36:46 +0200
Subject: [PATCH 43/61] changed defaults of bootstrap

---
 .../multiclosure_analysis.yaml                              | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
index e46f686c99..a068bcc003 100644
--- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml
@@ -155,12 +155,12 @@ theoryid:
 use_cuts: internal
 
 # bootstrap settings
-n_fit_max: 2
-n_fit: 10
+n_fit_max: 25
+n_fit: 20
 
 n_rep_max: 100
 n_rep: 60
-n_boot_multiclosure: 60
+n_boot_multiclosure: 100
 rng_seed_mct_boot: 1234
 
 lambdavalues:

From b15427311661826f4337f42da748191c182b2826 Mon Sep 17 00:00:00 2001
From: Giovanni De Crescenzo <crescenzo@pi104.itp.uni-heidelberg.de>
Date: Thu, 13 Jun 2024 14:51:08 +0200
Subject: [PATCH 44/61] added title for single data point in latex mode

---
 .../validphys/closuretest/multiclosure_output.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py
index 54a92579bc..75a6d006c2 100644
--- a/validphys2/src/validphys/closuretest/multiclosure_output.py
+++ b/validphys2/src/validphys/closuretest/multiclosure_output.py
@@ -855,19 +855,23 @@ def plot_bias_variance_distributions(
     yield fig
 
 @figuregen
-def xq2_data_prcs_maps(xq2_data_map):
+def xq2_data_prcs_maps(xq2_data_map,each_dataset):
     keys = ["std_devs","xi"]
-    for elem in xq2_data_map:
+    for j,elem in enumerate(xq2_data_map):
+
         for k in keys:
+            if k == "std_devs":
+                title = r"$R_{bv}$"
+            if k == "xi":
+                title = r"$\xi$"
             fig, ax = plotutils.subplots()
             im = ax.scatter(elem['x_coords'],elem['Q_coords'],
                                 c=(np.asarray(elem[k])), 
-                                cmap='viridis',
-                                label = elem['name'])
-            fig.colorbar(im,label=k)
+                                cmap='viridis')
+            fig.colorbar(im,label=title)
             ax.set_xscale('log')  # Set x-axis to log scale
             ax.set_yscale('log')  # Set y-axis to log scale
             ax.set_xlabel('x')
             ax.set_ylabel('Q2')
-            ax.set_title(elem["name"]+" "+k)
+            ax.set_title(each_dataset[j].commondata.metadata.plotting.dataset_label)
             yield fig

From a1d6a08e09473a4a989a0bb947daed06aebc365c Mon Sep 17 00:00:00 2001
From: andreab1997 <andreab1997@yahoo.it>
Date: Tue, 18 Jun 2024 15:11:50 +0200
Subject: [PATCH 45/61] Add PCA on corr matrix on full dataset

---
 .../multiclosure_inconsistent.py              | 149 ++++++++++++++++++
 .../multiclosure_inconsistent_output.py       |  30 +++-
 2 files changed, 178 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 3728b3d7cb..4037315c60 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -153,6 +153,108 @@ def internal_multiclosure_dataset_loader_pca(
         covmat_pca=covmat_pca,
         sqrt_covmat_pca=sqrt_covmat_pca,
     )
+    
+@check_multifit_replicas
+def internal_multiclosure_data_loader_pca(
+    internal_multiclosure_data_loader,
+    explained_variance_ratio=0.99,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
+):
+    """
+    Like multiclosure.internal_multiclosure_dataset_loader_pca except for all data
+
+    Parameters
+    ----------
+    internal_multiclosure_data_loader: tuple
+        closure fits theory predictions,
+        underlying law theory predictions,
+        covariance matrix,
+        sqrt covariance matrix
+
+    explained_variance_ratio: float, default is 0.99
+
+    _internal_max_reps: int, default is None
+        Maximum number of replicas used in the fits
+        this is needed to check that the number of replicas is the same for all fits
+
+    _internal_min_reps: int, default is 20
+        Minimum number of replicas used in the fits
+        this is needed to check that the number of replicas is the same for all fits
+
+    Returns
+    -------
+    PCAInternalMulticlosureLoader
+    """
+    closures_th, law_th, _, _ = internal_multiclosure_data_loader
+    reps = np.asarray([th.error_members for th in closures_th])
+    
+    # compute the covariance matrix of the theory predictions for each fit
+    _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
+    # compute the mean covariance matrix
+    _covmat_mean = np.mean(_covmats, axis=0)
+    # Keep the sqrt of the diagonals to reconstruct the covmat later
+    D = np.sqrt(np.diag(_covmat_mean))
+    
+    # compute the correlation matrix of the theory predictions for each fit
+    _corrmats = [np.corrcoef(rep, rowvar=True, bias=True) for rep in reps]
+    _corrmat_mean = np.mean(_corrmats, axis=0)
+    # diagonalize the mean correlation matrix and only keep the principal components
+    # that explain the required variance
+
+    if _covmat_mean.shape == ():
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=1,
+            n_comp=1,
+            covmat_pca=_covmat_mean,
+            sqrt_covmat_pca=np.sqrt(_covmat_mean),
+        )
+
+    eighvals, eigvecs = np.linalg.eigh(_corrmat_mean)
+    idx = np.argsort(eighvals)[::-1]
+    # sort eigenvalues from largest to smallest
+    eigvecs = eigvecs[:, idx]
+    eighvals = eighvals[idx]
+    eighvals_norm = eighvals / eighvals.sum()
+
+    # choose components to keep based on EVR
+    n_comp = 1
+    for _ in range(eighvals.shape[0]):
+        if np.sum(eighvals_norm[:n_comp]) >= explained_variance_ratio:
+            break
+        n_comp += 1
+    # get the principal components
+    pc_basis = eigvecs[:, :n_comp]
+
+    # compute the (PCA) regularized correlation matrix
+    corrmat_pca = pc_basis.T @ _corrmat_mean @ pc_basis
+    # project the diagonal matrix into the PCA space 
+    D_pca = pc_basis.T @ np.diag(D) @ pc_basis
+    # compute the (PCA) regularized covariance matrix
+    covmat_pca = D_pca @ corrmat_pca @ D_pca
+    if n_comp == 1:
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=pc_basis,
+            n_comp=1,
+            covmat_pca=covmat_pca,
+            sqrt_covmat_pca=np.sqrt(covmat_pca),
+        )
+
+    # compute sqrt of pdf covariance matrix
+    sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca)
+
+    return PCAInternalMulticlosureLoader(
+        closures_th=closures_th,
+        law_th=law_th,
+        pc_basis=pc_basis,
+        n_comp=n_comp,
+        covmat_pca=covmat_pca,
+        sqrt_covmat_pca=sqrt_covmat_pca,
+    )
 
 
 def bootstrapped_internal_multiclosure_dataset_loader_pca(
@@ -277,6 +379,53 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa
 
     return biases, np.asarray(variances), pca_loader.n_comp
 
+def principal_components_bias_variance_data(internal_multiclosure_data_loader_pca):
+    """
+    Like principal_components_bias_variance_datasets but for all data
+
+    Parameters
+    ----------
+    internal_multiclosure_data_loader_pca : tuple
+        Tuple containing the results of multiclosure fits after pca regularization
+
+
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_comp: number of principal components kept
+    """
+
+    pca_loader = internal_multiclosure_data_loader_pca
+
+    reps = np.asarray([th.error_members for th in pca_loader.closures_th])
+
+    # compute bias diff and project it onto space spanned by PCs
+    delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
+
+    if pca_loader.n_comp == 1:
+        delta_bias = pca_loader.pc_basis * delta_bias
+        biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis * (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))
+    else:
+        delta_bias = pca_loader.pc_basis.T @ delta_bias
+        biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias)
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis.T @ (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs)))
+
+    return biases, np.asarray(variances), pca_loader.n_comp
+
 
 principal_components_bias_variance_datasets = collect(
     "principal_components_bias_variance_dataset", ("data",)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index eef98cdca5..892dd090c3 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -21,7 +21,7 @@
 
 
 @table
-def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
+def table_bias_variance_datasets(principal_components_bias_variance_datasets, principal_components_bias_variance_data, each_dataset):
     """
     Compute the bias, variance, ratio and sqrt(ratio) for each dataset
     and return a DataFrame with the results.
@@ -31,6 +31,9 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
 
     principal_components_bias_variance_datasets: list
         List of tuples containing the values of bias, variance and number of degrees of freedom
+    
+    principal_components_bias_variance_data: list
+        Same of principal_components_bias_variance_datasets but for all the data
 
     each_dataset: list
         List of validphys.core.DataSetSpec
@@ -41,6 +44,31 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
         DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
     """
     records = []
+    
+    # First let's do the total
+    biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data
+    bias_tot = np.mean(biases_tot)
+    variance_tot = np.mean(variances_tot)
+    rbv_tot = bias_tot / variance_tot
+    # use gaussian uncertainty propagation
+    delta_rbv_tot = np.sqrt(
+            ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2
+        )
+    sqrt_rbv_tot = np.sqrt(rbv_tot)
+    delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot)
+    records.append(
+            dict(
+                dataset="Total",
+                dof=n_comp_tot,
+                bias=bias_tot,
+                variance=variance_tot,
+                ratio=rbv_tot,
+                error_ratio=delta_rbv_tot,
+                ratio_sqrt=sqrt_rbv_tot,
+                error_ratio_sqrt=delta_sqrt_rbv_tot,
+            )
+        )
+    # Now we do dataset per dataset
     for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
         biases, variances, n_comp = pc_bias_var_dataset
         bias = np.mean(biases)

From d1808ec1a52d2a20fa93b7b4d3fb523e778d7c58 Mon Sep 17 00:00:00 2001
From: Giovanni De Crescenzo <giodecre@gmail.com>
Date: Wed, 19 Jun 2024 12:09:00 +0200
Subject: [PATCH 46/61] fixed inconsistency with single data point and suggest
 different way to compute error bands after bootstrap

---
 .../multiclosure_inconsistent_output.py             | 13 ++++++++++---
 .../src/validphys/closuretest/multiclosure.py       |  3 ++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 892dd090c3..e25cdc5d8e 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -142,14 +142,15 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
         mean_variance = df["variance"].mean()
         mean_ratio = mean_bias / mean_variance
         bootstrap_unc_bias = df["bias"].std()
-
+ 
         # gaussian error propagation for the ratio of the means uncertainty
         # only consider bias as source of uncertainty for the ratio (variance is almost constant)
         bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2)
-        sqrt_ratio = np.sqrt(mean_ratio)
+        bootstrap_unc_ratio_alt = np.std(df["bias"]/df["variance"])
+        sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"]))
         # gaussian error propagation for the sqrt of the ratio
         bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio)
-
+        bootstrap_unc_sqrt_ratio_alt = np.std(np.sqrt(df["bias"]/df["variance"]))
         records.append(
             dict(
                 dataset=df["dataset"].iloc[0],
@@ -158,8 +159,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
                 variance=mean_variance,
                 ratio=mean_ratio,
                 error_ratio=bootstrap_unc_ratio,
+                error_ratio_alt=bootstrap_unc_ratio_alt,
                 ratio_sqrt=sqrt_ratio,
                 error_ratio_sqrt=bootstrap_unc_sqrt_ratio,
+                error_ratio_sqrt_alt=bootstrap_unc_sqrt_ratio_alt
             )
         )
 
@@ -173,8 +176,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
             "variance",
             "ratio",
             "error_ratio",
+            "error_ratio_alt",
             "ratio_sqrt",
             "error_ratio_sqrt",
+            "error_ratio_sqrt_alt"
         ),
     )
     df.columns = [
@@ -183,8 +188,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
         "variance",
         "ratio",
         "error_ratio",
+        "error_ratio_alt",
         "ratio_sqrt",
         "error_ratio_sqrt",
+        "error_ratio_sqrt_alt"
     ]
 
     return df
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index 28b87baaea..a9158b8c42 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -980,7 +980,8 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
     xq2_map_obj = xq2map_with_cuts(commondata, cuts)
     coords = xq2_map_obj[2]
     central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
-    std_devs = np.std(central_deltas, axis = 0)
+    #std_devs = np.std(central_deltas, axis = 0)
+    std_devs = np.sqrt(np.mean(central_deltas**2, axis = 0))
     means = np.mean(central_deltas, axis = 0)
     xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
 

From 7c20ec808927fd292c17a2beeee258918d5673b8 Mon Sep 17 00:00:00 2001
From: Giovanni De Crescenzo <giodecre@gmail.com>
Date: Wed, 19 Jun 2024 15:19:21 +0200
Subject: [PATCH 47/61] added delta plots

---
 .../multiclosure_inconsistent.py              | 42 ++++++++++++++++++-
 .../multiclosure_inconsistent_output.py       | 35 +++++++++++++++-
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 4037315c60..164fae9e6c 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -246,7 +246,6 @@ def internal_multiclosure_data_loader_pca(
 
     # compute sqrt of pdf covariance matrix
     sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca)
-
     return PCAInternalMulticlosureLoader(
         closures_th=closures_th,
         law_th=law_th,
@@ -426,6 +425,47 @@ def principal_components_bias_variance_data(internal_multiclosure_data_loader_pc
 
     return biases, np.asarray(variances), pca_loader.n_comp
 
+def principal_components_normalized_delta_data(internal_multiclosure_data_loader_pca):
+    """
+    Compute for all data only the normalized delta after PCA regularization
+
+    Parameters
+    ----------
+    internal_multiclosure_data_loader_pca : tuple
+        Tuple containing the results of multiclosure fits after pca regularization
+
+
+    Returns
+    -------
+    nd.array: deltas
+    """
+
+    pca_loader = internal_multiclosure_data_loader_pca
+
+    reps = np.asarray([th.error_members for th in pca_loader.closures_th])
+
+    # compute bias diff and project it onto space spanned by PCs
+    delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
+    if pca_loader.n_comp == 1:
+        delta_bias = pca_loader.pc_basis * delta_bias
+        standard_deviations = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis * (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            standard_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)))
+    else:
+        delta_bias = pca_loader.pc_basis.T @ delta_bias
+        standard_deviations = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis.T @ (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            standard_deviations.append(np.std(diffs,axis=1))
+    
+
+    return (delta_bias/np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp
+
 
 principal_components_bias_variance_datasets = collect(
     "principal_components_bias_variance_dataset", ("data",)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index e25cdc5d8e..55aa40c186 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -10,9 +10,10 @@
 import pandas as pd
 import numpy as np
 
-from reportengine.figure import figuregen
+from reportengine.figure import figuregen, figure
 from reportengine.table import table
 from reportengine import collect
+from scipy.stats import norm
 
 from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
@@ -325,3 +326,35 @@ def plot_l2_condition_number(
         ax1.legend(lines, labels, loc='upper left')
 
         yield fig
+
+@figure
+def delta_histogram(principal_components_normalized_delta_data,title, label_hist=None):
+    """
+    Plot histogram of normalized delta regularized with PCA. 
+
+    Parameters
+    ----------
+    principal_components_normalized_delta_data: tuple
+
+    title: str
+        description of multiclosure
+
+    label_hist: str
+        summary description of multiclosure
+    
+
+
+    Yields
+    ------
+    fig
+        Figure object
+    """
+    fig, ax = plotutils.subplots()
+    size = np.shape(principal_components_normalized_delta_data[0])[0]
+    ax.hist(principal_components_normalized_delta_data[0],density=True,bins=int(np.sqrt(size)),label=label_hist)
+    ax.set_title(r"$\delta$ distribution, "+ title +f", dof={principal_components_normalized_delta_data[1]}")
+    x = np.linspace(-3,3,100)
+    y = norm.pdf(x,loc=0,scale=1)
+    ax.plot(x,y,label="Standard gaussian")
+    ax.legend()
+    return fig

From 6f2a8ef45bff37c6f6046e97c90503adfeec2369 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Fri, 21 Jun 2024 14:23:12 +0100
Subject: [PATCH 48/61] use consistent bootstrap def and separate table
 datasets from table data

---
 .../examples/pca_bias_variance_report.yaml    |   2 +
 .../multiclosure_inconsistent_output.py       | 120 ++++++++++++------
 2 files changed, 80 insertions(+), 42 deletions(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 428a684754..024fe417e9 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -42,6 +42,8 @@ template_text: |
   ## Table of bias and variance
   {@table_bias_variance_datasets@}
 
+  {@table_bias_variance_data@}
+
   ## L2 condition number
   {@plot_l2_condition_number@}
   
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 55aa40c186..3e9687fc5a 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -20,9 +20,8 @@
     internal_multiclosure_dataset_loader_pca,
 )
 
-
 @table
-def table_bias_variance_datasets(principal_components_bias_variance_datasets, principal_components_bias_variance_data, each_dataset):
+def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
     Compute the bias, variance, ratio and sqrt(ratio) for each dataset
     and return a DataFrame with the results.
@@ -32,9 +31,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr
 
     principal_components_bias_variance_datasets: list
         List of tuples containing the values of bias, variance and number of degrees of freedom
-    
-    principal_components_bias_variance_data: list
-        Same of principal_components_bias_variance_datasets but for all the data
 
     each_dataset: list
         List of validphys.core.DataSetSpec
@@ -45,31 +41,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr
         DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
     """
     records = []
-    
-    # First let's do the total
-    biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data
-    bias_tot = np.mean(biases_tot)
-    variance_tot = np.mean(variances_tot)
-    rbv_tot = bias_tot / variance_tot
-    # use gaussian uncertainty propagation
-    delta_rbv_tot = np.sqrt(
-            ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2
-        )
-    sqrt_rbv_tot = np.sqrt(rbv_tot)
-    delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot)
-    records.append(
-            dict(
-                dataset="Total",
-                dof=n_comp_tot,
-                bias=bias_tot,
-                variance=variance_tot,
-                ratio=rbv_tot,
-                error_ratio=delta_rbv_tot,
-                ratio_sqrt=sqrt_rbv_tot,
-                error_ratio_sqrt=delta_sqrt_rbv_tot,
-            )
-        )
-    # Now we do dataset per dataset
     for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset):
         biases, variances, n_comp = pc_bias_var_dataset
         bias = np.mean(biases)
@@ -123,6 +94,79 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr
     return df
 
 
+
+@table
+def table_bias_variance_data(principal_components_bias_variance_data):
+    """
+    Same as table_bias_variance_datasets but for all the data, meaning that
+    the correlations between the datasets are taken into account.
+
+    Parameters
+    ----------
+    principal_components_bias_variance_data: list
+        Same of principal_components_bias_variance_datasets but for all the data
+
+    each_dataset: list
+        List of validphys.core.DataSetSpec
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
+    """
+    records = []
+    
+    # First let's do the total
+    biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data
+    bias_tot = np.mean(biases_tot)
+    variance_tot = np.mean(variances_tot)
+    rbv_tot = bias_tot / variance_tot
+    # use gaussian uncertainty propagation
+    delta_rbv_tot = np.sqrt(
+            ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2
+        )
+    sqrt_rbv_tot = np.sqrt(rbv_tot)
+    delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot)
+    records.append(
+            dict(
+                dataset="Total",
+                dof=n_comp_tot,
+                bias=bias_tot,
+                variance=variance_tot,
+                ratio=rbv_tot,
+                error_ratio=delta_rbv_tot,
+                ratio_sqrt=sqrt_rbv_tot,
+                error_ratio_sqrt=delta_sqrt_rbv_tot,
+            )
+        )
+    
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=(
+            "dataset",
+            "dof",
+            "bias",
+            "variance",
+            "ratio",
+            "error_ratio",
+            "ratio_sqrt",
+            "error_ratio_sqrt",
+        ),
+    )
+    df.columns = [
+        "dof",
+        "bias",
+        "variance",
+        "ratio",
+        "error ratio",
+        "sqrt(ratio)",
+        "error sqrt(ratio)",
+    ]
+
+    return df
+
+
 lambdavalues_table_bias_variance_datasets = collect(
     "table_bias_variance_datasets", ("lambdavalues",)
 )
@@ -141,17 +185,15 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
         df = boot_ds
         mean_bias = df["bias"].mean()
         mean_variance = df["variance"].mean()
-        mean_ratio = mean_bias / mean_variance
-        bootstrap_unc_bias = df["bias"].std()
+        mean_ratio = mean_bias / mean_variance        
  
         # gaussian error propagation for the ratio of the means uncertainty
         # only consider bias as source of uncertainty for the ratio (variance is almost constant)
-        bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2)
-        bootstrap_unc_ratio_alt = np.std(df["bias"]/df["variance"])
+        bootstrap_unc_ratio = np.std(df["bias"]/df["variance"])
         sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"]))
+
         # gaussian error propagation for the sqrt of the ratio
-        bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio)
-        bootstrap_unc_sqrt_ratio_alt = np.std(np.sqrt(df["bias"]/df["variance"]))
+        bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"]/df["variance"]))
         records.append(
             dict(
                 dataset=df["dataset"].iloc[0],
@@ -160,10 +202,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
                 variance=mean_variance,
                 ratio=mean_ratio,
                 error_ratio=bootstrap_unc_ratio,
-                error_ratio_alt=bootstrap_unc_ratio_alt,
                 ratio_sqrt=sqrt_ratio,
                 error_ratio_sqrt=bootstrap_unc_sqrt_ratio,
-                error_ratio_sqrt_alt=bootstrap_unc_sqrt_ratio_alt
             )
         )
 
@@ -177,10 +217,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
             "variance",
             "ratio",
             "error_ratio",
-            "error_ratio_alt",
             "ratio_sqrt",
             "error_ratio_sqrt",
-            "error_ratio_sqrt_alt"
         ),
     )
     df.columns = [
@@ -189,10 +227,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
         "variance",
         "ratio",
         "error_ratio",
-        "error_ratio_alt",
         "ratio_sqrt",
         "error_ratio_sqrt",
-        "error_ratio_sqrt_alt"
     ]
 
     return df

From 75773d40bbe529e7f3209e06c2bfadf7a339ebdb Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Fri, 21 Jun 2024 16:00:26 +0100
Subject: [PATCH 49/61] fixed PCA of correlation matrix for full dataset

---
 .../multiclosure_inconsistent.py              | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 164fae9e6c..56af67ed0a 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -153,7 +153,8 @@ def internal_multiclosure_dataset_loader_pca(
         covmat_pca=covmat_pca,
         sqrt_covmat_pca=sqrt_covmat_pca,
     )
-    
+
+
 @check_multifit_replicas
 def internal_multiclosure_data_loader_pca(
     internal_multiclosure_data_loader,
@@ -188,20 +189,19 @@ def internal_multiclosure_data_loader_pca(
     """
     closures_th, law_th, _, _ = internal_multiclosure_data_loader
     reps = np.asarray([th.error_members for th in closures_th])
-    
+
     # compute the covariance matrix of the theory predictions for each fit
     _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
     # compute the mean covariance matrix
     _covmat_mean = np.mean(_covmats, axis=0)
     # Keep the sqrt of the diagonals to reconstruct the covmat later
     D = np.sqrt(np.diag(_covmat_mean))
-    
-    # compute the correlation matrix of the theory predictions for each fit
-    _corrmats = [np.corrcoef(rep, rowvar=True, bias=True) for rep in reps]
-    _corrmat_mean = np.mean(_corrmats, axis=0)
+
+    # compute the correlation matrix
+    _corrmat_mean = _covmat_mean / np.outer(D, D)
+
     # diagonalize the mean correlation matrix and only keep the principal components
     # that explain the required variance
-
     if _covmat_mean.shape == ():
         return PCAInternalMulticlosureLoader(
             closures_th=closures_th,
@@ -228,12 +228,10 @@ def internal_multiclosure_data_loader_pca(
     # get the principal components
     pc_basis = eigvecs[:, :n_comp]
 
-    # compute the (PCA) regularized correlation matrix
-    corrmat_pca = pc_basis.T @ _corrmat_mean @ pc_basis
-    # project the diagonal matrix into the PCA space 
-    D_pca = pc_basis.T @ np.diag(D) @ pc_basis
-    # compute the (PCA) regularized covariance matrix
-    covmat_pca = D_pca @ corrmat_pca @ D_pca
+    # compute the (PCA) regularized covariance matrix by projecting the mean covariance matrix
+    # onto the principal components
+    covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis
+
     if n_comp == 1:
         return PCAInternalMulticlosureLoader(
             closures_th=closures_th,
@@ -378,6 +376,7 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa
 
     return biases, np.asarray(variances), pca_loader.n_comp
 
+
 def principal_components_bias_variance_data(internal_multiclosure_data_loader_pca):
     """
     Like principal_components_bias_variance_datasets but for all data
@@ -425,6 +424,7 @@ def principal_components_bias_variance_data(internal_multiclosure_data_loader_pc
 
     return biases, np.asarray(variances), pca_loader.n_comp
 
+
 def principal_components_normalized_delta_data(internal_multiclosure_data_loader_pca):
     """
     Compute for all data only the normalized delta after PCA regularization
@@ -461,10 +461,9 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
             diffs = pca_loader.pc_basis.T @ (
                 reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
             )
-            standard_deviations.append(np.std(diffs,axis=1))
-    
+            standard_deviations.append(np.std(diffs, axis=1))
 
-    return (delta_bias/np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp
+    return (delta_bias / np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp
 
 
 principal_components_bias_variance_datasets = collect(

From cb61d2245d0a09f750f389ccb47ab6a74ac1cf18 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Fri, 21 Jun 2024 16:27:29 +0100
Subject: [PATCH 50/61] compute rbv scan using bootstrap uncertainty
 quantification

---
 .../multiclosure_inconsistent_output.py       | 16 +++++--
 .../lambdavalues_report.yaml                  | 48 +++++++++++++++++++
 2 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 3e9687fc5a..288647e9bd 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -221,22 +221,28 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
             "error_ratio_sqrt",
         ),
     )
+
     df.columns = [
         "mean_dof",
         "bias",
         "variance",
         "ratio",
-        "error_ratio",
-        "ratio_sqrt",
-        "error_ratio_sqrt",
+        "error ratio",
+        "sqrt(ratio)",
+        "error sqrt(ratio)",
     ]
 
     return df
 
 
+lambdavalues_bootstrapped_table_bias_variance_datasets = collect(
+    "bootstrapped_table_bias_variance_datasets", ("lambdavalues",)
+)
+
+
 @figuregen
 def plot_lambdavalues_bias_variance_values(
-    lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset
+    lambdavalues_bootstrapped_table_bias_variance_datasets, lambdavalues, each_dataset
 ):
     """
     Plot sqrt of ratio bias variance as a function of lambda for each dataset.
@@ -260,7 +266,7 @@ def plot_lambdavalues_bias_variance_values(
     for ds in each_dataset:
         fig, ax = plotutils.subplots()
         for i, lambdavalue in enumerate(lambdavalues):
-            df = lambdavalues_table_bias_variance_datasets[i]
+            df = lambdavalues_bootstrapped_table_bias_variance_datasets[i]
             df = df[df.index == str(ds)]
 
             ax.errorbar(
diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml
new file mode 100644
index 0000000000..c3b91f855c
--- /dev/null
+++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml
@@ -0,0 +1,48 @@
+meta:
+  title: Lambda Values scan multiclosure report
+  author: Mark N. Costantini
+  keywords: [inconsistencies, multiclosure]
+
+
+##################################################################
+#OTHER ANALYSIS SETTINGS
+
+fit: 230124_dis_ict_lam02_fs_122996
+theory:
+  from_: fit
+theoryid:
+  from_: theory
+
+dataset_inputs:
+  from_: fit
+  
+use_cuts: internal
+
+# bootstrap settings
+n_fit_max: 2
+n_fit: 20
+
+n_rep_max: 100
+n_rep: 60
+n_boot_multiclosure: 100
+rng_seed_mct_boot: 1234
+
+use_t0: True
+t0pdfset: 210223-mw-000_fakepdf
+
+lambdavalues:
+  - label: "LAMBDA02"
+    lambda_value: 0.2
+    use_t0: True
+    t0pdfset: 210223-mw-000_fakepdf
+    explained_variance_ratio: 0.99
+    variancepdf: 230124_dis_ict_lam02_fs_122996
+    fits:
+      - 230124_dis_ict_lam02_fs_122996
+      - 230124_dis_ict_lam02_fs_152326
+    
+
+template: ratio_biasvar_template.md
+ 
+actions_:
+  - report(main=true)

From 07b16a6a52349cc3e7427fe9b799c3534820e367 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Tue, 25 Jun 2024 15:53:38 +0100
Subject: [PATCH 51/61] added full data bootstrapped table

---
 .../examples/pca_bias_variance_report.yaml    |  15 +-
 .../multiclosure_inconsistent.py              |  66 ++++++--
 .../multiclosure_inconsistent_output.py       | 142 ++++++++++++++----
 3 files changed, 179 insertions(+), 44 deletions(-)

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
index 024fe417e9..2b53ae59a1 100644
--- a/validphys2/examples/pca_bias_variance_report.yaml
+++ b/validphys2/examples/pca_bias_variance_report.yaml
@@ -23,6 +23,17 @@ t0pdfset: 210223-mw-000_fakepdf
 
 explained_variance_ratio: 0.99
 
+##### Bootstrap parameters
+n_fit_max: 8
+n_fit: 8
+
+n_rep_max: 100
+n_rep: 60
+
+n_boot_multiclosure: 100
+rng_seed_mct_boot: 1
+
+
 fits:
   - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295
   - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750
@@ -42,7 +53,9 @@ template_text: |
   ## Table of bias and variance
   {@table_bias_variance_datasets@}
 
-  {@table_bias_variance_data@}
+
+  ## Bootstrapped table for full dataset  
+  {@bootstrapped_table_bias_variance_data@}
 
   ## L2 condition number
   {@plot_l2_condition_number@}
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 56af67ed0a..89a1679fe8 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -16,7 +16,10 @@
 from validphys.calcutils import calc_chi2
 from validphys.results import ThPredictionsResult
 from validphys.closuretest.closure_checks import check_multifit_replicas
-from validphys.closuretest.multiclosure import bootstrapped_internal_multiclosure_dataset_loader
+from validphys.closuretest.multiclosure import (
+    bootstrapped_internal_multiclosure_dataset_loader,
+    bootstrapped_internal_multiclosure_data_loader,
+)
 
 from reportengine import collect
 
@@ -311,20 +314,27 @@ def bootstrapped_internal_multiclosure_data_loader_pca(
     """
     Same as bootstrapped_internal_multiclosure_dataset_loader_pca but for all the data.
     """
-    return bootstrapped_internal_multiclosure_dataset_loader_pca(
+    # get bootstrapped internal multiclosure dataset loader
+    bootstrap_imdl = bootstrapped_internal_multiclosure_data_loader(
         internal_multiclosure_data_loader,
-        n_fit_max,
-        n_fit,
-        n_rep_max,
-        n_rep,
-        n_boot_multiclosure,
-        rng_seed_mct_boot,
-        use_repeats,
-        explained_variance_ratio,
-        _internal_max_reps,
-        _internal_min_reps,
+        n_fit_max=n_fit_max,
+        n_fit=n_fit,
+        n_rep_max=n_rep_max,
+        n_rep=n_rep,
+        n_boot_multiclosure=n_boot_multiclosure,
+        rng_seed_mct_boot=rng_seed_mct_boot,
+        use_repeats=use_repeats,
     )
 
+    # PCA regularise all the bootstrapped internal multiclosure dataset loaders
+    bootstrap_imdl_pca = [
+        internal_multiclosure_data_loader_pca(
+            imdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps
+        )
+        for imdl in bootstrap_imdl
+    ]
+    return tuple(bootstrap_imdl_pca)
+
 
 def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
     """
@@ -501,6 +511,36 @@ def bootstrapped_principal_components_bias_variance_dataset(
     return df
 
 
-bootstrapped_principal_components_bias_variance_data = collect(
+bootstrapped_principal_components_bias_variance_datasets = collect(
     "bootstrapped_principal_components_bias_variance_dataset", ("data",)
 )
+
+
+def bootstrapped_principal_components_bias_variance_data(
+    bootstrapped_internal_multiclosure_data_loader_pca,
+):
+    """
+    Computes Bias and Variance for each bootstrap sample.
+    Returns a DataFrame with the results.
+    """
+    boot_bias_var_samples = []
+    for i, boot_imdl_pca in enumerate(bootstrapped_internal_multiclosure_data_loader_pca):
+        bias, var, n_comp = principal_components_bias_variance_data(boot_imdl_pca)
+        boot_bias_var_samples.append(
+            {
+                "bias": np.mean(bias),
+                "variance": np.mean(var),
+                "n_comp": n_comp,
+                "data": "Full dataset",
+                "bootstrap_index": i,
+            }
+        )
+
+    df = pd.DataFrame.from_records(
+        boot_bias_var_samples,
+        index="bootstrap_index",
+        columns=("bootstrap_index", "dataset", "n_comp", "bias", "variance"),
+    )
+
+    df.columns = ["dataset", "n_comp", "bias", "variance"]
+    return df
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 288647e9bd..b7aa0f7ed8 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -20,6 +20,7 @@
     internal_multiclosure_dataset_loader_pca,
 )
 
+
 @table
 def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset):
     """
@@ -94,7 +95,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea
     return df
 
 
-
 @table
 def table_bias_variance_data(principal_components_bias_variance_data):
     """
@@ -115,7 +115,7 @@ def table_bias_variance_data(principal_components_bias_variance_data):
         DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset
     """
     records = []
-    
+
     # First let's do the total
     biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data
     bias_tot = np.mean(biases_tot)
@@ -123,23 +123,24 @@ def table_bias_variance_data(principal_components_bias_variance_data):
     rbv_tot = bias_tot / variance_tot
     # use gaussian uncertainty propagation
     delta_rbv_tot = np.sqrt(
-            ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2
-        )
+        ((1 / variance_tot) * np.std(biases_tot)) ** 2
+        + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2
+    )
     sqrt_rbv_tot = np.sqrt(rbv_tot)
     delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot)
     records.append(
-            dict(
-                dataset="Total",
-                dof=n_comp_tot,
-                bias=bias_tot,
-                variance=variance_tot,
-                ratio=rbv_tot,
-                error_ratio=delta_rbv_tot,
-                ratio_sqrt=sqrt_rbv_tot,
-                error_ratio_sqrt=delta_sqrt_rbv_tot,
-            )
+        dict(
+            dataset="Total",
+            dof=n_comp_tot,
+            bias=bias_tot,
+            variance=variance_tot,
+            ratio=rbv_tot,
+            error_ratio=delta_rbv_tot,
+            ratio_sqrt=sqrt_rbv_tot,
+            error_ratio_sqrt=delta_sqrt_rbv_tot,
         )
-    
+    )
+
     df = pd.DataFrame.from_records(
         records,
         index="dataset",
@@ -173,7 +174,9 @@ def table_bias_variance_data(principal_components_bias_variance_data):
 
 
 @table
-def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_bias_variance_data):
+def bootstrapped_table_bias_variance_datasets(
+    bootstrapped_principal_components_bias_variance_datasets,
+):
     """
     Compute the bias, variance, ratio and sqrt(ratio) for each dataset
     and return a DataFrame with the results.
@@ -181,19 +184,19 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
     of the bootstrap uncertainty on bias and variance.
     """
     records = []
-    for boot_ds in bootstrapped_principal_components_bias_variance_data:
+    for boot_ds in bootstrapped_principal_components_bias_variance_datasets:
         df = boot_ds
         mean_bias = df["bias"].mean()
         mean_variance = df["variance"].mean()
-        mean_ratio = mean_bias / mean_variance        
- 
+        mean_ratio = mean_bias / mean_variance
+
         # gaussian error propagation for the ratio of the means uncertainty
         # only consider bias as source of uncertainty for the ratio (variance is almost constant)
-        bootstrap_unc_ratio = np.std(df["bias"]/df["variance"])
-        sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"]))
+        bootstrap_unc_ratio = np.std(df["bias"] / df["variance"])
+        sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"]))
 
         # gaussian error propagation for the sqrt of the ratio
-        bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"]/df["variance"]))
+        bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"]))
         records.append(
             dict(
                 dataset=df["dataset"].iloc[0],
@@ -240,6 +243,75 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_
 )
 
 
+@table
+def bootstrapped_table_bias_variance_data(bootstrapped_principal_components_bias_variance_data):
+    """
+    Compute the bias, variance, ratio and sqrt(ratio) for a Datagroup
+    and return a DataFrame with the results.
+    Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation
+    of the bootstrap uncertainty on bias and variance.
+    """
+    records = []
+
+    df = bootstrapped_principal_components_bias_variance_data
+
+    mean_bias = df["bias"].mean()
+    mean_variance = df["variance"].mean()
+    mean_ratio = mean_bias / mean_variance
+
+    # gaussian error propagation for the ratio of the means uncertainty
+    # only consider bias as source of uncertainty for the ratio (variance is almost constant)
+    bootstrap_unc_ratio = np.std(df["bias"] / df["variance"])
+    sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"]))
+
+    # gaussian error propagation for the sqrt of the ratio
+    bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"]))
+    records.append(
+        dict(
+            dataset=df["dataset"].iloc[0],
+            mean_dof=df.n_comp.mean(),
+            bias=mean_bias,
+            variance=mean_variance,
+            ratio=mean_ratio,
+            error_ratio=bootstrap_unc_ratio,
+            ratio_sqrt=sqrt_ratio,
+            error_ratio_sqrt=bootstrap_unc_sqrt_ratio,
+        )
+    )
+
+    df = pd.DataFrame.from_records(
+        records,
+        index="dataset",
+        columns=(
+            "dataset",
+            "mean_dof",
+            "bias",
+            "variance",
+            "ratio",
+            "error_ratio",
+            "ratio_sqrt",
+            "error_ratio_sqrt",
+        ),
+    )
+
+    df.columns = [
+        "mean_dof",
+        "bias",
+        "variance",
+        "ratio",
+        "error ratio",
+        "sqrt(ratio)",
+        "error sqrt(ratio)",
+    ]
+
+    return df
+
+
+lambdavalues_bootstrapped_table_bias_variance_datasets = collect(
+    "bootstrapped_table_bias_variance_data", ("lambdavalues",)
+)
+
+
 @figuregen
 def plot_lambdavalues_bias_variance_values(
     lambdavalues_bootstrapped_table_bias_variance_datasets, lambdavalues, each_dataset
@@ -369,10 +441,11 @@ def plot_l2_condition_number(
 
         yield fig
 
+
 @figure
-def delta_histogram(principal_components_normalized_delta_data,title, label_hist=None):
+def delta_histogram(principal_components_normalized_delta_data, title, label_hist=None):
     """
-    Plot histogram of normalized delta regularized with PCA. 
+    Plot histogram of normalized delta regularized with PCA.
 
     Parameters
     ----------
@@ -383,7 +456,7 @@ def delta_histogram(principal_components_normalized_delta_data,title, label_hist
 
     label_hist: str
         summary description of multiclosure
-    
+
 
 
     Yields
@@ -393,10 +466,19 @@ def delta_histogram(principal_components_normalized_delta_data,title, label_hist
     """
     fig, ax = plotutils.subplots()
     size = np.shape(principal_components_normalized_delta_data[0])[0]
-    ax.hist(principal_components_normalized_delta_data[0],density=True,bins=int(np.sqrt(size)),label=label_hist)
-    ax.set_title(r"$\delta$ distribution, "+ title +f", dof={principal_components_normalized_delta_data[1]}")
-    x = np.linspace(-3,3,100)
-    y = norm.pdf(x,loc=0,scale=1)
-    ax.plot(x,y,label="Standard gaussian")
+    ax.hist(
+        principal_components_normalized_delta_data[0],
+        density=True,
+        bins=int(np.sqrt(size)),
+        label=label_hist,
+    )
+    ax.set_title(
+        r"$\delta$ distribution, "
+        + title
+        + f", dof={principal_components_normalized_delta_data[1]}"
+    )
+    x = np.linspace(-3, 3, 100)
+    y = norm.pdf(x, loc=0, scale=1)
+    ax.plot(x, y, label="Standard gaussian")
     ax.legend()
     return fig

From 099af95fb417c47d7fe9f2a104560820ddddb553 Mon Sep 17 00:00:00 2001
From: Giovanni De Crescenzo <crescenzo@pi104.itp.uni-heidelberg.de>
Date: Wed, 26 Jun 2024 15:21:37 +0200
Subject: [PATCH 52/61] slight change in def of delta hist

---
 .../multiclosure_inconsistent.py                     | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 89a1679fe8..87054e7432 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -458,22 +458,22 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
     delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
     if pca_loader.n_comp == 1:
         delta_bias = pca_loader.pc_basis * delta_bias
-        standard_deviations = []
+        variances = []
         for i in range(reps.shape[0]):
             diffs = pca_loader.pc_basis * (
                 reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
             )
-            standard_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)))
+            variances.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)))
     else:
         delta_bias = pca_loader.pc_basis.T @ delta_bias
-        standard_deviations = []
+        variances = []
         for i in range(reps.shape[0]):
             diffs = pca_loader.pc_basis.T @ (
                 reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
             )
-            standard_deviations.append(np.std(diffs, axis=1))
-
-    return (delta_bias / np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp
+            variances.append(np.var(diffs, axis=1))
+    variances = (np.mean(variances))
+    return (delta_bias / np.sqrt(variances)).flatten(), pca_loader.n_comp
 
 
 principal_components_bias_variance_datasets = collect(

From 89398ef589815ed49a3f238456e9c896faa8c4a2 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 26 Jun 2024 16:24:18 +0100
Subject: [PATCH 53/61] added definition of delta in line with eq. 2.22

---
 .../multiclosure_inconsistent.py              | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 87054e7432..ac92d43d0c 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -456,6 +456,10 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
 
     # compute bias diff and project it onto space spanned by PCs
     delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
+
+    # find basis that diagonalise covmat pca
+    eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca)
+    
     if pca_loader.n_comp == 1:
         delta_bias = pca_loader.pc_basis * delta_bias
         variances = []
@@ -463,17 +467,12 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
             diffs = pca_loader.pc_basis * (
                 reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
             )
-            variances.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)))
+            std_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)))
     else:
-        delta_bias = pca_loader.pc_basis.T @ delta_bias
-        variances = []
-        for i in range(reps.shape[0]):
-            diffs = pca_loader.pc_basis.T @ (
-                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
-            )
-            variances.append(np.var(diffs, axis=1))
-    variances = (np.mean(variances))
-    return (delta_bias / np.sqrt(variances)).flatten(), pca_loader.n_comp
+        delta_bias = eigenvects.T @ (pca_loader.pc_basis.T @ delta_bias)
+        std_deviations = np.sqrt(eigvals)[:, None]
+
+    return (delta_bias / std_deviations).flatten(), pca_loader.n_comp
 
 
 principal_components_bias_variance_datasets = collect(

From 8d71a2e2cdcfe91d1f811011cdc5c31c476a9570 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 26 Jun 2024 16:27:43 +0100
Subject: [PATCH 54/61] added definition of delta in line with eq. 2.22

---
 .../inconsistent_closuretest/multiclosure_inconsistent.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index ac92d43d0c..939e4659de 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -462,7 +462,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
     
     if pca_loader.n_comp == 1:
         delta_bias = pca_loader.pc_basis * delta_bias
-        variances = []
+        std_deviations = []
         for i in range(reps.shape[0]):
             diffs = pca_loader.pc_basis * (
                 reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)

From 8df13b2081a33e9604f07733b94c353a1f32a444 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 26 Jun 2024 19:11:05 +0100
Subject: [PATCH 55/61] added rbv scan for full dataset

---
 .../multiclosure_inconsistent_output.py       | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index b7aa0f7ed8..08d82e5ca9 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -307,7 +307,7 @@ def bootstrapped_table_bias_variance_data(bootstrapped_principal_components_bias
     return df
 
 
-lambdavalues_bootstrapped_table_bias_variance_datasets = collect(
+lambdavalues_bootstrapped_table_bias_variance_data = collect(
     "bootstrapped_table_bias_variance_data", ("lambdavalues",)
 )
 
@@ -357,6 +357,45 @@ def plot_lambdavalues_bias_variance_values(
         yield fig
 
 
+@figure
+def plot_lambdavalues_bias_variance_values_full_data(
+    lambdavalues_bootstrapped_table_bias_variance_data, lambdavalues
+):
+    """
+    Plot sqrt of ratio bias variance as a function of lambda for each dataset.
+
+    Parameters
+    ----------
+    lambdavalues_bootstrapped_table_bias_variance_data: list
+        list of data frames computed as per table_bias_variance_data.
+
+    lambdavalues: list
+        list specified in multiclosure_analysis.yaml
+
+    Yields
+    ------
+    figure
+    """
+    fig, ax = plotutils.subplots()
+    for i, lambdavalue in enumerate(lambdavalues):
+        df = lambdavalues_bootstrapped_table_bias_variance_data[i]
+
+        ax.errorbar(
+            lambdavalue["lambda_value"],
+            df["sqrt(ratio)"].values,
+            yerr=df["error sqrt(ratio)"].values,
+            color="blue",
+            fmt='o',
+        )
+        ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--")
+        ax.set_ylabel(r"$R_{bv}$")
+        ax.set_xlabel(r"$\lambda$")
+
+    ax.set_title("Full data")
+
+    return fig
+
+
 internal_multiclosure_data_collected_loader = collect(
     "internal_multiclosure_dataset_loader", ("data",)
 )

From defcd9675ad14ec89cfd33af09f817d8ed6d5fbc Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Thu, 27 Jun 2024 14:41:22 +0100
Subject: [PATCH 56/61] added bootstrapped xi indicator function for full
 dataset

---
 .../multiclosure_inconsistent.py              |  57 ++++++++-
 .../multiclosure_inconsistent_output.py       |  34 ++++++
 .../src/validphys/closuretest/multiclosure.py | 114 ++++++++++--------
 3 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 939e4659de..aa9a42f38c 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -19,6 +19,7 @@
 from validphys.closuretest.multiclosure import (
     bootstrapped_internal_multiclosure_dataset_loader,
     bootstrapped_internal_multiclosure_data_loader,
+    standard_indicator_function,
 )
 
 from reportengine import collect
@@ -459,7 +460,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
 
     # find basis that diagonalise covmat pca
     eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca)
-    
+
     if pca_loader.n_comp == 1:
         delta_bias = pca_loader.pc_basis * delta_bias
         std_deviations = []
@@ -480,6 +481,60 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
 )
 
 
+def bootstrapped_principal_components_normalized_delta_data(
+    bootstrapped_internal_multiclosure_data_loader_pca,
+):
+    """
+    Compute the normalized deltas for each bootstrap sample.
+
+    Parameters
+    ----------
+    bootstrapped_internal_multiclosure_data_loader_pca: list
+        list of tuples containing the results of multiclosure fits after pca regularization
+
+    Returns
+    -------
+    list
+        list of tuples containing the normalized deltas and the number of principal components.
+        Each tuple corresponds to a bootstrap sample.
+    """
+    normalised_deltas = []
+    for boot_imdl_pca in bootstrapped_internal_multiclosure_data_loader_pca:
+        normalised_deltas.append(principal_components_normalized_delta_data(boot_imdl_pca))
+    return normalised_deltas
+
+
+def bootstrapped_indicator_function_data(
+    bootstrapped_principal_components_normalized_delta_data, nsigma=1
+):
+    """
+    Compute the indicator function for each bootstrap sample.
+
+    Parameters
+    ----------
+    bootstrapped_principal_components_normalized_delta_data: list
+        list of tuples containing the normalized deltas and the number of principal components.
+        Each tuple corresponds to a bootstrap sample.
+
+    nsigma: int, default is 1
+
+    Returns
+    -------
+    2-D tuple:
+        list
+            list of length N_boot and entrances are arrays of dim Npca x Nfits containing the indicator function for each bootstrap sample.
+
+        float
+            average number of degrees of freedom
+    """
+    indicator_list = []
+    ndof_list = []
+    for boot, ndof in bootstrapped_principal_components_normalized_delta_data:
+        indicator_list.append(standard_indicator_function(boot, nsigma))
+        ndof_list.append(ndof)
+    return indicator_list, np.mean(np.asarray(ndof_list))
+
+
 def bootstrapped_principal_components_bias_variance_dataset(
     bootstrapped_internal_multiclosure_dataset_loader_pca, dataset
 ):
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
index 08d82e5ca9..74bf7dbe7d 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -521,3 +521,37 @@ def delta_histogram(principal_components_normalized_delta_data, title, label_his
     ax.plot(x, y, label="Standard gaussian")
     ax.legend()
     return fig
+
+
+@table
+def table_xi_indicator_function_data(bootstrapped_indicator_function_data):
+    """
+    Computes the bootstrap average and std of the indicator function for the data.
+
+    Parameters
+    ----------
+    bootstrapped_indicator_function_data: tuple
+
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the average and std of the indicator function for the data.
+    """
+    indicator_list, mean_dof = bootstrapped_indicator_function_data
+
+    # average over data and fits within the bootstrap samples
+    mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list])
+
+    # take bootstrap expectation and variance
+    mean_xi = np.mean(mean_boot_vals)
+    std_xi = np.std(mean_boot_vals)
+
+    records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)]
+
+    df = pd.DataFrame.from_records(
+        records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi")
+    )
+
+    df.columns = ["mean_dof", "mean_xi", "std_xi"]
+    return df
diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index a9158b8c42..ed1190e84a 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -7,6 +7,7 @@
 ``multiclosure_output.py``
 
 """
+
 import numpy as np
 import scipy.linalg as la
 import scipy.special as special
@@ -116,9 +117,7 @@ def internal_multiclosure_data_loader(
 
 @check_multifit_replicas
 def fits_dataset_bias_variance(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """For a single dataset, calculate the bias and variance for each fit
     and return tuple (bias, variance, n_data), where bias and variance are
@@ -149,22 +148,22 @@ def fits_dataset_bias_variance(
         variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
     return biases, np.asarray(variances), len(law_th)
 
+
 @check_multifit_replicas
 def fits_normed_dataset_central_delta(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20):
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
+):
     """
     For each fit calculate the difference between central expectation value and true val. Normalize this
     value by the variance of the differences between replicas and central expectation value (different
-    for each fit but expected to vary only a little). Each observable central exp value is 
+    for each fit but expected to vary only a little). Each observable central exp value is
     expected to be gaussianly distributed around the true value set by the fakepdf
     """
     closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
     # The dimentions here are (fit, data point, replica)
     reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
     # One could mask here some reps in order to avoid redundancy of information
-    #TODO
+    # TODO
 
     n_data = len(law_th)
     n_fits = np.shape(reps)[0]
@@ -172,21 +171,19 @@ def fits_normed_dataset_central_delta(
     # There are n_fits pdf_covariances
     # flag to see whether to eliminate dataset
     for i in range(n_fits):
-        bias_diffs = np.mean(reps[i], axis = 1) - law_th.central_value
+        bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value
         # bias diffs in the for loop should have dim = (n_obs)
-        sigmas = np.sqrt(np.var(reps[i], axis = 1))
+        sigmas = np.sqrt(np.var(reps[i], axis=1))
         # var diffs has shape (n_obs,reps)
-        bias_diffs = bias_diffs/sigmas
-        
+        bias_diffs = bias_diffs / sigmas
+
         deltas.append(bias_diffs.tolist())
         # biases.shape = (n_fits, n_obs_cut/uncut)
         # variances.shape = (n_fits, n_obs_cut/uncut, reps)
     return np.asarray(deltas)
 
 
-fits_datasets_bias_variance = collect(
-    "fits_dataset_bias_variance", ("data",)
-)
+fits_datasets_bias_variance = collect("fits_dataset_bias_variance", ("data",))
 
 
 def expected_dataset_bias_variance(fits_dataset_bias_variance):
@@ -200,9 +197,7 @@ def expected_dataset_bias_variance(fits_dataset_bias_variance):
 
 @check_multifit_replicas
 def fits_data_bias_variance(
-    internal_multiclosure_data_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """Like `fits_dataset_bias_variance` but for all data"""
     return fits_dataset_bias_variance(
@@ -350,11 +345,7 @@ def n_replica_samples(fits_pdf, _internal_max_reps=None, _internal_min_reps=20):
     _internal_max_reps.
     """
     return list(
-        range(
-            _internal_min_reps,
-            _internal_max_reps + SAMPLING_INTERVAL,
-            SAMPLING_INTERVAL,
-        )
+        range(_internal_min_reps, _internal_max_reps + SAMPLING_INTERVAL, SAMPLING_INTERVAL)
     )
 
 
@@ -370,13 +361,7 @@ def __init__(self, data):
 
 
 def _bootstrap_multiclosure_fits(
-    internal_multiclosure_dataset_loader,
-    rng,
-    n_fit_max,
-    n_fit,
-    n_rep_max,
-    n_rep,
-    use_repeats,
+    internal_multiclosure_dataset_loader, rng, n_fit_max, n_fit, n_rep_max, n_rep, use_repeats
 ):
     """Perform a single bootstrap resample of the multiclosure fits and return
     a proxy of the base internal object used by relevant estimator actions
@@ -907,9 +892,7 @@ def total_bootstrap_xi(experiments_bootstrap_xi):
 
 
 def dataset_fits_bias_replicas_variance_samples(
-    internal_multiclosure_dataset_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     """For a single dataset, calculate the samples of chi2-quantities which
     are used to calculate the bias and variance for each fit. The output of this
@@ -950,14 +933,10 @@ def dataset_fits_bias_replicas_variance_samples(
 
 
 def dataset_inputs_fits_bias_replicas_variance_samples(
-    internal_multiclosure_data_loader,
-    _internal_max_reps=None,
-    _internal_min_reps=20,
+    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
 ):
     return dataset_fits_bias_replicas_variance_samples(
-        internal_multiclosure_data_loader,
-        _internal_max_reps=None,
-        _internal_min_reps=20,
+        internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
     )
 
 
@@ -966,9 +945,13 @@ def dataset_inputs_fits_bias_replicas_variance_samples(
 )
 
 
-def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
-                        _internal_max_reps=None,
-                        _internal_min_reps=20):
+def xq2_dataset_map(
+    commondata,
+    cuts,
+    internal_multiclosure_dataset_loader,
+    _internal_max_reps=None,
+    _internal_min_reps=20,
+):
     """
     Load in a dictionary all the specs of a dataset meaning:
     - ds name
@@ -980,17 +963,46 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
     xq2_map_obj = xq2map_with_cuts(commondata, cuts)
     coords = xq2_map_obj[2]
     central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
-    #std_devs = np.std(central_deltas, axis = 0)
-    std_devs = np.sqrt(np.mean(central_deltas**2, axis = 0))
-    means = np.mean(central_deltas, axis = 0)
-    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False))
+    # std_devs = np.std(central_deltas, axis = 0)
+    std_devs = np.sqrt(np.mean(central_deltas**2, axis=0))
+    means = np.mean(central_deltas, axis=0)
+    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))
 
     # for case of DY observables we have 2 (x,Q) for each experimental point
     if coords[0].shape[0] != std_devs.shape[0]:
-        std_devs = np.concatenate((std_devs,std_devs))
-        means = np.concatenate((means,means))
-        xi = np.concatenate((xi,xi))
-    return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi}
+        std_devs = np.concatenate((std_devs, std_devs))
+        means = np.concatenate((means, means))
+        xi = np.concatenate((xi, xi))
+    return {
+        'x_coords': coords[0],
+        'Q_coords': coords[1],
+        'std_devs': std_devs,
+        'name': commondata.name,
+        'process': commondata.process_type,
+        'means': means,
+        'xi': xi,
+    }
+
 
+xq2_data_map = collect("xq2_dataset_map", ("data",))
 
-xq2_data_map = collect("xq2_dataset_map",("data",))
\ No newline at end of file
+
+def standard_indicator_function(standard_variable, nsigma=1):
+    """
+    Calculate the indicator function for a standardised variable.
+
+    Parameters
+    ----------
+    standard_variable: np.array
+        array of variables that have been standardised: (x - mu)/sigma
+
+    nsigma: float
+        number of standard deviations to consider
+
+    Returns
+    -------
+    np.array
+        array of ones and zeros. If 1 then the variable is within nsigma standard deviations
+        from the mean, otherwise it is 0.
+    """
+    return np.array(abs(standard_variable) < nsigma, dtype=int)

From e03c29ee24590b800a506d0fcec182f975d8f40a Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 24 Jul 2024 11:08:57 +0100
Subject: [PATCH 57/61] removed unused import from vp_multiclosure.py script

---
 validphys2/src/validphys/scripts/vp_multiclosure.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py
index e87a0003d4..904e861068 100644
--- a/validphys2/src/validphys/scripts/vp_multiclosure.py
+++ b/validphys2/src/validphys/scripts/vp_multiclosure.py
@@ -1,18 +1,10 @@
 import sys
-import os
 import logging
 
-#TODO: Look into making these lazy imports
-import prompt_toolkit
-from prompt_toolkit.completion import WordCompleter
-
 from reportengine.compat import yaml
-from reportengine.colors import t
 
 from validphys.app import App
-from validphys.loader import RemoteLoader
 from validphys import compareinconsistentclosuretemplates
-from validphys.promptutils import confirm, KeywordsWithCache
 
 log = logging.getLogger(__name__)
 

From 5b03cf4df5d491064712e91ee29a11f284b801ae Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 24 Jul 2024 11:10:01 +0100
Subject: [PATCH 58/61] removed new lines

---
 validphys2/src/validphys/kinematics.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py
index 554dd401b5..5644771f27 100644
--- a/validphys2/src/validphys/kinematics.py
+++ b/validphys2/src/validphys/kinematics.py
@@ -12,16 +12,9 @@
 from reportengine import collect
 from reportengine.checks import check_positive
 from reportengine.table import table
-
-
 from validphys.core import CutsPolicy
 from validphys.plotoptions import core as plotoptions_core
 
-
-
-
-
-
 log = logging.getLogger(__name__)
 
 

From ddc3c0d1acfa0f73b0a96f843bc6586788ec436b Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 24 Jul 2024 11:11:46 +0100
Subject: [PATCH 59/61] removed unused variables

---
 validphys2/src/validphys/closuretest/multiclosure.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py
index ed1190e84a..93ab6cb06c 100644
--- a/validphys2/src/validphys/closuretest/multiclosure.py
+++ b/validphys2/src/validphys/closuretest/multiclosure.py
@@ -159,13 +159,12 @@ def fits_normed_dataset_central_delta(
     for each fit but expected to vary only a little). Each observable central exp value is
     expected to be gaussianly distributed around the true value set by the fakepdf
     """
-    closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
+    closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
     # The dimentions here are (fit, data point, replica)
     reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
     # One could mask here some reps in order to avoid redundancy of information
     # TODO
 
-    n_data = len(law_th)
     n_fits = np.shape(reps)[0]
     deltas = []
     # There are n_fits pdf_covariances

From e4fe43845e3bd3943cf6a2b8953e926bfd5d0743 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 24 Jul 2024 11:23:40 +0100
Subject: [PATCH 60/61] _covmats as array instead of list of arrays

---
 .../inconsistent_closuretest/multiclosure_inconsistent.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index aa9a42f38c..60c4cf20c5 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -98,7 +98,7 @@ def internal_multiclosure_dataset_loader_pca(
     reps = np.asarray([th.error_members for th in closures_th])
 
     # compute the covariance matrix of the theory predictions for each fit
-    _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
+    _covmats = np.array([np.cov(rep, rowvar=True, bias=True) for rep in reps])
 
     # compute the mean covariance matrix
     _covmat_mean = np.mean(_covmats, axis=0)
@@ -195,7 +195,7 @@ def internal_multiclosure_data_loader_pca(
     reps = np.asarray([th.error_members for th in closures_th])
 
     # compute the covariance matrix of the theory predictions for each fit
-    _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
+    _covmats = np.array([np.cov(rep, rowvar=True, bias=True) for rep in reps])
     # compute the mean covariance matrix
     _covmat_mean = np.mean(_covmats, axis=0)
     # Keep the sqrt of the diagonals to reconstruct the covmat later

From 7cfdd9a10da65af98b51b64475a773fa881623b3 Mon Sep 17 00:00:00 2001
From: Mark Nestor Costantini <mnc33@cam.ac.uk>
Date: Wed, 24 Jul 2024 11:28:02 +0100
Subject: [PATCH 61/61] added eigendecomposition function

---
 .../multiclosure_inconsistent.py              | 40 +++++++++++++------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
index 60c4cf20c5..ad4aab6ea4 100644
--- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
+++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -25,6 +25,32 @@
 from reportengine import collect
 
 
+def eigendecomposition(covmat):
+    """
+    Compute the eigendecomposition of a covariance matrix.
+
+    Parameters
+    ----------
+    covmat: np.array
+        covariance matrix
+
+    Returns
+    -------
+    tuple
+        3D tuple containing the eigenvalues, eigenvectors and the normalized
+        eigenvalues.
+        Note that the eigenvalues are sorted from largest to smallest.
+    """
+    eighvals, eigvecs = np.linalg.eigh(covmat)
+    idx = np.argsort(eighvals)[::-1]
+    # sort eigenvalues from largest to smallest
+    eigvecs = eigvecs[:, idx]
+    eighvals = eighvals[idx]
+    eighvals_norm = eighvals / eighvals.sum()
+
+    return eighvals, eigvecs, eighvals_norm
+
+
 @dataclasses.dataclass(frozen=True)
 class PCAInternalMulticlosureLoader:
     """
@@ -116,12 +142,7 @@ def internal_multiclosure_dataset_loader_pca(
             sqrt_covmat_pca=np.sqrt(_covmat_mean),
         )
 
-    eighvals, eigvecs = np.linalg.eigh(_covmat_mean)
-    idx = np.argsort(eighvals)[::-1]
-    # sort eigenvalues from largest to smallest
-    eigvecs = eigvecs[:, idx]
-    eighvals = eighvals[idx]
-    eighvals_norm = eighvals / eighvals.sum()
+    eighvals, eigvecs, eighvals_norm = eigendecomposition(_covmat_mean)
 
     # choose components to keep based on EVR
     n_comp = 1
@@ -216,12 +237,7 @@ def internal_multiclosure_data_loader_pca(
             sqrt_covmat_pca=np.sqrt(_covmat_mean),
         )
 
-    eighvals, eigvecs = np.linalg.eigh(_corrmat_mean)
-    idx = np.argsort(eighvals)[::-1]
-    # sort eigenvalues from largest to smallest
-    eigvecs = eigvecs[:, idx]
-    eighvals = eighvals[idx]
-    eighvals_norm = eighvals / eighvals.sum()
+    eighvals, eigvecs, eighvals_norm = eigendecomposition(_corrmat_mean)
 
     # choose components to keep based on EVR
     n_comp = 1