From 9f2f9d49a9ee66e04b50a2d84718f29a84bc25f7 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 15:55:15 +0100 Subject: [PATCH 01/61] Init analysis --- validphys2/src/validphys/kinematics.py | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index 28c0217ffa..a9af6132ef 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -96,6 +96,30 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True): all_kinlimits = collect(kinlimits, ('dataset_inputs',)) +def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, + _internal_max_reps=None, + _internal_min_reps=20): + """ + Load in a dictionary all the specs of a dataset meaning: + - ds name + - ds coords + - standard deviation (in multiclosure) + - mean (in multiclosure again) + - (x,Q^2) coords + """ + xq2_map_obj = xq2map_with_cuts(commondata, cuts) + coords = xq2_map_obj[2] + central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader) + std_devs = np.std(central_deltas, axis = 0) + means = np.mean(central_deltas, axis = 0) + xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) + #import ipdb; ipdb.set_trace() + # for case of DY observables we have 2 (x,Q) for each experimental point + if coords[0].shape[0] != std_devs.shape[0]: + std_devs = np.concatenate((std_devs,std_devs)) + means = np.concatenate((means,means)) + xi = np.concatenate((xi,xi)) + return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi} @table def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True): @@ -185,6 +209,28 @@ def xq2map_with_cuts(commondata, cuts, group_name=None): dataset_inputs_by_groups_xq2map = collect( xq2map_with_cuts, ('group_dataset_inputs_by_metadata', 'data_input') ) +xq2_data_map = collect("xq2_dataset_map",("data",)) + +@figuregen +def xq2_data_prcs_maps(xq2_data_map): + import matplotlib.pyplot as plt + import matplotlib.colors + from matplotlib.colors import ListedColormap + keys = ["std_devs","xi"] + for elem in xq2_data_map: + for k in keys: + fig, ax = plotutils.subplots() + im = ax.scatter(elem['x_coords'],elem['Q_coords'], + c=(np.asarray(elem[k])), + cmap='viridis', + label = elem['name']) + fig.colorbar(im,label=k) + ax.set_xscale('log') # Set x-axis to log scale + ax.set_yscale('log') # Set y-axis to log scale + ax.set_xlabel('x') + ax.set_ylabel('Q2') + ax.set_title(elem["name"]+" "+k) + yield fig def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False): From be93df4bee36b6f07e54f7946f821f0af9aab332 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 16:15:05 +0100 Subject: [PATCH 02/61] Add other functions --- .../multiclosure_inconsistent.py | 362 ++++++++++++++++++ .../multiclosure_inconsistent_output.py | 54 +++ .../src/validphys/closuretest/multiclosure.py | 35 ++ validphys2/src/validphys/kinematics.py | 12 +- 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py create mode 100644 validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py new file mode 100644 index 0000000000..4cb12f2d30 --- /dev/null +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -0,0 +1,362 @@ +""" +closuretest/inconsistent_closuretest/multiclosure_inconsistent.py + +Module containing all of the statistical estimators which are +averaged across multiple inconsistent fits. The actions +in this module are used to produce results which are plotted in +``multiclosure_inconsistent_output.py`` + +""" + +import numpy as np +from sklearn.decomposition import PCA +from sklearn import preprocessing + +from validphys import covmats +from validphys.calcutils import calc_chi2 +from validphys.results import ThPredictionsResult + +from reportengine import collect + + +""" To load several multiclosure fits. Useful for inconsistent closure test analysis """ +multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",)) + +multi_dataset_fits_bias_replicas_variance_samples = collect( + "dataset_fits_bias_replicas_variance_samples", ("dataspecs",) +) + +multi_fits_bootstrap_dataset_bias_variance = collect( + "fits_bootstrap_dataset_bias_variance", ("dataspecs",) +) + +multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",)) + +multi_dataset_fits_bias_variance_samples_pca = collect( + "dataset_fits_bias_variance_samples_pca", ("dataspecs",) +) + + +def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93): + """ + Compute the principal components of theory predictions replica matrix + (Ndat x Nrep feature matrix). + + Parameters + ---------- + dataset: (DataSetSpec, DataGroupSpec) + dataset for which the theory predictions and t0 covariance matrix + will be loaded. Note that due to the structure of `validphys` this + function can be overloaded to accept a DataGroupSpec. + + fits_pdf: list + list of PDF objects produced from performing multiple closure tests + fits. Each fit should have a different filterseed but the same + underlying law used to generate the pseudodata. + + variancepdf: validphys.core.PDF + PDF object used to estimate the variance of the fits. + + explained_variance_ratio: float, default is 0.93 + + Returns + ------- + tuple + 2D tuple: + - matrix of the principal components (PCs) of shape (N_pc, N_dat) + - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep) + + """ + # fits_dataset_predictions = [ + # ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf + # ] + + # dimensions here are (Nfits, Ndat, Nrep) + # reps = np.asarray([th.error_members for th in fits_dataset_predictions]) + + # reshape so as to get PCs from all the samples + # reps = reps.reshape(reps.shape[1],-1) + + # get replicas from variance fit, used to estimate variance + reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members + + # rescale feature matrix + reps_scaled = reps # preprocessing.scale(reps) + + # choose number of principal components (PCs) based on explained variance ratio + n_comp = 1 + for _ in range(reps.shape[0]): + pca = PCA(n_comp).fit(reps_scaled.T) + if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio: + break + n_comp += 1 + + # project feature matrix onto PCs + pc_reps = pca.components_ @ reps + + return pca.components_, pc_reps, n_comp + + +def principal_components_bias_variance_dataset( + internal_multiclosure_dataset_loader, principal_components_dataset +): + """ + TODO + """ + + closures_th, law_th, _, _ = internal_multiclosure_dataset_loader + + reps = np.asarray([th.error_members for th in closures_th]) + + pc_basis, pc_reps, n_comp = principal_components_dataset + + if n_comp <=1: + return None, None, n_comp + + # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) + covmat_pdf = np.cov(pc_reps) + sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) + + # compute bias diff and project it onto space spanned by PCs + delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis] + # shape here is (Npc, Nfits) + delta_bias = pc_basis @ delta_bias + + # compute biases, shape of biases is (Nfits) + biases = calc_chi2(sqrt_covmat_pdf, delta_bias) + + variances = [] + for i in range(reps.shape[0]): + diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) + variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs))) + + return biases, np.asarray(variances), n_comp + + +principal_components_bias_variance_datasets = collect( + "principal_components_bias_variance_dataset", ("data",) +) + + +def principal_components_variance_distribution_dataset( + internal_multiclosure_dataset_loader, principal_components_dataset +): + """ + TODO + """ + + closures_th, _, _, _ = internal_multiclosure_dataset_loader + + reps = np.asarray([th.error_members for th in closures_th]) + + pc_basis, pc_reps, n_comp = principal_components_dataset + + if n_comp <= 1: + return None, n_comp + # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) + covmat_pdf = np.cov(pc_reps) + sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) + + variances = [] + for i in range(reps.shape[0]): + diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) + variances.append([calc_chi2(sqrt_covmat_pdf, diffs)]) + + return np.asarray(variances), n_comp + + +principal_components_variance_distribution_datasets = collect( + "principal_components_variance_distribution_dataset", ("data",) +) + + +def compute_num_components(covariance_matrix, threshold=0.99): + """ + Compute the number of principal components to keep based on a desired explained variance threshold. + + Parameters + ---------- + covariance_matrix : np.ndarray, 2-D array + + threshold : (float): Desired explained variance threshold (between 0 and 1). + + Returns + ------- + int + num_components: Number of principal components to keep. + + """ + eig_val, eig_vec = np.linalg.eig(covariance_matrix) + idx = eig_val.argsort()[::-1] + eig_val = eig_val[idx] + eig_vec = eig_vec[:, idx] + + cumulative_sum = np.cumsum(eig_val) + total_sum = np.sum(eig_val) + num_components = np.argmin(np.abs(cumulative_sum / total_sum - threshold)) + + return num_components + + +def pca_covmat(X, num_components): + """ + given data X of shape (n,p), reduce its dimension to + (n,num_components) and return the covariance matrix + of the reduced data matrix. + + Parameters + ---------- + + Returns + ------- + """ + pca = PCA(num_components) + X_reduced = pca.fit_transform(X) + covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1) + return covariance + + +def calc_chi2_pca(pdf_cov, diff, num_components): + """ + Computes the chi2 between pdf_cov and diff by first projecting + them into num_components PCs. + + Parameters + ---------- + pdf_cov: np.ndarray + + diff: np.ndarray + + num_components: int + + Returns + ------- + float or np.ndarray depending on input + + """ + # Diagonalize the matrix + L, W = np.linalg.eig(pdf_cov) + + # Sort the eigenvalues and eigenvectors from largest to smallest + idx = L.argsort()[::-1] + L = L[idx] + W = W[:, idx] + + # Keep only the n_components largest eigenvectors + Wtilde = W[:, :num_components] + + # Transform initial covariance matrix + pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real + + # transform data + diff_pca = diff.T @ Wtilde + + try: + sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca) + except: + raise ValueError( + f"PCA Covariance Matrix cannot be Cholesky decomposed, perhaps less than {num_components} PC should be kept" + ) + + return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T)) + + +def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99): + """ + similar to `dataset_fits_bias_replicas_variance_samples`. + + Returns + ------- + tuple + 3D tuple: + - biases: 1-D array of shape (Nfits,) + - variances: 1-D array of shape (Nfits, ) + - n_eig: number of eigenvalues kept in PCA, i.e., + ndata in the new, lower dimensional, space. + Note that we return Nfits values of the variance so that computing the + Bias - Variance ratio is straightforward. + """ + closures_th, law_th, _, _ = internal_multiclosure_dataset_loader + + # The dimensions here are (fit, data point, replica) + reps = np.asarray([th.error_members for th in closures_th]) + + # take mean across replicas - since we might have changed no. of reps + centrals = reps.mean(axis=2) + + # compute the PDF covariance matrix of the central samples + if centrals.shape[0] <= 1: + raise ValueError(f"Need more than one fit to compute the 'Bias' PDF covariance Matrix") + + pdf_cov_bias = np.cov(centrals.T) + + # find number of (ordered) eigenvalues that explain 99% of the total variance (total sum of eigenvalues) + n_eig = compute_num_components(pdf_cov_bias, threshold) + + # compute bias from PCs + diffs_bias = centrals.T - law_th.central_value[:, np.newaxis] + biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig) + + # compute variances from PCs + variances = [] + + # loop over fits to compute variances + for i in range(reps.shape[0]): + diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + pdf_cov_var = np.cov(reps[i, :, :]) + + variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig))) + + return biases, np.asarray(variances), n_eig + + +def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca): + """ + returns the bias and variance expected values as well as the number of + principal components. + """ + biases, variances, n_eig = dataset_fits_bias_variance_samples_pca + return np.mean(biases), np.mean(variances), n_eig + + +expected_datasets_fits_bias_variance_samples_pca = collect( + "expected_dataset_fits_bias_variance_samples_pca", ("data",) +) + + +def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca): + """ """ + biases, variances, _ = dataset_fits_bias_variance_samples_pca + sqrt_ratios = np.sqrt(biases / variances) + return sqrt_ratios + + +def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99): + """ + returns parameters of multi gaussian distribution of replicas + and central replicas + """ + closures_th, law_th, _, _ = internal_multiclosure_dataset_loader + + # The dimensions here are (fit, data point, replica) + reps = np.asarray([th.error_members for th in closures_th]) + + # take mean across replicas - since we might have changed no. of reps + centrals = reps.mean(axis=2) + + centrals_covmat = np.cov(centrals.T) + centrals_covmat = pca_covmat( + centrals, num_components=compute_num_components(centrals_covmat, threshold) + ) + mean_centrals = np.mean(centrals, axis=0) + + replicas_covmat = 0 + for i in range(reps.shape[0]): + replicas_covmat = np.cov(reps[i, :, :]) + replicas_covmat += pca_covmat( + reps[i, :, :].T, num_components=compute_num_components(replicas_covmat, threshold) + ) + replicas_covmat /= reps.shape[0] + mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1) + + return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat \ No newline at end of file diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py new file mode 100644 index 0000000000..2f057510ab --- /dev/null +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -0,0 +1,54 @@ +import numpy as np +import pandas as pd +from scipy import stats + +from reportengine.table import table + +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset + +@table +def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): + """ + TODO + """ + records = [] + for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): + biases, variances, n_comp = pc_bias_var_dataset + + try: + bias = np.mean(biases) + variance = np.mean(variances) + rbv = bias / variance + sqrt_rbv = np.sqrt(bias / variance) + records.append( + dict( + dataset=str(ds), + dof=n_comp, + bias=bias, + variance=variance, + ratio=rbv, + ratio_sqrt=sqrt_rbv, + ) + ) + + except: + records.append( + dict( + dataset=str(ds), + dof=n_comp, + bias=bias, + variance=variance, + ratio=np.nan, + ratio_sqrt=np.nan, + )) + + + + + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), + ) + df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"] + return df \ No newline at end of file diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 5b545569e2..64d26c4d69 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -147,6 +147,41 @@ def fits_dataset_bias_variance( variances.append(np.mean(calc_chi2(sqrtcov, diffs))) return biases, np.asarray(variances), len(law_th) +@check_multifit_replicas +def fits_normed_dataset_central_delta( + internal_multiclosure_dataset_loader, + _internal_max_reps=None, + _internal_min_reps=20): + """ + For each fit calculate the difference between central expectation value and true val. Normalize this + value by the variance of the differences between replicas and central expectation value (different + for each fit but expected to vary only a little). Each observable central exp value is + expected to be gaussianly distributed around the true value set by the fakepdf + """ + closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader + # The dimentions here are (fit, data point, replica) + #import ipdb; ipdb.set_trace() + reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) + # One could mask here some reps in order to avoid redundancy of information + #TODO + + n_data = len(law_th) + n_fits = np.shape(reps)[0] + deltas = [] + # There are n_fits pdf_covariances + # flag to see whether to eliminate dataset + for i in range(n_fits): + bias_diffs = np.mean(reps[i], axis = 1) - law_th.central_value + # bias diffs in the for loop should have dim = (n_obs) + sigmas = np.sqrt(np.var(reps[i], axis = 1)) + # var diffs has shape (n_obs,reps) + bias_diffs = bias_diffs/sigmas + + deltas.append(bias_diffs.tolist()) + # biases.shape = (n_fits, n_obs_cut/uncut) + # variances.shape = (n_fits, n_obs_cut/uncut, reps) + #import ipdb; ipdb.set_trace() + return np.asarray(deltas) def expected_dataset_bias_variance(fits_dataset_bias_variance): """For a given dataset calculate the expected bias and variance across fits diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index a9af6132ef..1a5e08cbed 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -12,9 +12,20 @@ from reportengine import collect from reportengine.checks import check_positive from reportengine.table import table + from validphys.core import CutsPolicy from validphys.plotoptions import core as plotoptions_core +from validphys import plotutils +from validphys import plotoptions +from validphys.core import CutsPolicy +from validphys.closuretest import fits_normed_dataset_central_delta +from validphys.closuretest import dataset_xi +from validphys.closuretest import dataset_replica_and_central_diff + +from reportengine.figure import figuregen + + log = logging.getLogger(__name__) @@ -113,7 +124,6 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, std_devs = np.std(central_deltas, axis = 0) means = np.mean(central_deltas, axis = 0) xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) - #import ipdb; ipdb.set_trace() # for case of DY observables we have 2 (x,Q) for each experimental point if coords[0].shape[0] != std_devs.shape[0]: std_devs = np.concatenate((std_devs,std_devs)) From c041049e503b18e2d6475c40a1264b44fa2514b3 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 16:33:18 +0100 Subject: [PATCH 03/61] Add other functions again --- .../multiclosure_inconsistent_output.py | 292 +++++++++++++++++- 1 file changed, 290 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 2f057510ab..33310361d1 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -1,11 +1,87 @@ -import numpy as np +""" +multiclosure_inconsistent_output + +Module containing the actions which produce some output in validphys +reports i.e figures or tables for (inconsistent) multiclosure +estimators in the space of data + +""" import pandas as pd +import numpy as np from scipy import stats +from reportengine.figure import figure, figuregen from reportengine.table import table +from validphys import plotutils from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset + +@figuregen +def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset): + """ + TODO + """ + for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): + biases, variances, n_comp = pc_bias_var_dataset + + try: + sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances)) + vals = np.linspace(0, 3 * n_comp, 100) + chi2_pdf = stats.chi2.pdf(vals, df=n_comp) + chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp) + pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue + fig, ax = plotutils.subplots() + ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}") + ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}") + ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}") + + ax.legend() + + yield fig + + except: + fig, ax = plotutils.subplots() + ax.plot([], [], label=f"Dataset: {str(ds)}") + ax.legend() + yield fig + +@figuregen +def plot_variance_distribution_datasets( + principal_components_variance_distribution_datasets, each_dataset +): + """ + TODO + """ + + for pc_var_dataset, ds in zip( + principal_components_variance_distribution_datasets, each_dataset + ): + variances, n_comp = pc_var_dataset + try: + vals = np.linspace(0, 3 * n_comp, 100) + chi2_pdf = stats.chi2.pdf(vals, df=n_comp) + chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp) + + for i, var_fit in enumerate(variances): + pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue + + fig, ax = plotutils.subplots() + ax.hist( + var_fit[0], + density=True, + bins='auto', + label=f"Variance {str(ds)}, p={pvalue_ks:.3f}", + ) + ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}") + ax.set_title(f"Fit {i}") + ax.legend() + + yield fig + except: + fig, ax = plotutils.subplots() + yield fig + @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ @@ -51,4 +127,216 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), ) df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"] - return df \ No newline at end of file + return df + + +@table +def datasets_bias_variance_pca_table( + expected_datasets_fits_bias_variance_samples_pca, each_dataset +): + """For each dataset calculate the expected bias and expected variance computed by + first projecting the covariance matrix into a lower dimensional space trough PCA. + + """ + records = [] + for ds, (bias, var, ndata) in zip( + each_dataset, expected_datasets_fits_bias_variance_samples_pca + ): + records.append( + dict( + dataset=str(ds), + ndata=ndata, + bias=bias, + variance=var, + ratio=bias / var, + ratio_sqrt=np.sqrt(bias / var), + ) + ) + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"), + ) + df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"] + return df + + +@figure +def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca): + """""" + sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca + fig, ax = plotutils.subplots() + ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio") + ax.legend() + return fig + + +@figure +def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs): + """ + histogram of the distribution of the variances (k) defined as the + distance between central replica and replica (k) in units of the + experimental covariance matrix. + If more than one group of dataspecs (e.g. consistent and inconsistent) + fits are defined, than plot comparison of these. + """ + fig, ax = plotutils.subplots() + for (_, variance_dist, _), spec in zip( + multi_dataset_fits_bias_replicas_variance_samples, dataspecs + ): + label = spec['speclabel'] + + ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label) + + ax.legend() + return fig + + +@figure +def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs): + """ + like `plot_variance_distribution_multi`, but for variance computed with the covariance matrix + predicted by the PDFs (and reduced with PCA). + """ + return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs) + + +@figure +def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs): + """ + histogram of the distribution of the biases (l) defined as the + distance between central replica and underlying law in units of the + experimental covariance matrix. + If more than one group of dataspecs (e.g. consistent and inconsistent) + fits are defined, than plot comparison of these. + """ + fig, ax = plotutils.subplots() + for (bias_dist, _, _), spec in zip( + multi_dataset_fits_bias_replicas_variance_samples, dataspecs + ): + label = spec['speclabel'] + + ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label) + + ax.legend() + return fig + + +@figure +def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs): + """ + like `plot_bias_distribution_multi`, but for variance computed with the covariance matrix + predicted by the PDFs (and reduced with PCA). + """ + return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs) + + +@figure +def plot_sqrt_ratio_distribution_pca(multi_dataset_fits_bias_variance_samples_pca, dataspecs): + """ + histogram of the distribution of the sqrt ratio of bias and variance computed with + the estimated "PDF" covariance matrix (reduced with PCA). + If more than one group of dataspecs (e.g. consistent and inconsistent) + fits are defined, than plot comparison of these. + """ + fig, ax = plotutils.subplots() + ratios = [] + labels = [] + for (bias_dist, variance_dist, _), spec in zip( + multi_dataset_fits_bias_variance_samples_pca, dataspecs + ): + labels.append(spec['speclabel']) + ratios.append(bias_dist / variance_dist) + + ax.hist(ratios, bins='auto', density=True, label=labels) + ax.legend() + return fig + + +@figure +def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs): + """ + histogram of the distribution of the biases obtained by bootstrapping with + `fits_bootstrap_dataset_bias_variance` over the existing fits. + + If more than one group of dataspecs (e.g. consistent and inconsistent) + fits are defined, than plot comparison of these. + """ + fig, ax = plotutils.subplots() + for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs): + label = spec['speclabel'] + + ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label) + + ax.legend() + return fig + + +@figure +def plot_multi_ratio_bias_variance_distribution_bootstrap( + multi_fits_bootstrap_dataset_bias_variance, dataspecs +): + """ + histogram of the ratio bias variance distribution obtained by bootstrapping bias + and variance with `fits_bootstrap_dataset_bias_variance`. + + If more than one group of dataspecs (e.g. consistent and inconsistent) + fits are defined, than plot comparison of these. + """ + fig, ax = plotutils.subplots() + for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs): + ratio = bias / variance + label = spec['speclabel'] + + ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label) + + ax.legend() + return fig + +@figuregen +def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf): + """ + TODO + """ + + evr_range = np.linspace(0.90, 0.995, 10) + + + for ds in each_dataset: + l2_cond = [] + dof = [] + + for evr in evr_range: + _, pc_reps, n_comp = principal_components_dataset(ds, fits_pdf, variancepdf, explained_variance_ratio=evr) + + covmat_pdf = np.cov(pc_reps) + + # compute condition number + l2_cond.append(np.linalg.cond(covmat_pdf)) + dof.append(n_comp) + + + + fig, ax1 = plotutils.subplots(figsize=(15,4)) + ax1.plot(evr_range, l2_cond, "b-o", label="condition number") + ax1.set_title(f"Dataset: {str(ds)}") + ax1.set_xlabel("EVR") + ax1.set_ylabel("Covariance Matrix Condition Number", color="b") + ax1.tick_params('y', color="b") + + ax2 = ax1.twinx() + + # Plot the second dataset on the right y-axis + ax2.plot(evr_range, dof, 'r-o', label="dof") + ax2.set_ylabel('Degrees of freedom', color="r") + ax2.tick_params('y', color="r") + # ax1.legend() + # ax2.legend() + lines1, labels1 = ax1.get_legend_handles_labels() + lines2, labels2 = ax2.get_legend_handles_labels() + lines = lines1 + lines2 + labels = labels1 + labels2 + ax1.legend(lines, labels, loc='upper left') + + + yield fig \ No newline at end of file From b6abb5309462d79511de66dffbbd18720e212b9b Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 15:55:15 +0100 Subject: [PATCH 04/61] Init analysis --- validphys2/src/validphys/kinematics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index 1a5e08cbed..6be188cec8 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -124,6 +124,7 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, std_devs = np.std(central_deltas, axis = 0) means = np.mean(central_deltas, axis = 0) xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) + # for case of DY observables we have 2 (x,Q) for each experimental point if coords[0].shape[0] != std_devs.shape[0]: std_devs = np.concatenate((std_devs,std_devs)) From c580a9a2bf2012bdf1f085164165a2f2d65ede96 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 16:15:05 +0100 Subject: [PATCH 05/61] Add other functions --- .../multiclosure_inconsistent_output.py | 5 ++++- validphys2/src/validphys/kinematics.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 33310361d1..b6b2a02d3f 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -82,6 +82,7 @@ def plot_variance_distribution_datasets( fig, ax = plotutils.subplots() yield fig + @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ @@ -127,6 +128,7 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), ) df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"] + return df @@ -339,4 +341,5 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf): ax1.legend(lines, labels, loc='upper left') - yield fig \ No newline at end of file + yield fig + diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index 6be188cec8..d4f1ef4f2f 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -13,9 +13,12 @@ from reportengine.checks import check_positive from reportengine.table import table + from validphys.core import CutsPolicy from validphys.plotoptions import core as plotoptions_core + + from validphys import plotutils from validphys import plotoptions from validphys.core import CutsPolicy @@ -26,6 +29,9 @@ from reportengine.figure import figuregen + + + log = logging.getLogger(__name__) From d3c99b207d67921265765ba48cfcb277c5bf570f Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 20 Feb 2024 16:33:18 +0100 Subject: [PATCH 06/61] Add other functions again --- .../multiclosure_inconsistent_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index b6b2a02d3f..e2f523afa7 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -83,6 +83,7 @@ def plot_variance_distribution_datasets( yield fig + @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ @@ -342,4 +343,3 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf): yield fig - From 82a9ff8cbbcfe5485163738a3d4849499283d528 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:19:48 +0200 Subject: [PATCH 07/61] Remove ipdbs and make sure multiclosure_inconsistent work --- validphys2/src/validphys/closuretest/__init__.py | 2 ++ validphys2/src/validphys/closuretest/multiclosure.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/closuretest/__init__.py b/validphys2/src/validphys/closuretest/__init__.py index 02950f15d6..38bc1228c5 100644 --- a/validphys2/src/validphys/closuretest/__init__.py +++ b/validphys2/src/validphys/closuretest/__init__.py @@ -11,3 +11,5 @@ from validphys.closuretest.multiclosure_pdf_output import * from validphys.closuretest.multiclosure_preprocessing import * from validphys.closuretest.multiclosure_pseudodata import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import * +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent_output import * diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 64d26c4d69..ff71e3caf0 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -160,7 +160,6 @@ def fits_normed_dataset_central_delta( """ closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader # The dimentions here are (fit, data point, replica) - #import ipdb; ipdb.set_trace() reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) # One could mask here some reps in order to avoid redundancy of information #TODO @@ -180,7 +179,6 @@ def fits_normed_dataset_central_delta( deltas.append(bias_diffs.tolist()) # biases.shape = (n_fits, n_obs_cut/uncut) # variances.shape = (n_fits, n_obs_cut/uncut, reps) - #import ipdb; ipdb.set_trace() return np.asarray(deltas) def expected_dataset_bias_variance(fits_dataset_bias_variance): From 8d635b4b037fdb9f0d6dfa3fdf2fbb1ff3e0bb2e Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Thu, 22 Feb 2024 11:39:43 +0100 Subject: [PATCH 08/61] Add multiclosure comparefits --- pyproject.toml | 1 + .../__init__.py | 3 + .../comparefits_template.md | 121 +++++++++ .../data.md | 5 + .../exponents.md | 15 ++ .../lumi.md | 9 + .../multiclosure_analysis.yaml | 244 ++++++++++++++++++ .../multiclosure_template.md | 14 + .../pca_template.md | 2 + .../pdf.md | 24 ++ .../single_point_template.md | 2 + .../src/validphys/scripts/vp_multiclosure.py | 159 ++++++++++++ 12 files changed, 599 insertions(+) create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/data.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md create mode 100644 validphys2/src/validphys/scripts/vp_multiclosure.py diff --git a/pyproject.toml b/pyproject.toml index cb3809b892..71fc4ce320 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ wiki-upload = "validphys.scripts.wiki_upload:main" vp-get = "validphys.scripts.vp_get:main" vp-comparefits = "validphys.scripts.vp_comparefits:main" vp-fitrename = "validphys.scripts.vp_fitrename:main" +vp-multiclosure = "validphys.scripts.vp_multiclosure:main" vp-checktheory = "validphys.scripts.vp_checktheory:main" vp-pdfrename = "validphys.scripts.vp_pdfrename:main" vp-pdffromreplicas = "validphys.scripts.vp_pdffromreplicas:main" diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py new file mode 100644 index 0000000000..9a64b5152a --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/__init__.py @@ -0,0 +1,3 @@ +import pathlib + +template_path = pathlib.Path(__file__).with_name('multiclosure_analysis.yaml') diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md new file mode 100644 index 0000000000..bafb0f5fa5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/comparefits_template.md @@ -0,0 +1,121 @@ +%NNPDF report comparing {@ current fit_id @} and {@ reference fit_id @} + +Summary +------- + +We are comparing: + + - {@ current fit @} (`{@ current fit_id @}`): {@ current description @} + - {@ reference fit @} (`{@ reference fit_id @}`): {@ reference description @} + + +{@ summarise_fits @} + + +t0 losses +--------- +{@ dataspecs::t0_info t0_chi2_info_table @} + +Theory covariance summary +------------------------- +{@summarise_theory_covmat_fits@} + +Dataset properties +------------------ +{@current fit_datasets_properties_table@} + +Distances +--------- +{@with Scales@} +### {@Scaletitle@} +{@with Normalize::Basespecs::PDFscalespecs::Distspecs@} +#### {@Basistitle@}, {@Xscaletitle@} +{@plot_pdfdistances@} +{@plot_pdfvardistances@} +{@endwith@} +{@endwith@} + +PDF arc-lengths +--------------- +{@Basespecs plot_arc_lengths@} + +Sum rules +--------- +{@with pdfs@} +### {@pdf@} + +#### Known sum rules + +{@sum_rules_table@} + +#### Unknown sum rules + +{@unknown_sum_rules_table@} + +{@endwith@} + +PDF plots +--------- +{@with Scales@} +[Plots at {@Scaletitle@}]({@pdf_report report@}) +{@endwith@} + +Luminosities +------------ +{@with Energies@} +[Plots at {@Energytitle@}]({@lumi_report report@}) +{@endwith@} + +Effective exponents +------------------- +[Detailed information]({@exponents_report report@}) + +Training lengths +---------------- +{@fits plot_training_length@} + +Training-validation +------------------- +{@fits plot_training_validation@} + +{@with DataGroups@} +$\chi^2$ by {@processed_metadata_group@} +---------------------------------------- +{@plot_fits_groups_data_chi2@} +{@endwith@} + + +$\chi^2$ by dataset +------------------- +### Plot +{@plot_fits_datasets_chi2@} +### Table +{@ProcessGroup fits_chi2_table(show_total=true)@} + + +{@with DataGroups@} +$\phi$ by {@processed_metadata_group@} +-------------------------------------- +{@plot_fits_groups_data_phi@} +{@endwith@} + +Dataset plots +------------- +{@with matched_datasets_from_dataspecs@} +[Plots for {@dataset_name@}]({@dataset_report report@}) +{@endwith@} + +Positivity +---------- +{@with matched_positivity_from_dataspecs@} +{@plot_dataspecs_positivity@} +{@endwith@} + +Dataset differences and cuts +---------------------------- +{@print_dataset_differences@} +{@print_different_cuts@} + +Code versions +------------- +{@fits_version_table@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md new file mode 100644 index 0000000000..c99e62f3d4 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/data.md @@ -0,0 +1,5 @@ +% Data-theory comparison for {@dataset_name@} +# Absolute +{@plot_fancy_dataspecs@} +# Normalized +{@Datanorm plot_fancy_dataspecs@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md new file mode 100644 index 0000000000..9736e70775 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/exponents.md @@ -0,0 +1,15 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Effective preprocessing information + +## Plots +### alpha exponent +{@current::basisfromfit plot_alpha_eff@} +### beta exponent +{@current::basisfromfit plot_beta_eff@} + +## Tables +{@with fits@} +### Next effective exponents for {@fit@} +{@effective_exponents_table@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md new file mode 100644 index 0000000000..a077a4a21f --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lumi.md @@ -0,0 +1,9 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# Luminosity plots + +{@with Normalize@} +{@with lumi_channels@} +{@plot_lumi1d@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml new file mode 100644 index 0000000000..dd4a2b92a5 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -0,0 +1,244 @@ +meta: + title: Multiclosure report + author: AB + keywords: [inconsistencies, multiclosure] + +#COMPAREFITS SETTINGS + +positivity: + from_: fit + +description: + from_: fit + +dataset_inputs: + from_: fit + +t0_info: + - use_t0: True + datacuts: + from_: fit + t0pdfset: + from_: datacuts +compare_settings: + current: + fit: {id: "240210_mnc_dis_ict_lam02"} + pdf: {id: "240210_mnc_dis_ict_lam02", label: "Current Fit"} + theory: + from_: fit + theoryid: + from_: theory + speclabel: "Current Fit" + + reference: + fit: {id: "240210_mnc_dis_ict_lam04"} + pdf: {id: "240210_mnc_dis_ict_lam04", label: "Reference Fit" } + theory: + from_: fit + theoryid: + from_: theory + speclabel: "Reference Fit" + + pdfs: + - from_: current + - from_: reference + + fits: + - from_: current + - from_: reference + + use_cuts: "fromfit" + use_weights_in_covmat: False + + Q: 1.651 + + Scales: + - Q: 1.651 + Scaletitle: "Q = 1.65 GeV" + - Q: 100 + Scaletitle: "Q = 100 GeV" + + PDFnormalize: + - Normtitle: Absolute + + - normalize_to: 2 + Normtitle: Ratio + + Basespecs: + - basis: CCBAR_ASYMM_FLAVOUR + Basistitle: Flavour basis + - basis: CCBAR_ASYMM + Basistitle: Evolution basis + + PDFscalespecs: + - xscale: log + Xscaletitle: Log + - xscale: linear + Xscaletitle: Linear + + Energies: + - sqrts: 13000 + Energytitle: "13 TeV" + + lumi_channels: + - gg + - gq + - qq + - qqbar + - uubar + - ddbar + - udbar + - dubar + + Distspecs: + - ymin: 0 + ymax: 20 + + pos_use_kin: True + + dataset_report: + meta: Null + template: data.md + + pdf_report: + meta: Null + template: pdf.md + + exponents_report: + meta: Null + template: exponents.md + + lumi_report: + meta: Null + template: lumi.md + + dataspecs: + - theoryid: + from_: current + pdf: + from_: current + fit: + from_: current + speclabel: + from_: current + + - theoryid: + from_: reference + pdf: + from_: reference + fit: + from_: reference + speclabel: + from_: reference + + Normalize: + normalize_to: 2 + + Datanorm: + normalize_to: data + + DataGroups: + - metadata_group: nnpdf31_process + - metadata_group: experiment + + ProcessGroup: + metadata_group: nnpdf31_process + +################################################################## +#OTHER ANALYSIS SETTINGS + +lambdavalues: + - label: "LAMBDA02" + fit: 230124_dis_ict_lam02_fs_122996 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 230124_dis_ict_lam02_fs_122996 + - 230124_dis_ict_lam02_fs_152326 + - 230124_dis_ict_lam02_fs_196144 + - 230124_dis_ict_lam02_fs_200988 + - 230124_dis_ict_lam02_fs_257767 + - 230124_dis_ict_lam02_fs_31144 + - 230124_dis_ict_lam02_fs_374984 + - 230124_dis_ict_lam02_fs_383440 + - 230124_dis_ict_lam02_fs_394255 + - 230124_dis_ict_lam02_fs_431574 + - 230124_dis_ict_lam02_fs_498979 + - 230124_dis_ict_lam02_fs_504344 + - 230124_dis_ict_lam02_fs_52722 + - 230124_dis_ict_lam02_fs_54647 + - 230124_dis_ict_lam02_fs_562489 + - 230124_dis_ict_lam02_fs_57171 + - 230124_dis_ict_lam02_fs_661665 + - 230124_dis_ict_lam02_fs_683404 + - 230124_dis_ict_lam02_fs_752096 + - 230124_dis_ict_lam02_fs_760929 + - 230124_dis_ict_lam02_fs_786382 + - 230124_dis_ict_lam02_fs_834248 + - 230124_dis_ict_lam02_fs_868248 + - 230124_dis_ict_lam02_fs_897760 + - 230124_dis_ict_lam02_fs_916690 + - 230124_dis_ict_lam02_fs_966390 + - label: "LAMBDA04" + fit: 250124_dis_ict_lam04_fs_102661 + dataset_inputs: + from_: fit + theory: + from_: fit + theoryid: + from_: theory + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + use_cuts: "internal" + explained_variance_ratio: 0.99 + variancepdf: 230708_mnc_dis_pt1_ct_1000 + fits: + - 250124_dis_ict_lam04_fs_102661 + - 250124_dis_ict_lam04_fs_166247 + - 250124_dis_ict_lam04_fs_194869 + - 250124_dis_ict_lam04_fs_235399 + - 250124_dis_ict_lam04_fs_259799 + - 250124_dis_ict_lam04_fs_280552 + - 250124_dis_ict_lam04_fs_362871 + - 250124_dis_ict_lam04_fs_38346 + - 250124_dis_ict_lam04_fs_392751 + - 250124_dis_ict_lam04_fs_394227 + - 250124_dis_ict_lam04_fs_473619 + - 250124_dis_ict_lam04_fs_475803 + - 250124_dis_ict_lam04_fs_5238 + - 250124_dis_ict_lam04_fs_544453 + - 250124_dis_ict_lam04_fs_547187 + - 250124_dis_ict_lam04_fs_561201 + - 250124_dis_ict_lam04_fs_630491 + - 250124_dis_ict_lam04_fs_691489 + - 250124_dis_ict_lam04_fs_717426 + - 250124_dis_ict_lam04_fs_833288 + - 250124_dis_ict_lam04_fs_85029 + - 250124_dis_ict_lam04_fs_856511 + - 250124_dis_ict_lam04_fs_895365 + - 250124_dis_ict_lam04_fs_902027 + - 250124_dis_ict_lam04_fs_911964 + - 250124_dis_ict_lam04_fs_99031 + + +template: multiclosure_template.md +comparefits_report: + meta: Null + template: comparefits_template.md +single_point_report: + meta: Null + template: single_point_template.md +pca_report: + meta: Null + template: pca_template.md +actions_: + - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md new file mode 100644 index 0000000000..7ad062bd09 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -0,0 +1,14 @@ +{@ with compare_settings @} +[Comparefits report]({@comparefits_report report@}) +{@ endwith @} + +Single data point analysis +-------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@single_point_report report@}) +{@endwith@} +Bias Variance Ratio Table +------------------------- +{@with lambdavalues@} +[Lambda value of {@label@}]({@pca_report report@}) +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md new file mode 100644 index 0000000000..6269098614 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -0,0 +1,2 @@ +# PCA analysis +{@table_bias_variance_datasets@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md new file mode 100644 index 0000000000..54cdc4bd87 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pdf.md @@ -0,0 +1,24 @@ +%NNPDF report comparing {@ current fit @} and {@ reference fit @} + +# PDF plots + +## PDF comparison +{@with PDFnormalize@} +### {@Normtitle@} +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfs@} +{@endwith@} +{@endwith@} +{@endwith@} + +## PDF replicas +{@with Basespecs@} +#### {@Basistitle@} +{@with PDFscalespecs@} +##### {@Xscaletitle@} +{@plot_pdfreplicas@} +{@endwith@} +{@endwith@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md new file mode 100644 index 0000000000..8a8e5e8808 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/single_point_template.md @@ -0,0 +1,2 @@ +# Single data point analysis +{@xq2_data_prcs_maps@} diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py new file mode 100644 index 0000000000..5d1aa7013a --- /dev/null +++ b/validphys2/src/validphys/scripts/vp_multiclosure.py @@ -0,0 +1,159 @@ +import sys +import os +import logging + +#TODO: Look into making these lazy imports +import prompt_toolkit +from prompt_toolkit.completion import WordCompleter + +from reportengine.compat import yaml +from reportengine.colors import t + +from validphys.app import App +from validphys.loader import RemoteLoader +from validphys import compareinconsistentclosuretemplates +from validphys.promptutils import confirm, KeywordsWithCache + +log = logging.getLogger(__name__) + +CURRENT_FIT_LABEL_DEFAULT = "Current Fit" +REFERENCE_FIT_LABEL_DEFAULT = "Reference Fit" + + + +class CompareFitApp(App): + def add_positional_arguments(self, parser): + parser.add_argument( + 'current_fit', + default=None, + nargs='?', + help="The fit to produce the report for.", + ) + parser.add_argument( + 'reference_fit', + default=None, + nargs='?', + help="The fit to compare with.") + # Group together mandatory arguments that are not positional + mandatory = parser.add_argument_group("mandatory", "Mandatory command line arguments") + mandatory.add_argument( + '--title', help="The title that will be indexed with the report.") + mandatory.add_argument('--author', help="The author of the report.") + mandatory.add_argument( + '--keywords', nargs='+', help="keywords to index the report with.") + parser.add_argument( + '--current_fit_label', + nargs='?', + default=CURRENT_FIT_LABEL_DEFAULT, + help="The label for the fit that the report is being produced for.", + ) + parser.add_argument( + '--reference_fit_label', + nargs='?', + default=REFERENCE_FIT_LABEL_DEFAULT, + help="The label for the fit that is being compared to.") + parser.add_argument( + '--lambdasettings_path', + help="The path to the yaml file containing the settings for the lambdavalues report.") + + parser.set_defaults() + + def try_complete_args(self): + args = self.args + argnames = ( + 'current_fit', 'reference_fit', 'title', 'author', 'keywords') + optionalnames = ( + 'current_fit_label', 'reference_fit_label') + badargs = [argname for argname in argnames if not args[argname]] + bad = badargs + if bad: + sys.exit(f"The following arguments are required: {bad}") + texts = '\n'.join( + f' {argname.replace("_", " ").capitalize()}: {args[argname]}' + for argname in [*argnames, *optionalnames]) + log.info(f"Starting NNPDF fit comparison:\n{texts}") + + def get_commandline_arguments(self, cmdline=None): + args = super().get_commandline_arguments(cmdline) + # This is needed because the environment wants to know how to resolve + # the relative paths to find the templates. Best to have the template + # look as much as possible as a runcard passed from the command line + args['config_yml'] = compareinconsistentclosuretemplates.template_path + return args + + def complete_mapping(self): + args = self.args + autosettings = {} + shortsettings = {} + autosettings['meta'] = { + 'title': args['title'], + 'author': args['author'], + 'keywords': args['keywords'] + } + currentmap = {'id': args['current_fit'], 'label': args['current_fit_label']} + shortsettings['current'] = { + 'fit': currentmap, + 'pdf': currentmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['current_fit_label'] + } + refmap = {'id': args['reference_fit'], 'label': args['reference_fit_label']} + shortsettings['reference'] = { + 'fit': refmap, + 'pdf': refmap, + 'theory': { + 'from_': 'fit' + }, + 'theoryid': { + 'from_': 'theory' + }, + 'speclabel': args['reference_fit_label'] + } + autosettings["compare_settings"] = shortsettings + return autosettings + + def complete_lambdavaluesmapping(self): + args = self.args + # opening conf file + with open(args['lambdasettings_path']) as f: + settings = yaml.safe_load(f) + autosettings = {} + list_lambdas = [] + for lambdas in settings: + lambdasetting = {} + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdasetting[set] = lambdas[set] + list_lambdas.append(lambdasetting) + autosettings["lambdavalues"] = list_lambdas + return autosettings + + def get_config(self): + self.try_complete_args() + #No error handling here because this is our internal file + with open(self.args['config_yml']) as f: + #TODO: Ideally this would load round trip but needs + #to be fixed in reportengine. + c = yaml.safe_load(f) + complete_mapping = self.complete_mapping() + complete_mapping.update(self.complete_lambdavaluesmapping()) + c["meta"] = complete_mapping["meta"] + for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]): + for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + lambdas[set] = lambdas_mapping[set] + for set in ["current", "reference"]: + c["compare_settings"][set] = complete_mapping["compare_settings"][set] + return self.config_class(c, environment=self.environment) + + +def main(): + a = CompareFitApp() + a.main() + + +if __name__ == '__main__': + main() From 0b6acae10b1901b5203ecca5cc57e3199328eb80 Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Mon, 26 Feb 2024 10:48:48 +0100 Subject: [PATCH 09/61] Add sklearn to pyproject --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 71fc4ce320..06b53483ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,7 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} +scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From e6936db384d87b94271aa634b499329ec31bb7cd Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:21:03 +0200 Subject: [PATCH 10/61] Scikit added to conda --- conda-recipe/meta.yaml | 1 + pyproject.toml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 9d1b6376e3..8c6b1dd12c 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -44,6 +44,7 @@ requirements: - sphinx_rtd_theme >0.5 - sphinxcontrib-bibtex - ruamel.yaml <0.18 + - scikit-learn = "^1.4.1" test: requires: diff --git a/pyproject.toml b/pyproject.toml index 06b53483ba..71fc4ce320 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,7 +97,6 @@ fiatlux = {version = "*", optional = true} # without lhapdf pdfflow = {version = "^1.2.1", optional = true} lhapdf-management = {version = "^0.5", optional = true} -scikit-learn = "^1.4.1" # Optional dependencies [tool.poetry.extras] From baa99a3502465954ddb51cea5a4d058060db8ddd Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 11:51:27 +0000 Subject: [PATCH 11/61] added PCA bias variance example to validphys2/examples --- .../examples/pca_bias_variance_report.yaml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 validphys2/examples/pca_bias_variance_report.yaml diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml new file mode 100644 index 0000000000..5c355a8092 --- /dev/null +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -0,0 +1,44 @@ +meta: + title: PCA report for bias and variance of multiclosure fits. + author: Mark N. Costantini + keywords: [multiclosure, PCA, bias, variance] + + +dataset_inputs: + - {dataset: HERA_NC_225GEV_EP-SIGMARED, variant: legacy} + - {dataset: HERA_NC_251GEV_EP-SIGMARED, variant: legacy} + - {dataset: HERA_NC_300GEV_EP-SIGMARED, variant: legacy} + - {dataset: HERA_NC_318GEV_EP-SIGMARED, variant: legacy} + - {dataset: HERA_CC_318GEV_EM-SIGMARED, variant: legacy} + - {dataset: HERA_CC_318GEV_EP-SIGMARED, variant: legacy} + + +# variance pdf is used to compute the variance of fits +# the assumption is that fits have the same variance (needs to be checked) +variancepdf: 230124_dis_ict_lam02_fs_122996 + + +theoryid: 200 +use_cuts: internal +use_t0: True +t0pdfset: 210223-mw-000_fakepdf + + +explained_variance_ratio: 0.99 + +fits: + - 230124_dis_ict_lam02_fs_122996 + - 230124_dis_ict_lam02_fs_196144 + - 230124_dis_ict_lam02_fs_257767 + + +template_text: | + + PCA report for bias and variance of multiclosure fits. + ------------------------------------------------------ + + {@table_bias_variance_datasets@} + + +actions_: + - report(main=true) \ No newline at end of file From a7399641dc32edb275132d32b8062c3119d0c664 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 14:44:14 +0000 Subject: [PATCH 12/61] l2 condition number plot --- .../multiclosure_inconsistent.py | 6 +- .../multiclosure_inconsistent_output.py | 59 +++++++++++++------ 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 4cb12f2d30..899e50252b 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -110,9 +110,9 @@ def principal_components_bias_variance_dataset( pc_basis, pc_reps, n_comp = principal_components_dataset - if n_comp <=1: + if n_comp <= 1: return None, None, n_comp - + # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) covmat_pdf = np.cov(pc_reps) sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) @@ -359,4 +359,4 @@ def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, thres replicas_covmat /= reps.shape[0] mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1) - return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat \ No newline at end of file + return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index e2f523afa7..d15de1c3ae 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -297,14 +297,38 @@ def plot_multi_ratio_bias_variance_distribution_bootstrap( return fig @figuregen -def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf): - """ - TODO +def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20): """ + Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio. + The plot gives an idea of the stability of the covariance matrix as a function of the + exaplained variance ratio and hence the number of principal components used to reduce the dimensionality. - evr_range = np.linspace(0.90, 0.995, 10) - + The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this + threshold number (and the derived explained variance ratio) should be chosen so that + + relative error in output (inverse covmat) <= relative error in input (covmat) * condition number + + Parameters + ---------- + each_dataset : list + List of datasets + + fits_pdf: list + list of validphys.core.PDF objects + + variancepdf: validphys.core.PDF + PDF object for the variance + + Yields + ------ + fig + Figure object + """ + + # Explained variance ratio range + evr_range = np.linspace(evr_min, evr_max, evr_n) + for ds in each_dataset: l2_cond = [] dof = [] @@ -318,28 +342,27 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf): l2_cond.append(np.linalg.cond(covmat_pdf)) dof.append(n_comp) - - - fig, ax1 = plotutils.subplots(figsize=(15,4)) - ax1.plot(evr_range, l2_cond, "b-o", label="condition number") + fig, ax1 = plotutils.subplots() + ax1.plot(evr_range, l2_cond, label="L2 Condition Number") ax1.set_title(f"Dataset: {str(ds)}") - ax1.set_xlabel("EVR") - ax1.set_ylabel("Covariance Matrix Condition Number", color="b") - ax1.tick_params('y', color="b") + ax1.set_xlabel("Explained Variance Ratio") + ax1.set_ylabel("Covariance Matrix Condition Number") + ax1.tick_params('y') + + # Draw horizontal line for threshold L2 condition number + ax1.axhline(y=100, color='g', linestyle='--', label="Threshold L2 condition number") ax2 = ax1.twinx() # Plot the second dataset on the right y-axis - ax2.plot(evr_range, dof, 'r-o', label="dof") - ax2.set_ylabel('Degrees of freedom', color="r") - ax2.tick_params('y', color="r") - # ax1.legend() - # ax2.legend() + ax2.plot(evr_range, dof, color="r", label="Degrees of freedom") + ax2.set_ylabel('Degrees of freedom') + ax2.tick_params('y') + lines1, labels1 = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() lines = lines1 + lines2 labels = labels1 + labels2 ax1.legend(lines, labels, loc='upper left') - yield fig From 72ad203123d351dc68113d2e4ab1247ab9bc2da4 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 14:44:28 +0000 Subject: [PATCH 13/61] added l2 condition number plot --- validphys2/examples/pca_bias_variance_report.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 5c355a8092..e2f66f44fb 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -37,8 +37,12 @@ template_text: | PCA report for bias and variance of multiclosure fits. ------------------------------------------------------ + ## Table of bias and variance {@table_bias_variance_datasets@} + ## L2 condition number + {@plot_l2_condition_number@} + actions_: - report(main=true) \ No newline at end of file From 190911431cfad953b7f07e1d7c4226481b8e7c9d Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 15:17:09 +0000 Subject: [PATCH 14/61] removed unused functions pca_covmat, dataset_fits_gaussian_parameters --- .../multiclosure_inconsistent.py | 49 ------------------- .../multiclosure_inconsistent_output.py | 2 +- 2 files changed, 1 insertion(+), 50 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 899e50252b..66a3f15112 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -198,24 +198,6 @@ def compute_num_components(covariance_matrix, threshold=0.99): return num_components -def pca_covmat(X, num_components): - """ - given data X of shape (n,p), reduce its dimension to - (n,num_components) and return the covariance matrix - of the reduced data matrix. - - Parameters - ---------- - - Returns - ------- - """ - pca = PCA(num_components) - X_reduced = pca.fit_transform(X) - covariance = np.dot(X_reduced.T, X_reduced) / (X_reduced.shape[0] - 1) - return covariance - - def calc_chi2_pca(pdf_cov, diff, num_components): """ Computes the chi2 between pdf_cov and diff by first projecting @@ -329,34 +311,3 @@ def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samp biases, variances, _ = dataset_fits_bias_variance_samples_pca sqrt_ratios = np.sqrt(biases / variances) return sqrt_ratios - - -def dataset_fits_gaussian_parameters(internal_multiclosure_dataset_loader, threshold=0.99): - """ - returns parameters of multi gaussian distribution of replicas - and central replicas - """ - closures_th, law_th, _, _ = internal_multiclosure_dataset_loader - - # The dimensions here are (fit, data point, replica) - reps = np.asarray([th.error_members for th in closures_th]) - - # take mean across replicas - since we might have changed no. of reps - centrals = reps.mean(axis=2) - - centrals_covmat = np.cov(centrals.T) - centrals_covmat = pca_covmat( - centrals, num_components=compute_num_components(centrals_covmat, threshold) - ) - mean_centrals = np.mean(centrals, axis=0) - - replicas_covmat = 0 - for i in range(reps.shape[0]): - replicas_covmat = np.cov(reps[i, :, :]) - replicas_covmat += pca_covmat( - reps[i, :, :].T, num_components=compute_num_components(replicas_covmat, threshold) - ) - replicas_covmat /= reps.shape[0] - mean_replicas = np.mean(reps.reshape(reps.shape[1], -1), axis=1) - - return mean_centrals, centrals_covmat, mean_replicas, replicas_covmat diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index d15de1c3ae..ac211d9af6 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -307,7 +307,7 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, threshold number (and the derived explained variance ratio) should be chosen so that relative error in output (inverse covmat) <= relative error in input (covmat) * condition number - + Note that in a closure test the relative error in the covariance matrix is very small and only numerical. Parameters ---------- From 66f293fdbaa464dbf3f3b28abe0d9850bce5d68f Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 15:18:46 +0000 Subject: [PATCH 15/61] updated multiclosure analysis with l2 plots --- .../compareinconsistentclosuretemplates/pca_template.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md index 6269098614..2f308bd701 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -1,2 +1,7 @@ # PCA analysis + +## Table of bias and variance {@table_bias_variance_datasets@} + +## L2 condition number +{@plot_l2_condition_number@} From 90c553a47d5bc3e1fad2ecd10651dd533d2ed112 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 15:32:03 +0000 Subject: [PATCH 16/61] variance pdf has to be taken from the same fits --- .../multiclosure_analysis.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index dd4a2b92a5..33c790f42a 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -160,7 +160,7 @@ lambdavalues: t0pdfset: 210223-mw-000_fakepdf use_cuts: "internal" explained_variance_ratio: 0.99 - variancepdf: 230708_mnc_dis_pt1_ct_1000 + variancepdf: 230124_dis_ict_lam02_fs_122996 fits: - 230124_dis_ict_lam02_fs_122996 - 230124_dis_ict_lam02_fs_152326 @@ -200,7 +200,7 @@ lambdavalues: t0pdfset: 210223-mw-000_fakepdf use_cuts: "internal" explained_variance_ratio: 0.99 - variancepdf: 230708_mnc_dis_pt1_ct_1000 + variancepdf: 250124_dis_ict_lam04_fs_102661 fits: - 250124_dis_ict_lam04_fs_102661 - 250124_dis_ict_lam04_fs_166247 From d24545130786197259d07d1803c026560d941198 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 16:03:15 +0000 Subject: [PATCH 17/61] use np.nan and set default EVR to 0.99 --- .../multiclosure_inconsistent.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 66a3f15112..9ee3e189e6 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -37,7 +37,7 @@ ) -def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.93): +def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99): """ Compute the principal components of theory predictions replica matrix (Ndat x Nrep feature matrix). @@ -62,21 +62,12 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia Returns ------- tuple - 2D tuple: + 3D tuple: - matrix of the principal components (PCs) of shape (N_pc, N_dat) - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep) + - N_pc: number of principal components kept """ - # fits_dataset_predictions = [ - # ThPredictionsResult.from_convolution(pdf, dataset) for pdf in fits_pdf - # ] - - # dimensions here are (Nfits, Ndat, Nrep) - # reps = np.asarray([th.error_members for th in fits_dataset_predictions]) - - # reshape so as to get PCs from all the samples - # reps = reps.reshape(reps.shape[1],-1) - # get replicas from variance fit, used to estimate variance reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members @@ -111,7 +102,7 @@ def principal_components_bias_variance_dataset( pc_basis, pc_reps, n_comp = principal_components_dataset if n_comp <= 1: - return None, None, n_comp + return np.nan, np.nan, n_comp # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) covmat_pdf = np.cov(pc_reps) From d09018a87ab246fa2d400803b4b2e6b7a62e1d5b Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 16:04:43 +0000 Subject: [PATCH 18/61] cleaned bias variance table --- .../multiclosure_inconsistent_output.py | 101 ++++++++++-------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index ac211d9af6..009dbb07e8 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -6,6 +6,7 @@ estimators in the space of data """ + import pandas as pd import numpy as np from scipy import stats @@ -14,7 +15,9 @@ from reportengine.table import table from validphys import plotutils -from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import principal_components_dataset +from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import ( + principal_components_dataset, +) @figuregen @@ -24,7 +27,7 @@ def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, """ for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset - + try: sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances)) vals = np.linspace(0, 3 * n_comp, 100) @@ -39,13 +42,14 @@ def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, ax.legend() yield fig - + except: fig, ax = plotutils.subplots() ax.plot([], [], label=f"Dataset: {str(ds)}") ax.legend() yield fig + @figuregen def plot_variance_distribution_datasets( principal_components_variance_distribution_datasets, each_dataset @@ -83,22 +87,35 @@ def plot_variance_distribution_datasets( yield fig - @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ - TODO + Compute the bias, variance, ratio and sqrt(ratio) for each dataset + and return a DataFrame with the results. + + Parameters + ---------- + + principal_components_bias_variance_datasets: list + List of tuples containing the values of bias, variance and number of degrees of freedom + + each_dataset: list + List of validphys.core.DataSetSpec + + Returns + ------- + pd.DataFrame + DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset - - try: - bias = np.mean(biases) - variance = np.mean(variances) - rbv = bias / variance - sqrt_rbv = np.sqrt(bias / variance) - records.append( + + bias = np.mean(biases) + variance = np.mean(variances) + rbv = bias / variance + sqrt_rbv = np.sqrt(bias / variance) + records.append( dict( dataset=str(ds), dof=n_comp, @@ -108,26 +125,12 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea ratio_sqrt=sqrt_rbv, ) ) - - except: - records.append( - dict( - dataset=str(ds), - dof=n_comp, - bias=bias, - variance=variance, - ratio=np.nan, - ratio_sqrt=np.nan, - )) - - - df = pd.DataFrame.from_records( - records, - index="dataset", - columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), - ) + records, + index="dataset", + columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), + ) df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"] return df @@ -296,11 +299,14 @@ def plot_multi_ratio_bias_variance_distribution_bootstrap( ax.legend() return fig + @figuregen -def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20): +def plot_l2_condition_number( + each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20 +): """ Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio. - The plot gives an idea of the stability of the covariance matrix as a function of the + The plot gives an idea of the stability of the covariance matrix as a function of the exaplained variance ratio and hence the number of principal components used to reduce the dimensionality. The ideal explained variance ratio is chosen based on a threshold L2 condition number, in general this @@ -313,35 +319,42 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, ---------- each_dataset : list List of datasets - + fits_pdf: list list of validphys.core.PDF objects - + variancepdf: validphys.core.PDF PDF object for the variance - + Yields ------ fig Figure object """ - + # Explained variance ratio range evr_range = np.linspace(evr_min, evr_max, evr_n) - + for ds in each_dataset: l2_cond = [] dof = [] for evr in evr_range: - _, pc_reps, n_comp = principal_components_dataset(ds, fits_pdf, variancepdf, explained_variance_ratio=evr) - + _, pc_reps, n_comp = principal_components_dataset( + ds, fits_pdf, variancepdf, explained_variance_ratio=evr + ) + covmat_pdf = np.cov(pc_reps) - - # compute condition number - l2_cond.append(np.linalg.cond(covmat_pdf)) + + # if the number of principal components is 1 then the covariance matrix is a scalar + # and the condition number is not computed + if n_comp <= 1: + l2_cond.append(np.nan) + else: + l2_cond.append(np.linalg.cond(covmat_pdf)) + dof.append(n_comp) - + fig, ax1 = plotutils.subplots() ax1.plot(evr_range, l2_cond, label="L2 Condition Number") ax1.set_title(f"Dataset: {str(ds)}") @@ -358,7 +371,7 @@ def plot_l2_condition_number(each_dataset, fits_pdf, variancepdf, evr_min=0.90, ax2.plot(evr_range, dof, color="r", label="Degrees of freedom") ax2.set_ylabel('Degrees of freedom') ax2.tick_params('y') - + lines1, labels1 = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() lines = lines1 + lines2 From d9b94b89ed422cbf427e5a6db358ff69d64ecbb8 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 16:13:06 +0000 Subject: [PATCH 19/61] added docs to func + removed sklearn preprocessing as unused --- .../multiclosure_inconsistent.py | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 9ee3e189e6..89e4d554d1 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -10,7 +10,6 @@ import numpy as np from sklearn.decomposition import PCA -from sklearn import preprocessing from validphys import covmats from validphys.calcutils import calc_chi2 @@ -71,13 +70,13 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia # get replicas from variance fit, used to estimate variance reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members - # rescale feature matrix - reps_scaled = reps # preprocessing.scale(reps) + # something that could be tested: rescale feature matrix + # reps_scaled = reps.preprocessing.scale(reps) # choose number of principal components (PCs) based on explained variance ratio n_comp = 1 for _ in range(reps.shape[0]): - pca = PCA(n_comp).fit(reps_scaled.T) + pca = PCA(n_comp).fit(reps.T) if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio: break n_comp += 1 @@ -92,7 +91,24 @@ def principal_components_bias_variance_dataset( internal_multiclosure_dataset_loader, principal_components_dataset ): """ - TODO + Compute the bias and variance for one datasets + using the principal component reduced covariance matrix. + + Parameters + ---------- + internal_multiclosure_dataset_loader : tuple + Tuple containing the results of multiclosure fits + + principal_components_dataset : tuple + 3D tuple containing the principal components of the theory predictions + + Returns + ------- + tuple + 3D tuple: + - biases: 1-D array of shape (Nfits,) + - variances: 1-D array of shape (Nfits, ) + - n_comp: number of principal components kept """ closures_th, law_th, _, _ = internal_multiclosure_dataset_loader From acf3f91f3267fc7856a53fcd3892ca522c448791 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 5 Mar 2024 16:27:38 +0000 Subject: [PATCH 20/61] added single point dataset to example report --- validphys2/examples/pca_bias_variance_report.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index e2f66f44fb..9c621b20d2 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -5,6 +5,7 @@ meta: dataset_inputs: + - {dataset: ATLAS_SINGLETOP_7TEV_TCHANNEL-XSEC, variant: legacy} - {dataset: HERA_NC_225GEV_EP-SIGMARED, variant: legacy} - {dataset: HERA_NC_251GEV_EP-SIGMARED, variant: legacy} - {dataset: HERA_NC_300GEV_EP-SIGMARED, variant: legacy} @@ -45,4 +46,4 @@ template_text: | actions_: - - report(main=true) \ No newline at end of file + - report(main=true) From 2de09b88fcbfa9dbf82b90a30a5d796fadc8aa82 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 11 Mar 2024 16:48:59 +0000 Subject: [PATCH 21/61] added bootstrap of bias distribution --- .../examples/pca_bias_variance_report.yaml | 14 ++- .../multiclosure_inconsistent.py | 96 ++++++++++++++++++- .../multiclosure_inconsistent_output.py | 72 +++++++++++++- .../multiclosure_analysis.yaml | 11 ++- .../multiclosure_template.md | 4 + .../pca_template.md | 6 ++ 6 files changed, 191 insertions(+), 12 deletions(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 9c621b20d2..209ad06c58 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -28,10 +28,13 @@ t0pdfset: 210223-mw-000_fakepdf explained_variance_ratio: 0.99 fits: - - 230124_dis_ict_lam02_fs_122996 - - 230124_dis_ict_lam02_fs_196144 - - 230124_dis_ict_lam02_fs_257767 - + - 250124_dis_ict_lam04_fs_833288 + - 250124_dis_ict_lam04_fs_85029 + - 250124_dis_ict_lam04_fs_856511 + - 250124_dis_ict_lam04_fs_895365 + - 250124_dis_ict_lam04_fs_902027 + - 250124_dis_ict_lam04_fs_911964 + - 250124_dis_ict_lam04_fs_99031 template_text: | @@ -43,7 +46,8 @@ template_text: | ## L2 condition number {@plot_l2_condition_number@} - + + {@bootstrapped_bias_distribution@} actions_: - report(main=true) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 89e4d554d1..ffc351eea1 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -93,14 +93,14 @@ def principal_components_bias_variance_dataset( """ Compute the bias and variance for one datasets using the principal component reduced covariance matrix. - + Parameters ---------- internal_multiclosure_dataset_loader : tuple Tuple containing the results of multiclosure fits - + principal_components_dataset : tuple - 3D tuple containing the principal components of the theory predictions + 3D tuple containing the principal components of the theory predictions Returns ------- @@ -145,6 +145,96 @@ def principal_components_bias_variance_dataset( ) +def bootstrap_principal_components_bias_variance_dataset( + internal_multiclosure_dataset_loader, + principal_components_dataset, + n_bootstrap=100, + n_fits_bt=15, +): + """ + Compute the bias and variance for one datasets + using the principal component reduced covariance matrix. + + Parameters + ---------- + internal_multiclosure_dataset_loader : tuple + Tuple containing the results of multiclosure fits + + principal_components_dataset : tuple + 3D tuple containing the principal components of the theory predictions + + n_bootstrap: int, default is 100 + number of bootstrap + + n_fits_bt: int, default is 15 + number of fits to be randomly sampled in bootstrap + + + Returns + ------- + tuple + 3D tuple: + - biases: 1-D array of shape (Nfits,) + - variances: 1-D array of shape (Nfits, ) + - n_comp: number of principal components kept + """ + + # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) + pc_basis, pc_reps, n_comp = principal_components_dataset + + if n_comp <= 1: + return np.nan, np.nan, n_comp + + covmat_pdf = np.cov(pc_reps) + + sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) + + # compute variance + closures_th, law_th, _, _ = internal_multiclosure_dataset_loader + + reps = np.asarray([th.error_members for th in closures_th]) + + variances = [] + for i in range(reps.shape[0]): + diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) + variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs))) + variance = np.mean(variances) + + # perform bootstrap on bias + bt_biases = [] + for i in range(n_bootstrap): + rng = np.random.RandomState(seed=i) + + # step1a: pick n_fits_bt closure tests at random (with replacement) + fit_idxs = rng.choice(len(closures_th), size=n_fits_bt, replace=True) + bt_closure_th = np.array(closures_th)[fit_idxs] + + # step1b: from each closure test pick 100 replicas at random (with replacement) + rep_idxs = rng.choice(100, size=(n_fits_bt, 100), replace=True) + replicas = np.asarray( + [th.error_members[:, rep_idx] for th, rep_idx in zip(bt_closure_th, rep_idxs)] + ) + + # step2: compute delta bias on bootstrap sample + centrals = np.mean(replicas, axis=-1) + underlying = law_th.central_value + + delta_bias = centrals - underlying[np.newaxis, :] + delta_bias = pc_basis @ delta_bias.T + + bt_biases.append(np.mean(calc_chi2(sqrt_covmat_pdf, delta_bias))) + + return np.asarray(bt_biases), variance, n_comp + + +""" +Bootstrapped bias for each dataset and fits with different inconsistency +""" +lambdavalues_bootstrap_principal_components_bias_variance_datasets = collect( + "bootstrap_principal_components_bias_variance_dataset", ("data", "lambdavalues") +) + + def principal_components_variance_distribution_dataset( internal_multiclosure_dataset_loader, principal_components_dataset ): diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 009dbb07e8..130063c594 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -110,11 +110,17 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea records = [] for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset - bias = np.mean(biases) variance = np.mean(variances) rbv = bias / variance + + # use gaussian uncertainty propagation + delta_rbv = np.sqrt( + ((1 / variance) * np.std(biases)) ** 2 + (bias / variance**2 * np.std(variances)) ** 2 + ) sqrt_rbv = np.sqrt(bias / variance) + delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv) + records.append( dict( dataset=str(ds), @@ -122,19 +128,79 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea bias=bias, variance=variance, ratio=rbv, + error_ratio=delta_rbv, ratio_sqrt=sqrt_rbv, + error_ratio_sqrt=delta_sqrt_rbv, ) ) df = pd.DataFrame.from_records( records, index="dataset", - columns=("dataset", "dof", "bias", "variance", "ratio", "ratio_sqrt"), + columns=( + "dataset", + "dof", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ), ) - df.columns = ["dof", "bias", "variance", "ratio", "sqrt(ratio)"] + df.columns = [ + "dof", + "bias", + "variance", + "ratio", + "error ratio", + "sqrt(ratio)", + "error sqrt(ratio)", + ] return df +@figuregen +def lambdavalues_bootstrapped_bias_distribution( + lambdavalues_bootstrap_principal_components_bias_variance_datasets, each_dataset, lambdavalues +): + """ """ + + for i, ds in enumerate(each_dataset): + + n_lambda_values = len( + lambdavalues_bootstrap_principal_components_bias_variance_datasets + ) // len(each_dataset) + + fig, ax = plotutils.subplots() + + for j in range(n_lambda_values): + + (biases, _, n_comp) = ( + lambdavalues_bootstrap_principal_components_bias_variance_datasets[ + i + len(each_dataset) * j + ] + ) + + if n_comp == 1: + ax.set_title(f"dataset {str(ds)}") + + else: + # adjust by expected value so that a comparison between identical datasets w. different + # inconsistency degree, and hence possibly different n_comp, is possible. + # a large deviation from zero indicates large inconsistency + biases -= n_comp + ax.hist( + biases, + alpha=0.5, + density=True, + label=f"{lambdavalues[j]['label']}, dof: {n_comp}", + ) + ax.set_title(f"dataset {str(ds)}") + ax.legend() + + yield fig + @table def datasets_bias_variance_pca_table( diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index 33c790f42a..0964fc4cbc 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -146,7 +146,12 @@ compare_settings: ################################################################## #OTHER ANALYSIS SETTINGS - +fit: 230124_dis_ict_lam02_fs_122996 +theory: + from_: fit +theoryid: + from_: theory +use_cuts: "internal" lambdavalues: - label: "LAMBDA02" fit: 230124_dis_ict_lam02_fs_122996 @@ -240,5 +245,9 @@ single_point_report: pca_report: meta: Null template: pca_template.md +multi_pca_report: + meta: Null + template: multi_pca_template.md + actions_: - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md index 7ad062bd09..3248c5422a 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -12,3 +12,7 @@ Bias Variance Ratio Table {@with lambdavalues@} [Lambda value of {@label@}]({@pca_report report@}) {@endwith@} + +Bias Distribution +----------------- +{@multi_pca_report@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md index 2f308bd701..45170834ed 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -5,3 +5,9 @@ ## L2 condition number {@plot_l2_condition_number@} + +## Bootstrapped bias distributions +{@bootstrapped_bias_distribution@} + +## Fit comparison bootstrapped bias distributions +{@lambdavalues_bootstrapped_bias_distribution@} \ No newline at end of file From 325ad8f7d9136c90881aceff258e9d5ab0726cc8 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 11 Mar 2024 18:14:36 +0000 Subject: [PATCH 22/61] added rbv plots as function of lambda --- .../multiclosure_inconsistent.py | 90 ------------------- .../multiclosure_inconsistent_output.py | 67 +++++++------- .../multiclosure_analysis.yaml | 5 +- .../multiclosure_template.md | 4 - .../pca_template.md | 9 +- 5 files changed, 42 insertions(+), 133 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index ffc351eea1..c997e3eb29 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -145,96 +145,6 @@ def principal_components_bias_variance_dataset( ) -def bootstrap_principal_components_bias_variance_dataset( - internal_multiclosure_dataset_loader, - principal_components_dataset, - n_bootstrap=100, - n_fits_bt=15, -): - """ - Compute the bias and variance for one datasets - using the principal component reduced covariance matrix. - - Parameters - ---------- - internal_multiclosure_dataset_loader : tuple - Tuple containing the results of multiclosure fits - - principal_components_dataset : tuple - 3D tuple containing the principal components of the theory predictions - - n_bootstrap: int, default is 100 - number of bootstrap - - n_fits_bt: int, default is 15 - number of fits to be randomly sampled in bootstrap - - - Returns - ------- - tuple - 3D tuple: - - biases: 1-D array of shape (Nfits,) - - variances: 1-D array of shape (Nfits, ) - - n_comp: number of principal components kept - """ - - # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) - pc_basis, pc_reps, n_comp = principal_components_dataset - - if n_comp <= 1: - return np.nan, np.nan, n_comp - - covmat_pdf = np.cov(pc_reps) - - sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) - - # compute variance - closures_th, law_th, _, _ = internal_multiclosure_dataset_loader - - reps = np.asarray([th.error_members for th in closures_th]) - - variances = [] - for i in range(reps.shape[0]): - diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) - variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs))) - variance = np.mean(variances) - - # perform bootstrap on bias - bt_biases = [] - for i in range(n_bootstrap): - rng = np.random.RandomState(seed=i) - - # step1a: pick n_fits_bt closure tests at random (with replacement) - fit_idxs = rng.choice(len(closures_th), size=n_fits_bt, replace=True) - bt_closure_th = np.array(closures_th)[fit_idxs] - - # step1b: from each closure test pick 100 replicas at random (with replacement) - rep_idxs = rng.choice(100, size=(n_fits_bt, 100), replace=True) - replicas = np.asarray( - [th.error_members[:, rep_idx] for th, rep_idx in zip(bt_closure_th, rep_idxs)] - ) - - # step2: compute delta bias on bootstrap sample - centrals = np.mean(replicas, axis=-1) - underlying = law_th.central_value - - delta_bias = centrals - underlying[np.newaxis, :] - delta_bias = pc_basis @ delta_bias.T - - bt_biases.append(np.mean(calc_chi2(sqrt_covmat_pdf, delta_bias))) - - return np.asarray(bt_biases), variance, n_comp - - -""" -Bootstrapped bias for each dataset and fits with different inconsistency -""" -lambdavalues_bootstrap_principal_components_bias_variance_datasets = collect( - "bootstrap_principal_components_bias_variance_dataset", ("data", "lambdavalues") -) - - def principal_components_variance_distribution_dataset( internal_multiclosure_dataset_loader, principal_components_dataset ): diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 130063c594..612c0cf0b4 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -13,6 +13,7 @@ from reportengine.figure import figure, figuregen from reportengine.table import table +from reportengine import collect from validphys import plotutils from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import ( @@ -160,44 +161,50 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea return df + +lambdavalues_table_bias_variance_datasets = collect( + "table_bias_variance_datasets", ("lambdavalues",) +) + + @figuregen -def lambdavalues_bootstrapped_bias_distribution( - lambdavalues_bootstrap_principal_components_bias_variance_datasets, each_dataset, lambdavalues +def plot_lambdavalues_bias_variance_values( + lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset ): - """ """ + """ + Plot sqrt of ratio bias variance as a function of lambda for each dataset. - for i, ds in enumerate(each_dataset): + Parameters + ---------- + lambdavalues_table_bias_variance_datasets: list + list of data frames computed as per table_bias_variance_datasets. - n_lambda_values = len( - lambdavalues_bootstrap_principal_components_bias_variance_datasets - ) // len(each_dataset) + lambdavalues: list + list specified in multiclosure_analysis.yaml - fig, ax = plotutils.subplots() + each_dataset: list + list of datasets - for j in range(n_lambda_values): + Yields + ------ + figure + """ - (biases, _, n_comp) = ( - lambdavalues_bootstrap_principal_components_bias_variance_datasets[ - i + len(each_dataset) * j - ] + for ds in each_dataset: + fig, ax = plotutils.subplots() + for i, lambdavalue in enumerate(lambdavalues): + df = lambdavalues_table_bias_variance_datasets[i] + df = df[df.index == str(ds)] + + ax.errorbar( + lambdavalue["lambda_value"], + df["sqrt(ratio)"].values, + yerr=df["error sqrt(ratio)"].values, + color="blue", ) - - if n_comp == 1: - ax.set_title(f"dataset {str(ds)}") - - else: - # adjust by expected value so that a comparison between identical datasets w. different - # inconsistency degree, and hence possibly different n_comp, is possible. - # a large deviation from zero indicates large inconsistency - biases -= n_comp - ax.hist( - biases, - alpha=0.5, - density=True, - label=f"{lambdavalues[j]['label']}, dof: {n_comp}", - ) - ax.set_title(f"dataset {str(ds)}") - ax.legend() + ax.set_ylabel(r"$R_{bv}$") + ax.set_xlabel(r"$\lambda$") + ax.set_title(f"Ratio bias variance: {str(ds)}") yield fig diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index 0964fc4cbc..65dfaf3cfd 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -154,6 +154,7 @@ theoryid: use_cuts: "internal" lambdavalues: - label: "LAMBDA02" + lambda_value: 0.2 fit: 230124_dis_ict_lam02_fs_122996 dataset_inputs: from_: fit @@ -194,6 +195,7 @@ lambdavalues: - 230124_dis_ict_lam02_fs_916690 - 230124_dis_ict_lam02_fs_966390 - label: "LAMBDA04" + lambda_value: 0.4 fit: 250124_dis_ict_lam04_fs_102661 dataset_inputs: from_: fit @@ -245,9 +247,6 @@ single_point_report: pca_report: meta: Null template: pca_template.md -multi_pca_report: - meta: Null - template: multi_pca_template.md actions_: - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md index 3248c5422a..7ad062bd09 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -12,7 +12,3 @@ Bias Variance Ratio Table {@with lambdavalues@} [Lambda value of {@label@}]({@pca_report report@}) {@endwith@} - -Bias Distribution ------------------ -{@multi_pca_report@} diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md index 45170834ed..7fba0826e8 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -3,11 +3,8 @@ ## Table of bias and variance {@table_bias_variance_datasets@} +## Ratio bias variance vs Lambda +{@plot_lambdavalues_bias_variance_values@} + ## L2 condition number {@plot_l2_condition_number@} - -## Bootstrapped bias distributions -{@bootstrapped_bias_distribution@} - -## Fit comparison bootstrapped bias distributions -{@lambdavalues_bootstrapped_bias_distribution@} \ No newline at end of file From 95d608e6fe9b9d80ee194a898e4617ef7be4def2 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 12 Mar 2024 10:09:45 +0000 Subject: [PATCH 23/61] added ratio bias variance to multi closure report --- .../multiclosure_analysis.yaml | 24 ++++++------------- .../multiclosure_template.md | 3 +++ .../pca_template.md | 3 --- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index 65dfaf3cfd..cf27fa9796 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -146,25 +146,19 @@ compare_settings: ################################################################## #OTHER ANALYSIS SETTINGS + fit: 230124_dis_ict_lam02_fs_122996 theory: from_: fit theoryid: from_: theory -use_cuts: "internal" +use_cuts: internal + lambdavalues: - label: "LAMBDA02" lambda_value: 0.2 - fit: 230124_dis_ict_lam02_fs_122996 - dataset_inputs: - from_: fit - theory: - from_: fit - theoryid: - from_: theory use_t0: True t0pdfset: 210223-mw-000_fakepdf - use_cuts: "internal" explained_variance_ratio: 0.99 variancepdf: 230124_dis_ict_lam02_fs_122996 fits: @@ -196,16 +190,8 @@ lambdavalues: - 230124_dis_ict_lam02_fs_966390 - label: "LAMBDA04" lambda_value: 0.4 - fit: 250124_dis_ict_lam04_fs_102661 - dataset_inputs: - from_: fit - theory: - from_: fit - theoryid: - from_: theory use_t0: True t0pdfset: 210223-mw-000_fakepdf - use_cuts: "internal" explained_variance_ratio: 0.99 variancepdf: 250124_dis_ict_lam04_fs_102661 fits: @@ -247,6 +233,10 @@ single_point_report: pca_report: meta: Null template: pca_template.md +ratio_bias_variance_report: + meta: Null + template: ratio_biasvar_template.md + actions_: - report(main=true) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md index 7ad062bd09..aff62f9182 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_template.md @@ -12,3 +12,6 @@ Bias Variance Ratio Table {@with lambdavalues@} [Lambda value of {@label@}]({@pca_report report@}) {@endwith@} + + +[Ratio bias variance vs Lambda]({@ratio_bias_variance_report report@}) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md index 7fba0826e8..2f308bd701 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -3,8 +3,5 @@ ## Table of bias and variance {@table_bias_variance_datasets@} -## Ratio bias variance vs Lambda -{@plot_lambdavalues_bias_variance_values@} - ## L2 condition number {@plot_l2_condition_number@} From 070e3f6a9beb0e30438468588c2fe30a9bf5d476 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 12 Mar 2024 10:27:35 +0000 Subject: [PATCH 24/61] forgot ratio_bias_variance template --- .../ratio_biasvar_template.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md new file mode 100644 index 0000000000..d4eda2c575 --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/ratio_biasvar_template.md @@ -0,0 +1,2 @@ +## Ratio Bias Variance vs Lambda +{@plot_lambdavalues_bias_variance_values@} From 70792926097aa3e7af00ef31d906589506814444 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 12 Mar 2024 21:19:48 +0000 Subject: [PATCH 25/61] fmt = 'o' for ratio bias variance plot --- .../inconsistent_closuretest/multiclosure_inconsistent_output.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 612c0cf0b4..337dc6a3e1 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -201,6 +201,7 @@ def plot_lambdavalues_bias_variance_values( df["sqrt(ratio)"].values, yerr=df["error sqrt(ratio)"].values, color="blue", + fmt='o', ) ax.set_ylabel(r"$R_{bv}$") ax.set_xlabel(r"$\lambda$") From 49d0484960fd3f900152b3e4a786a78f29dc3711 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Fri, 15 Mar 2024 12:07:43 +0000 Subject: [PATCH 26/61] added functions for computing Rbv using old definition + runcard with table --- .../src/validphys/closuretest/multiclosure.py | 6 ++ .../closuretest/multiclosure_output.py | 59 +++++++++++++++++++ .../runcards/old_rbv_definition.yaml | 56 ++++++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index ff71e3caf0..a086d97d91 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -181,6 +181,12 @@ def fits_normed_dataset_central_delta( # variances.shape = (n_fits, n_obs_cut/uncut, reps) return np.asarray(deltas) + +fits_datasets_bias_variance = collect( + "fits_dataset_bias_variance", ("data",) +) + + def expected_dataset_bias_variance(fits_dataset_bias_variance): """For a given dataset calculate the expected bias and variance across fits then return tuple (expected bias, expected variance, n_data) diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py index d7be585cd7..a9becee214 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_output.py +++ b/validphys2/src/validphys/closuretest/multiclosure_output.py @@ -60,6 +60,65 @@ def plot_total_fits_bias_variance(fits_total_bias_variance): """ return plot_dataset_fits_bias_variance(fits_total_bias_variance, "all data") +@table +def table_datasets_bias_variance_fits(fits_datasets_bias_variance, each_dataset): + """ + Table with ratio bias variance value and associated uncertainty + computed with simple gaussian error propagation for each dataset. + """ + records = [] + # compute expectation and uncertainty on bias variance ratio + for ds, (bias, var, ndata) in zip(each_dataset, fits_datasets_bias_variance): + mean_bias = np.mean(bias) + mean_var = np.mean(var) + rbv = mean_bias / mean_var + + # compute uncertainy on rbv + delta_rbv = np.sqrt( + ((1 / mean_var) * np.std(bias)) ** 2 + (mean_bias / mean_var**2 * np.std(var)) ** 2 + ) + sqrt_rbv = np.sqrt(rbv) + delta_sqrt_rbv = 0.5 * delta_rbv / np.sqrt(rbv) + + records.append( + dict( + dataset=str(ds), + ndata=ndata, + bias=mean_bias, + variance=mean_var, + ratio=rbv, + error_ratio=delta_rbv, + ratio_sqrt=sqrt_rbv, + error_ratio_sqrt=delta_sqrt_rbv, + ) + ) + + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=( + "dataset", + "ndata", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ), + ) + df.columns = [ + "ndata", + "bias", + "variance", + "ratio", + "error ratio", + "sqrt(ratio)", + "error sqrt(ratio)", + ] + + return df + @table def datasets_bias_variance_ratio(datasets_expected_bias_variance, each_dataset): diff --git a/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml b/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml new file mode 100644 index 0000000000..15de93bfdf --- /dev/null +++ b/validphys2/src/validphys/closuretest/runcards/old_rbv_definition.yaml @@ -0,0 +1,56 @@ +meta: + title: multiclosure report using old definition of Rbv + author: Mark N. Costantini + keywords: [multiclosure, python, consistent] + +fits: + - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295 + - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750 + - 25_5_2023_19_50_27_dis_pt1_mnc_commit_4d5d473c_filterseed_754299 + - 25_5_2023_19_51_19_dis_pt1_mnc_commit_4d5d473c_filterseed_361864 + - 25_5_2023_19_52_54_dis_pt1_mnc_commit_4d5d473c_filterseed_516909 + - 25_5_2023_19_52_7_dis_pt1_mnc_commit_4d5d473c_filterseed_539528 + - 25_5_2023_19_53_41_dis_pt1_mnc_commit_4d5d473c_filterseed_307907 + - 25_5_2023_19_54_28_dis_pt1_mnc_commit_4d5d473c_filterseed_279829 + - 25_5_2023_19_55_19_dis_pt1_mnc_commit_4d5d473c_filterseed_162790 + - 25_5_2023_19_56_54_dis_pt1_mnc_commit_4d5d473c_filterseed_6801 + - 25_5_2023_19_56_7_dis_pt1_mnc_commit_4d5d473c_filterseed_625729 + - 25_5_2023_19_57_42_dis_pt1_mnc_commit_4d5d473c_filterseed_327255 + - 25_5_2023_19_58_39_dis_pt1_mnc_commit_4d5d473c_filterseed_156246 + - 25_5_2023_19_59_29_dis_pt1_mnc_commit_4d5d473c_filterseed_385085 + - 25_5_2023_20_0_23_dis_pt1_mnc_commit_4d5d473c_filterseed_780441 + - 25_5_2023_20_1_12_dis_pt1_mnc_commit_4d5d473c_filterseed_155517 + - 25_5_2023_20_2_0_dis_pt1_mnc_commit_4d5d473c_filterseed_663052 + - 25_5_2023_20_2_47_dis_pt1_mnc_commit_4d5d473c_filterseed_387320 + - 25_5_2023_20_3_34_dis_pt1_mnc_commit_4d5d473c_filterseed_277058 + - 25_5_2023_20_4_24_dis_pt1_mnc_commit_4d5d473c_filterseed_347679 + - 25_5_2023_20_5_11_dis_pt1_mnc_commit_4d5d473c_filterseed_455195 + - 25_5_2023_20_6_1_dis_pt1_mnc_commit_4d5d473c_filterseed_191318 + - 25_5_2023_20_6_48_dis_pt1_mnc_commit_4d5d473c_filterseed_527779 + - 25_5_2023_20_7_35_dis_pt1_mnc_commit_4d5d473c_filterseed_974809 + +# the fits have the same datasets +fit: 25_5_2023_20_7_35_dis_pt1_mnc_commit_4d5d473c_filterseed_974809 + +theoryid: 200 + +use_cuts: internal + +dataset_inputs: + from_: fit + +t0pdfset: NNPDF40_nnlo_as_01180 +use_t0: True + +template_text: | + Report for multiclosure fit using old (2111.05787) definition of Rbv + --------------------------------------------------------------------- + + Table + ===== + + {@table_datasets_bias_variance_fits@} + + +actions_: + - report(main=True) From 9d5f63ebd98dfdcb4c3bd38d91786bf8081b4a92 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Sun, 17 Mar 2024 16:50:58 +0000 Subject: [PATCH 27/61] Piazza pulita 1: removed old plotting functions that were never used --- .../multiclosure_inconsistent.py | 160 ----------------- .../multiclosure_inconsistent_output.py | 166 +----------------- 2 files changed, 1 insertion(+), 325 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index c997e3eb29..60e2b53a68 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -18,23 +18,6 @@ from reportengine import collect -""" To load several multiclosure fits. Useful for inconsistent closure test analysis """ -multi_dataset_loader = collect("internal_multiclosure_dataset_loader", ("dataspecs",)) - -multi_dataset_fits_bias_replicas_variance_samples = collect( - "dataset_fits_bias_replicas_variance_samples", ("dataspecs",) -) - -multi_fits_bootstrap_dataset_bias_variance = collect( - "fits_bootstrap_dataset_bias_variance", ("dataspecs",) -) - -multi_bias_variance_resampling_dataset = collect("bias_variance_resampling_dataset", ("dataspecs",)) - -multi_dataset_fits_bias_variance_samples_pca = collect( - "dataset_fits_bias_variance_samples_pca", ("dataspecs",) -) - def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99): """ @@ -175,146 +158,3 @@ def principal_components_variance_distribution_dataset( principal_components_variance_distribution_datasets = collect( "principal_components_variance_distribution_dataset", ("data",) ) - - -def compute_num_components(covariance_matrix, threshold=0.99): - """ - Compute the number of principal components to keep based on a desired explained variance threshold. - - Parameters - ---------- - covariance_matrix : np.ndarray, 2-D array - - threshold : (float): Desired explained variance threshold (between 0 and 1). - - Returns - ------- - int - num_components: Number of principal components to keep. - - """ - eig_val, eig_vec = np.linalg.eig(covariance_matrix) - idx = eig_val.argsort()[::-1] - eig_val = eig_val[idx] - eig_vec = eig_vec[:, idx] - - cumulative_sum = np.cumsum(eig_val) - total_sum = np.sum(eig_val) - num_components = np.argmin(np.abs(cumulative_sum / total_sum - threshold)) - - return num_components - - -def calc_chi2_pca(pdf_cov, diff, num_components): - """ - Computes the chi2 between pdf_cov and diff by first projecting - them into num_components PCs. - - Parameters - ---------- - pdf_cov: np.ndarray - - diff: np.ndarray - - num_components: int - - Returns - ------- - float or np.ndarray depending on input - - """ - # Diagonalize the matrix - L, W = np.linalg.eig(pdf_cov) - - # Sort the eigenvalues and eigenvectors from largest to smallest - idx = L.argsort()[::-1] - L = L[idx] - W = W[:, idx] - - # Keep only the n_components largest eigenvectors - Wtilde = W[:, :num_components] - - # Transform initial covariance matrix - pdf_cov_pca = np.einsum("ij,jk->ik", np.einsum("ij,ik->jk", Wtilde, pdf_cov), Wtilde).real - - # transform data - diff_pca = diff.T @ Wtilde - - try: - sqrt_pdf_cov_pca = covmats.sqrt_covmat(pdf_cov_pca) - except: - raise ValueError( - f"PCA Covariance Matrix cannot be Cholesky decomposed, perhaps less than {num_components} PC should be kept" - ) - - return np.real(calc_chi2(sqrt_pdf_cov_pca, diff_pca.T)) - - -def dataset_fits_bias_variance_samples_pca(internal_multiclosure_dataset_loader, threshold=0.99): - """ - similar to `dataset_fits_bias_replicas_variance_samples`. - - Returns - ------- - tuple - 3D tuple: - - biases: 1-D array of shape (Nfits,) - - variances: 1-D array of shape (Nfits, ) - - n_eig: number of eigenvalues kept in PCA, i.e., - ndata in the new, lower dimensional, space. - Note that we return Nfits values of the variance so that computing the - Bias - Variance ratio is straightforward. - """ - closures_th, law_th, _, _ = internal_multiclosure_dataset_loader - - # The dimensions here are (fit, data point, replica) - reps = np.asarray([th.error_members for th in closures_th]) - - # take mean across replicas - since we might have changed no. of reps - centrals = reps.mean(axis=2) - - # compute the PDF covariance matrix of the central samples - if centrals.shape[0] <= 1: - raise ValueError(f"Need more than one fit to compute the 'Bias' PDF covariance Matrix") - - pdf_cov_bias = np.cov(centrals.T) - - # find number of (ordered) eigenvalues that explain 99% of the total variance (total sum of eigenvalues) - n_eig = compute_num_components(pdf_cov_bias, threshold) - - # compute bias from PCs - diffs_bias = centrals.T - law_th.central_value[:, np.newaxis] - biases = calc_chi2_pca(pdf_cov_bias, diffs_bias, n_eig) - - # compute variances from PCs - variances = [] - - # loop over fits to compute variances - for i in range(reps.shape[0]): - diffs_var = reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) - pdf_cov_var = np.cov(reps[i, :, :]) - - variances.append(np.mean(calc_chi2_pca(pdf_cov_var, diffs_var, n_eig))) - - return biases, np.asarray(variances), n_eig - - -def expected_dataset_fits_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca): - """ - returns the bias and variance expected values as well as the number of - principal components. - """ - biases, variances, n_eig = dataset_fits_bias_variance_samples_pca - return np.mean(biases), np.mean(variances), n_eig - - -expected_datasets_fits_bias_variance_samples_pca = collect( - "expected_dataset_fits_bias_variance_samples_pca", ("data",) -) - - -def dataset_fits_ratio_bias_variance_samples_pca(dataset_fits_bias_variance_samples_pca): - """ """ - biases, variances, _ = dataset_fits_bias_variance_samples_pca - sqrt_ratios = np.sqrt(biases / variances) - return sqrt_ratios diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 337dc6a3e1..5fe25d4eef 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -11,7 +11,7 @@ import numpy as np from scipy import stats -from reportengine.figure import figure, figuregen +from reportengine.figure import figuregen from reportengine.table import table from reportengine import collect @@ -210,170 +210,6 @@ def plot_lambdavalues_bias_variance_values( yield fig -@table -def datasets_bias_variance_pca_table( - expected_datasets_fits_bias_variance_samples_pca, each_dataset -): - """For each dataset calculate the expected bias and expected variance computed by - first projecting the covariance matrix into a lower dimensional space trough PCA. - - """ - records = [] - for ds, (bias, var, ndata) in zip( - each_dataset, expected_datasets_fits_bias_variance_samples_pca - ): - records.append( - dict( - dataset=str(ds), - ndata=ndata, - bias=bias, - variance=var, - ratio=bias / var, - ratio_sqrt=np.sqrt(bias / var), - ) - ) - df = pd.DataFrame.from_records( - records, - index="dataset", - columns=("dataset", "ndata", "bias", "variance", "ratio", "ratio_sqrt"), - ) - df.columns = ["ndata", "bias", "variance", "ratio", "sqrt(ratio)"] - return df - - -@figure -def plot_sqrt_ratio_distribution_pca(dataset_fits_ratio_bias_variance_samples_pca): - """""" - sqrt_ratios = dataset_fits_ratio_bias_variance_samples_pca - fig, ax = plotutils.subplots() - ax.hist(sqrt_ratios, bins='auto', density=True, alpha=0.5, label="sqrt_ratio") - ax.legend() - return fig - - -@figure -def plot_variance_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs): - """ - histogram of the distribution of the variances (k) defined as the - distance between central replica and replica (k) in units of the - experimental covariance matrix. - If more than one group of dataspecs (e.g. consistent and inconsistent) - fits are defined, than plot comparison of these. - """ - fig, ax = plotutils.subplots() - for (_, variance_dist, _), spec in zip( - multi_dataset_fits_bias_replicas_variance_samples, dataspecs - ): - label = spec['speclabel'] - - ax.hist(variance_dist, bins='auto', density=True, alpha=0.5, label=label) - - ax.legend() - return fig - - -@figure -def plot_variance_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs): - """ - like `plot_variance_distribution_multi`, but for variance computed with the covariance matrix - predicted by the PDFs (and reduced with PCA). - """ - return plot_variance_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs) - - -@figure -def plot_bias_distribution_multi(multi_dataset_fits_bias_replicas_variance_samples, dataspecs): - """ - histogram of the distribution of the biases (l) defined as the - distance between central replica and underlying law in units of the - experimental covariance matrix. - If more than one group of dataspecs (e.g. consistent and inconsistent) - fits are defined, than plot comparison of these. - """ - fig, ax = plotutils.subplots() - for (bias_dist, _, _), spec in zip( - multi_dataset_fits_bias_replicas_variance_samples, dataspecs - ): - label = spec['speclabel'] - - ax.hist(bias_dist, bins='auto', density=True, alpha=0.5, label=label) - - ax.legend() - return fig - - -@figure -def plot_bias_distribution_pca_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs): - """ - like `plot_bias_distribution_multi`, but for variance computed with the covariance matrix - predicted by the PDFs (and reduced with PCA). - """ - return plot_bias_distribution_multi(multi_dataset_fits_bias_variance_samples_pca, dataspecs) - - -@figure -def plot_sqrt_ratio_distribution_pca(multi_dataset_fits_bias_variance_samples_pca, dataspecs): - """ - histogram of the distribution of the sqrt ratio of bias and variance computed with - the estimated "PDF" covariance matrix (reduced with PCA). - If more than one group of dataspecs (e.g. consistent and inconsistent) - fits are defined, than plot comparison of these. - """ - fig, ax = plotutils.subplots() - ratios = [] - labels = [] - for (bias_dist, variance_dist, _), spec in zip( - multi_dataset_fits_bias_variance_samples_pca, dataspecs - ): - labels.append(spec['speclabel']) - ratios.append(bias_dist / variance_dist) - - ax.hist(ratios, bins='auto', density=True, label=labels) - ax.legend() - return fig - - -@figure -def plot_multi_bias_distribution_bootstrap(multi_fits_bootstrap_dataset_bias_variance, dataspecs): - """ - histogram of the distribution of the biases obtained by bootstrapping with - `fits_bootstrap_dataset_bias_variance` over the existing fits. - - If more than one group of dataspecs (e.g. consistent and inconsistent) - fits are defined, than plot comparison of these. - """ - fig, ax = plotutils.subplots() - for (bias, _), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs): - label = spec['speclabel'] - - ax.hist(bias, bins='auto', density=True, alpha=0.5, label=label) - - ax.legend() - return fig - - -@figure -def plot_multi_ratio_bias_variance_distribution_bootstrap( - multi_fits_bootstrap_dataset_bias_variance, dataspecs -): - """ - histogram of the ratio bias variance distribution obtained by bootstrapping bias - and variance with `fits_bootstrap_dataset_bias_variance`. - - If more than one group of dataspecs (e.g. consistent and inconsistent) - fits are defined, than plot comparison of these. - """ - fig, ax = plotutils.subplots() - for (bias, variance), spec in zip(multi_fits_bootstrap_dataset_bias_variance, dataspecs): - ratio = bias / variance - label = spec['speclabel'] - - ax.hist(ratio, bins='auto', density=True, alpha=0.5, label=label) - - ax.legend() - return fig - - @figuregen def plot_l2_condition_number( each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20 From 3644cb8a143105ea359b16dcf9b78d4b5ec8dfb9 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Sun, 17 Mar 2024 16:56:37 +0000 Subject: [PATCH 28/61] Piazza pulita 2: removed principal_components_variance_distribution_dataset as we are not using it --- .../multiclosure_inconsistent.py | 32 --------- .../multiclosure_inconsistent_output.py | 68 ------------------- 2 files changed, 100 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 60e2b53a68..ba04f5d09a 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -126,35 +126,3 @@ def principal_components_bias_variance_dataset( principal_components_bias_variance_datasets = collect( "principal_components_bias_variance_dataset", ("data",) ) - - -def principal_components_variance_distribution_dataset( - internal_multiclosure_dataset_loader, principal_components_dataset -): - """ - TODO - """ - - closures_th, _, _, _ = internal_multiclosure_dataset_loader - - reps = np.asarray([th.error_members for th in closures_th]) - - pc_basis, pc_reps, n_comp = principal_components_dataset - - if n_comp <= 1: - return None, n_comp - # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) - covmat_pdf = np.cov(pc_reps) - sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) - - variances = [] - for i in range(reps.shape[0]): - diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) - variances.append([calc_chi2(sqrt_covmat_pdf, diffs)]) - - return np.asarray(variances), n_comp - - -principal_components_variance_distribution_datasets = collect( - "principal_components_variance_distribution_dataset", ("data",) -) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 5fe25d4eef..24bddc48ba 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -9,7 +9,6 @@ import pandas as pd import numpy as np -from scipy import stats from reportengine.figure import figuregen from reportengine.table import table @@ -21,73 +20,6 @@ ) -@figuregen -def plot_bias_distribution_datasets(principal_components_bias_variance_datasets, each_dataset): - """ - TODO - """ - for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): - biases, variances, n_comp = pc_bias_var_dataset - - try: - sqrt_rbv = np.sqrt(np.mean(biases) / np.mean(variances)) - vals = np.linspace(0, 3 * n_comp, 100) - chi2_pdf = stats.chi2.pdf(vals, df=n_comp) - chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp) - pvalue_ks = stats.kstest(biases, chi2_cdf).pvalue - fig, ax = plotutils.subplots() - ax.hist(biases, density=True, bins='auto', label=f"Bias {str(ds)}, p={pvalue_ks:.3f}") - ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}") - ax.plot([], [], 'ro', label=f"sqrt(Rbv) = {sqrt_rbv:.2f}") - - ax.legend() - - yield fig - - except: - fig, ax = plotutils.subplots() - ax.plot([], [], label=f"Dataset: {str(ds)}") - ax.legend() - yield fig - - -@figuregen -def plot_variance_distribution_datasets( - principal_components_variance_distribution_datasets, each_dataset -): - """ - TODO - """ - - for pc_var_dataset, ds in zip( - principal_components_variance_distribution_datasets, each_dataset - ): - variances, n_comp = pc_var_dataset - try: - vals = np.linspace(0, 3 * n_comp, 100) - chi2_pdf = stats.chi2.pdf(vals, df=n_comp) - chi2_cdf = lambda x: stats.chi2.cdf(x, df=n_comp) - - for i, var_fit in enumerate(variances): - pvalue_ks = stats.kstest(var_fit[0], chi2_cdf).pvalue - - fig, ax = plotutils.subplots() - ax.hist( - var_fit[0], - density=True, - bins='auto', - label=f"Variance {str(ds)}, p={pvalue_ks:.3f}", - ) - ax.plot(vals, chi2_pdf, label=f"chi2, dof={n_comp}") - ax.set_title(f"Fit {i}") - ax.legend() - - yield fig - except: - fig, ax = plotutils.subplots() - yield fig - - @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ From 462e63d3a2e88ded2118eedd51f90d7e07a33313 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Sun, 17 Mar 2024 19:05:10 +0100 Subject: [PATCH 29/61] moved single data colorbar plots to multiclosure.py/_output.py --- .../src/validphys/closuretest/multiclosure.py | 31 ++++++++++ .../closuretest/multiclosure_output.py | 18 ++++++ validphys2/src/validphys/kinematics.py | 58 +------------------ 3 files changed, 50 insertions(+), 57 deletions(-) diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index a086d97d91..496bf22772 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -24,6 +24,8 @@ ) from validphys.results import ThPredictionsResult +from validphys.kinematics import xq2map_with_cuts + # bootstrap seed default DEFAULT_SEED = 9689372 # stepsize in fits/replicas to use for finite size bootstraps @@ -870,3 +872,32 @@ def dataset_inputs_fits_bias_replicas_variance_samples( experiments_fits_bias_replicas_variance_samples = collect( "dataset_inputs_fits_bias_replicas_variance_samples", ("group_dataset_inputs_by_experiment",) ) + + +def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, + _internal_max_reps=None, + _internal_min_reps=20): + """ + Load in a dictionary all the specs of a dataset meaning: + - ds name + - ds coords + - standard deviation (in multiclosure) + - mean (in multiclosure again) + - (x,Q^2) coords + """ + xq2_map_obj = xq2map_with_cuts(commondata, cuts) + coords = xq2_map_obj[2] + central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader) + std_devs = np.std(central_deltas, axis = 0) + means = np.mean(central_deltas, axis = 0) + xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) + + # for case of DY observables we have 2 (x,Q) for each experimental point + if coords[0].shape[0] != std_devs.shape[0]: + std_devs = np.concatenate((std_devs,std_devs)) + means = np.concatenate((means,means)) + xi = np.concatenate((xi,xi)) + return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi} + + +xq2_data_map = collect("xq2_dataset_map",("data",)) \ No newline at end of file diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py index a9becee214..54a92579bc 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_output.py +++ b/validphys2/src/validphys/closuretest/multiclosure_output.py @@ -853,3 +853,21 @@ def plot_bias_variance_distributions( ax.legend() ax.set_title("Total bias and variance distributions.") yield fig + +@figuregen +def xq2_data_prcs_maps(xq2_data_map): + keys = ["std_devs","xi"] + for elem in xq2_data_map: + for k in keys: + fig, ax = plotutils.subplots() + im = ax.scatter(elem['x_coords'],elem['Q_coords'], + c=(np.asarray(elem[k])), + cmap='viridis', + label = elem['name']) + fig.colorbar(im,label=k) + ax.set_xscale('log') # Set x-axis to log scale + ax.set_yscale('log') # Set y-axis to log scale + ax.set_xlabel('x') + ax.set_ylabel('Q2') + ax.set_title(elem["name"]+" "+k) + yield fig diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index d4f1ef4f2f..554dd401b5 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -19,16 +19,6 @@ -from validphys import plotutils -from validphys import plotoptions -from validphys.core import CutsPolicy -from validphys.closuretest import fits_normed_dataset_central_delta -from validphys.closuretest import dataset_xi -from validphys.closuretest import dataset_replica_and_central_diff - -from reportengine.figure import figuregen - - @@ -113,31 +103,6 @@ def kinlimits(commondata, cuts, use_cuts, use_kinoverride: bool = True): all_kinlimits = collect(kinlimits, ('dataset_inputs',)) -def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, - _internal_max_reps=None, - _internal_min_reps=20): - """ - Load in a dictionary all the specs of a dataset meaning: - - ds name - - ds coords - - standard deviation (in multiclosure) - - mean (in multiclosure again) - - (x,Q^2) coords - """ - xq2_map_obj = xq2map_with_cuts(commondata, cuts) - coords = xq2_map_obj[2] - central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader) - std_devs = np.std(central_deltas, axis = 0) - means = np.mean(central_deltas, axis = 0) - xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) - - # for case of DY observables we have 2 (x,Q) for each experimental point - if coords[0].shape[0] != std_devs.shape[0]: - std_devs = np.concatenate((std_devs,std_devs)) - means = np.concatenate((means,means)) - xi = np.concatenate((xi,xi)) - return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi} - @table def all_kinlimits_table(all_kinlimits, use_kinoverride: bool = True): """Return a table with the kinematic limits for the datasets given as input @@ -226,28 +191,7 @@ def xq2map_with_cuts(commondata, cuts, group_name=None): dataset_inputs_by_groups_xq2map = collect( xq2map_with_cuts, ('group_dataset_inputs_by_metadata', 'data_input') ) -xq2_data_map = collect("xq2_dataset_map",("data",)) - -@figuregen -def xq2_data_prcs_maps(xq2_data_map): - import matplotlib.pyplot as plt - import matplotlib.colors - from matplotlib.colors import ListedColormap - keys = ["std_devs","xi"] - for elem in xq2_data_map: - for k in keys: - fig, ax = plotutils.subplots() - im = ax.scatter(elem['x_coords'],elem['Q_coords'], - c=(np.asarray(elem[k])), - cmap='viridis', - label = elem['name']) - fig.colorbar(im,label=k) - ax.set_xscale('log') # Set x-axis to log scale - ax.set_yscale('log') # Set y-axis to log scale - ax.set_xlabel('x') - ax.set_ylabel('Q2') - ax.set_title(elem["name"]+" "+k) - yield fig + def kinematics_table_notable(commondata, cuts, show_extra_labels: bool = False): From 7d696aff5e0d91189a74e8439a491783bb52402b Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Thu, 11 Apr 2024 23:42:34 +0200 Subject: [PATCH 30/61] scikit-dep >=1.4.1 in pyproject file --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 8c6b1dd12c..a50e540f40 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -44,7 +44,7 @@ requirements: - sphinx_rtd_theme >0.5 - sphinxcontrib-bibtex - ruamel.yaml <0.18 - - scikit-learn = "^1.4.1" + - scikit-learn >=1.4.1 test: requires: From 37b835c27173d13e52f0c6eb207d7ae5ae1ff57e Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 22 Apr 2024 17:50:14 +0100 Subject: [PATCH 31/61] use mean theory covmat for PCA -> gives more stable values of variance --- .../multiclosure_inconsistent.py | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index ba04f5d09a..5f29882c98 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -18,7 +18,6 @@ from reportengine import collect - def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99): """ Compute the principal components of theory predictions replica matrix @@ -71,7 +70,7 @@ def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_varia def principal_components_bias_variance_dataset( - internal_multiclosure_dataset_loader, principal_components_dataset + internal_multiclosure_dataset_loader, explained_variance_ratio=0.99 ): """ Compute the bias and variance for one datasets @@ -98,26 +97,55 @@ def principal_components_bias_variance_dataset( reps = np.asarray([th.error_members for th in closures_th]) - pc_basis, pc_reps, n_comp = principal_components_dataset + # compute the covariance matrix of the theory predictions for each fit + _covmats = [np.cov(rep, rowvar=True) for rep in reps] + + # compute the mean covariance matrix + _covmat_mean = np.mean(_covmats, axis=0) + + # diagonalize the mean covariance matrix and only keep the principal components + # that explain the required variance + + if _covmat_mean.shape == (): + return np.nan, np.nan, 1 + + eighvals, eigvecs = np.linalg.eigh(_covmat_mean) + idx = np.argsort(eighvals)[::-1] + # sort eigenvalues from largest to smallest + eigvecs = eigvecs[:, idx] + eighvals = eighvals[idx] + eighvals_norm = eighvals / eighvals.sum() + + # choose components to keep based on EVR + n_comp = 1 + for _ in range(eighvals.shape[0]): + if np.sum(eighvals_norm[:n_comp]) >= explained_variance_ratio: + break + n_comp += 1 + + # get the principal components + pc_basis = eigvecs[:, :n_comp] + + # compute the (PCA) regularized covariance matrix + covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis if n_comp <= 1: return np.nan, np.nan, n_comp - # estimate (PC) pdf covariance matrix (from replicas), shape is (Npc, Npc) - covmat_pdf = np.cov(pc_reps) + # compute sqrt of pdf covariance matrix sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) # compute bias diff and project it onto space spanned by PCs delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis] # shape here is (Npc, Nfits) - delta_bias = pc_basis @ delta_bias + delta_bias = pc_basis.T @ delta_bias # compute biases, shape of biases is (Nfits) biases = calc_chi2(sqrt_covmat_pdf, delta_bias) variances = [] for i in range(reps.shape[0]): - diffs = pc_basis @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) + diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs))) return biases, np.asarray(variances), n_comp From 8b4d5c520f24b1e03ccbeb3d508db39c36cda464 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:21:28 +0200 Subject: [PATCH 32/61] rewrote functions using new variance definition (covmat computed by averaging over covmats of all fits) and removed sklearn dependency --- .../examples/pca_bias_variance_report.yaml | 1 - .../multiclosure_inconsistent.py | 182 +++++++++++------- .../multiclosure_inconsistent_output.py | 32 +-- 3 files changed, 129 insertions(+), 86 deletions(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 209ad06c58..1a9a506770 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -47,7 +47,6 @@ template_text: | ## L2 condition number {@plot_l2_condition_number@} - {@bootstrapped_bias_distribution@} actions_: - report(main=true) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 5f29882c98..f8820328b0 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -9,7 +9,7 @@ """ import numpy as np -from sklearn.decomposition import PCA +import dataclasses from validphys import covmats from validphys.calcutils import calc_chi2 @@ -18,87 +18,68 @@ from reportengine import collect -def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99): +@dataclasses.dataclass(frozen=True) +class PCAInternalMulticlosureLoader: """ - Compute the principal components of theory predictions replica matrix - (Ndat x Nrep feature matrix). - Parameters ---------- - dataset: (DataSetSpec, DataGroupSpec) - dataset for which the theory predictions and t0 covariance matrix - will be loaded. Note that due to the structure of `validphys` this - function can be overloaded to accept a DataGroupSpec. + closures_th: list + list containing validphys.results.ThPredictionsResult objects + for each fit - fits_pdf: list - list of PDF objects produced from performing multiple closure tests - fits. Each fit should have a different filterseed but the same - underlying law used to generate the pseudodata. + law_th: ThPredictionsResult object + underlying law theory predictions - variancepdf: validphys.core.PDF - PDF object used to estimate the variance of the fits. + pc_basis: np.array + basis of principal components - explained_variance_ratio: float, default is 0.93 + n_comp: int + number of principal components kept after regularisation - Returns - ------- - tuple - 3D tuple: - - matrix of the principal components (PCs) of shape (N_pc, N_dat) - - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep) - - N_pc: number of principal components kept + covmat_pca: np.array + regularised covariance matrix computed from replicas + of theory predictions + sqrt_covmat_pca: np.array + cholesky decomposed covariance matrix """ - # get replicas from variance fit, used to estimate variance - reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members - - # something that could be tested: rescale feature matrix - # reps_scaled = reps.preprocessing.scale(reps) - # choose number of principal components (PCs) based on explained variance ratio - n_comp = 1 - for _ in range(reps.shape[0]): - pca = PCA(n_comp).fit(reps.T) - if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio: - break - n_comp += 1 + closures_th: list + law_th: ThPredictionsResult + pc_basis: np.array + n_comp: int + covmat_pca: np.array + sqrt_covmat_pca: np.array - # project feature matrix onto PCs - pc_reps = pca.components_ @ reps - return pca.components_, pc_reps, n_comp - - -def principal_components_bias_variance_dataset( +def internal_multiclosure_dataset_loader_pca( internal_multiclosure_dataset_loader, explained_variance_ratio=0.99 ): """ - Compute the bias and variance for one datasets - using the principal component reduced covariance matrix. + Similar to multiclosure.internal_multiclosure_dataset_loader but returns + PCA regularised covariance matrix, where the covariance matrix has been computed + from the replicas of the theory predictions. Parameters ---------- - internal_multiclosure_dataset_loader : tuple - Tuple containing the results of multiclosure fits + internal_multiclosure_dataset_loader: tuple + closure fits theory predictions, + underlying law theory predictions, + covariance matrix, + sqrt covariance matrix - principal_components_dataset : tuple - 3D tuple containing the principal components of the theory predictions + explained_variance_ratio: float, default is 0.99 Returns ------- - tuple - 3D tuple: - - biases: 1-D array of shape (Nfits,) - - variances: 1-D array of shape (Nfits, ) - - n_comp: number of principal components kept + PCAInternalMulticlosureLoader """ - closures_th, law_th, _, _ = internal_multiclosure_dataset_loader reps = np.asarray([th.error_members for th in closures_th]) # compute the covariance matrix of the theory predictions for each fit - _covmats = [np.cov(rep, rowvar=True) for rep in reps] + _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps] # compute the mean covariance matrix _covmat_mean = np.mean(_covmats, axis=0) @@ -107,7 +88,14 @@ def principal_components_bias_variance_dataset( # that explain the required variance if _covmat_mean.shape == (): - return np.nan, np.nan, 1 + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=1, + n_comp=1, + covmat_pca=_covmat_mean, + sqrt_covmat_pca=np.sqrt(_covmat_mean), + ) eighvals, eigvecs = np.linalg.eigh(_covmat_mean) idx = np.argsort(eighvals)[::-1] @@ -127,28 +115,80 @@ def principal_components_bias_variance_dataset( pc_basis = eigvecs[:, :n_comp] # compute the (PCA) regularized covariance matrix - covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis - - if n_comp <= 1: - return np.nan, np.nan, n_comp + covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis + + if n_comp == 1: + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=pc_basis, + n_comp=1, + covmat_pca=covmat_pca, + sqrt_covmat_pca=np.sqrt(covmat_pca), + ) # compute sqrt of pdf covariance matrix - sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf) + sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca) - # compute bias diff and project it onto space spanned by PCs - delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis] - # shape here is (Npc, Nfits) - delta_bias = pc_basis.T @ delta_bias + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=pc_basis, + n_comp=n_comp, + covmat_pca=covmat_pca, + sqrt_covmat_pca=sqrt_covmat_pca, + ) + + +def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca): + """ + Compute the bias and variance for one datasets + using the principal component reduced covariance matrix. - # compute biases, shape of biases is (Nfits) - biases = calc_chi2(sqrt_covmat_pdf, delta_bias) + Parameters + ---------- + internal_multiclosure_dataset_loader : tuple + Tuple containing the results of multiclosure fits - variances = [] - for i in range(reps.shape[0]): - diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)) - variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs))) + explained_variance_ratio : float, default is 0.99 + 3D tuple containing the principal components of the theory predictions - return biases, np.asarray(variances), n_comp + Returns + ------- + tuple + 3D tuple: + - biases: 1-D array of shape (Nfits,) + - variances: 1-D array of shape (Nfits, ) + - n_comp: number of principal components kept + """ + + pca_loader = internal_multiclosure_dataset_loader_pca + + reps = np.asarray([th.error_members for th in pca_loader.closures_th]) + + # compute bias diff and project it onto space spanned by PCs + delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis] + + if pca_loader.n_comp == 1: + delta_bias = pca_loader.pc_basis * delta_bias + biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2 + variances = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis * ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)) + else: + delta_bias = pca_loader.pc_basis.T @ delta_bias + biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias) + variances = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis.T @ ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs))) + + return biases, np.asarray(variances), pca_loader.n_comp principal_components_bias_variance_datasets = collect( diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 24bddc48ba..0b50f7afdc 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -16,7 +16,7 @@ from validphys import plotutils from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import ( - principal_components_dataset, + internal_multiclosure_dataset_loader_pca, ) @@ -142,9 +142,14 @@ def plot_lambdavalues_bias_variance_values( yield fig +internal_multiclosure_data_collected_loader = collect( + "internal_multiclosure_dataset_loader", ("data",) +) + + @figuregen def plot_l2_condition_number( - each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20 + each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20 ): """ Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio. @@ -162,11 +167,9 @@ def plot_l2_condition_number( each_dataset : list List of datasets - fits_pdf: list - list of validphys.core.PDF objects + internal_multiclosure_data_loader: list + list of internal_multiclosure_dataset_loader objects - variancepdf: validphys.core.PDF - PDF object for the variance Yields ------ @@ -177,25 +180,26 @@ def plot_l2_condition_number( # Explained variance ratio range evr_range = np.linspace(evr_min, evr_max, evr_n) - for ds in each_dataset: + for internal_multiclosure_dataset_loader, ds in zip( + internal_multiclosure_data_collected_loader, each_dataset + ): l2_cond = [] dof = [] for evr in evr_range: - _, pc_reps, n_comp = principal_components_dataset( - ds, fits_pdf, variancepdf, explained_variance_ratio=evr - ) - covmat_pdf = np.cov(pc_reps) + pca_loader = internal_multiclosure_dataset_loader_pca( + internal_multiclosure_dataset_loader, explained_variance_ratio=evr + ) # if the number of principal components is 1 then the covariance matrix is a scalar # and the condition number is not computed - if n_comp <= 1: + if pca_loader.n_comp == 1: l2_cond.append(np.nan) else: - l2_cond.append(np.linalg.cond(covmat_pdf)) + l2_cond.append(np.linalg.cond(pca_loader.covmat_pca)) - dof.append(n_comp) + dof.append(pca_loader.n_comp) fig, ax1 = plotutils.subplots() ax1.plot(evr_range, l2_cond, label="L2 Condition Number") From 5606f3e362ee1216d364cc2aa2eacba8b07b637e Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 23 Apr 2024 16:54:34 +0100 Subject: [PATCH 33/61] removed sklearn dep from conda recipe meta file --- conda-recipe/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a50e540f40..9d1b6376e3 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -44,7 +44,6 @@ requirements: - sphinx_rtd_theme >0.5 - sphinxcontrib-bibtex - ruamel.yaml <0.18 - - scikit-learn >=1.4.1 test: requires: From ee8bf0983d3bc4eacef2a21728f3f7ad09e77a46 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:22:22 +0200 Subject: [PATCH 34/61] removed variancepdf as unused --- .../examples/pca_bias_variance_report.yaml | 20 +++++++++---------- validphys2/src/validphys/config.py | 1 + .../src/validphys/scripts/vp_multiclosure.py | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 1a9a506770..428a684754 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -14,10 +14,6 @@ dataset_inputs: - {dataset: HERA_CC_318GEV_EP-SIGMARED, variant: legacy} -# variance pdf is used to compute the variance of fits -# the assumption is that fits have the same variance (needs to be checked) -variancepdf: 230124_dis_ict_lam02_fs_122996 - theoryid: 200 use_cuts: internal @@ -28,13 +24,15 @@ t0pdfset: 210223-mw-000_fakepdf explained_variance_ratio: 0.99 fits: - - 250124_dis_ict_lam04_fs_833288 - - 250124_dis_ict_lam04_fs_85029 - - 250124_dis_ict_lam04_fs_856511 - - 250124_dis_ict_lam04_fs_895365 - - 250124_dis_ict_lam04_fs_902027 - - 250124_dis_ict_lam04_fs_911964 - - 250124_dis_ict_lam04_fs_99031 + - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295 + - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750 + - 25_5_2023_19_50_27_dis_pt1_mnc_commit_4d5d473c_filterseed_754299 + - 25_5_2023_19_51_19_dis_pt1_mnc_commit_4d5d473c_filterseed_361864 + - 25_5_2023_19_52_54_dis_pt1_mnc_commit_4d5d473c_filterseed_516909 + - 25_5_2023_19_52_7_dis_pt1_mnc_commit_4d5d473c_filterseed_539528 + - 25_5_2023_19_53_41_dis_pt1_mnc_commit_4d5d473c_filterseed_307907 + - 25_5_2023_19_54_28_dis_pt1_mnc_commit_4d5d473c_filterseed_279829 + template_text: | diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index bdd94e213c..4cd55e6561 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -1065,6 +1065,7 @@ def produce_reweight_all_datasets(self, experiments): ret.append({"reweighting_experiments": [single_exp], "dataset_input": dsinput}) return ret + def produce_pdf_id(self, pdf) -> str: """Return a string containing the PDF's LHAPDF ID""" return pdf.name diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py index 5d1aa7013a..e87a0003d4 100644 --- a/validphys2/src/validphys/scripts/vp_multiclosure.py +++ b/validphys2/src/validphys/scripts/vp_multiclosure.py @@ -126,7 +126,7 @@ def complete_lambdavaluesmapping(self): list_lambdas = [] for lambdas in settings: lambdasetting = {} - for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + for set in ["t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: lambdasetting[set] = lambdas[set] list_lambdas.append(lambdasetting) autosettings["lambdavalues"] = list_lambdas @@ -143,7 +143,7 @@ def get_config(self): complete_mapping.update(self.complete_lambdavaluesmapping()) c["meta"] = complete_mapping["meta"] for lambdas, lambdas_mapping in zip(c["lambdavalues"], complete_mapping["lambdavalues"]): - for set in ["variancepdf", "t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: + for set in ["t0pdfset", "explained_variance_ratio", "label", "fits", "fit"]: lambdas[set] = lambdas_mapping[set] for set in ["current", "reference"]: c["compare_settings"][set] = complete_mapping["compare_settings"][set] From 7144ac9292e9821bcb41970e5222b572c141147b Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:22:44 +0200 Subject: [PATCH 35/61] added check_multifit_replicas check --- .../multiclosure_inconsistent.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index f8820328b0..ec2971033b 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -14,6 +14,7 @@ from validphys import covmats from validphys.calcutils import calc_chi2 from validphys.results import ThPredictionsResult +from validphys.closuretest.closure_checks import check_multifit_replicas from reportengine import collect @@ -52,8 +53,12 @@ class PCAInternalMulticlosureLoader: sqrt_covmat_pca: np.array +@check_multifit_replicas def internal_multiclosure_dataset_loader_pca( - internal_multiclosure_dataset_loader, explained_variance_ratio=0.99 + internal_multiclosure_dataset_loader, + explained_variance_ratio=0.99, + _internal_max_reps=None, + _internal_min_reps=20, ): """ Similar to multiclosure.internal_multiclosure_dataset_loader but returns @@ -70,6 +75,14 @@ def internal_multiclosure_dataset_loader_pca( explained_variance_ratio: float, default is 0.99 + _internal_max_reps: int, default is None + Maximum number of replicas used in the fits + this is needed to check that the number of replicas is the same for all fits + + _internal_min_reps: int, default is 20 + Minimum number of replicas used in the fits + this is needed to check that the number of replicas is the same for all fits + Returns ------- PCAInternalMulticlosureLoader From ce5ff2dc88a847388a6b3f641a62abb5c2b8b801 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 12:17:46 +0200 Subject: [PATCH 36/61] use plotting dataset labels for rbv vs lambda titles --- .../multiclosure_inconsistent_output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 0b50f7afdc..0a35a3e299 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -137,7 +137,8 @@ def plot_lambdavalues_bias_variance_values( ) ax.set_ylabel(r"$R_{bv}$") ax.set_xlabel(r"$\lambda$") - ax.set_title(f"Ratio bias variance: {str(ds)}") + + ax.set_title(f"{ds.commondata.metadata.plotting.dataset_label}") yield fig From 328a8926662e383cdd2cba0e5f6255d5a0bda5e4 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 10 Jun 2024 16:13:50 +0200 Subject: [PATCH 37/61] added hlines for rbv = 1 --- .../inconsistent_closuretest/multiclosure_inconsistent_output.py | 1 + 1 file changed, 1 insertion(+) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 0a35a3e299..9d923def5d 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -135,6 +135,7 @@ def plot_lambdavalues_bias_variance_values( color="blue", fmt='o', ) + ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--") ax.set_ylabel(r"$R_{bv}$") ax.set_xlabel(r"$\lambda$") From 1f4eacc00a910f94ef51ef7799b6e501a125d4f1 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 11 Jun 2024 17:59:22 +0200 Subject: [PATCH 38/61] added bootstrapped_internal_multiclosure_dataset_loader for tuple of bootstrapped multiclosuretests --- .../src/validphys/closuretest/multiclosure.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 496bf22772..b4681cc988 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -413,6 +413,67 @@ def _bootstrap_multiclosure_fits( return (boot_ths, *input_tuple) +def bootstrapped_internal_multiclosure_dataset_loader( + internal_multiclosure_dataset_loader, n_fit_max, n_fit, n_rep_max, n_rep, n_boot_multiclosure, rng_seed_mct_boot, use_repeats=True, +): + """ + Returns a tuple of internal_multiclosure_dataset_loader objects + each of which is a bootstrap resample of the original dataset + + Parameters + ---------- + internal_multiclosure_dataset_loader: tuple + closure fits theory predictions, + underlying law theory predictions, + covariance matrix, + sqrt covariance matrix + + n_fit_max: int + maximum number of fits, should be smaller or equal to number of + multiclosure fits + + n_fit: int + number of fits to draw for each resample + + n_rep_max: int + maximum number of replicas, should be smaller or equal to number of + replicas in each fit + + n_rep: int + number of replicas to draw for each resample + + n_boot_multiclosure: int + number of bootstrap resamples to perform + + rng_seed_mct_boot: int + seed for random number generator + + use_repeats: bool, default is True + whether to allow repeated fits and replicas in each resample + + Returns + ------- + resampled_multiclosure: tuple of shape (n_boot_multiclosure,) + tuple of internal_multiclosure_dataset_loader objects each of which + is a bootstrap resample of the original dataset + + """ + rng = np.random.RandomState(seed=rng_seed_mct_boot) + return tuple([ + _bootstrap_multiclosure_fits( + internal_multiclosure_dataset_loader, + rng=rng, + n_fit_max=n_fit_max, + n_fit=n_fit, + n_rep_max=n_rep_max, + n_rep=n_rep, + use_repeats=use_repeats, + ) + for _ in range(n_boot_multiclosure) + ] + ) + + def bias_variance_resampling_dataset( internal_multiclosure_dataset_loader, n_fit_samples, From 8689a186ae0ee6ff0d05e9a48fd3dbdd5a974557 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 11 Jun 2024 17:59:45 +0200 Subject: [PATCH 39/61] bootstrap of PCA regularised multiclosure tests --- .../multiclosure_inconsistent.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index ec2971033b..126b71d8b3 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -15,6 +15,7 @@ from validphys.calcutils import calc_chi2 from validphys.results import ThPredictionsResult from validphys.closuretest.closure_checks import check_multifit_replicas +from validphys.closuretest.multiclosure import bootstrapped_internal_multiclosure_dataset_loader from reportengine import collect @@ -153,6 +154,47 @@ def internal_multiclosure_dataset_loader_pca( ) +def bootstrapped_internal_multiclosure_dataset_loader_pca( + internal_multiclosure_dataset_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats=True, + explained_variance_ratio=0.99, + _internal_max_reps=None, + _internal_min_reps=20, +): + """ + Similar to multiclosure.bootstrapped_internal_multiclosure_dataset_loader but returns + PCA regularised covariance matrix, where the covariance matrix has been computed + from the replicas of the theory predictions. + """ + + # get bootstrapped internal multiclosure dataset loader + bootstrap_imdl = bootstrapped_internal_multiclosure_dataset_loader( + internal_multiclosure_dataset_loader, + n_fit_max=n_fit_max, + n_fit=n_fit, + n_rep_max=n_rep_max, + n_rep=n_rep, + n_boot_multiclosure=n_boot_multiclosure, + rng_seed_mct_boot=rng_seed_mct_boot, + use_repeats=use_repeats, + ) + + # PCA regularise all the bootstrapped internal multiclosure dataset loaders + bootstrap_imdl_pca = [ + internal_multiclosure_dataset_loader_pca( + imdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps + ) + for imdl in bootstrap_imdl + ] + return tuple(bootstrap_imdl_pca) + + def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca): """ Compute the bias and variance for one datasets From 2690bc97211ece8fc6b08aa24f371c4b3a431b72 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 11 Jun 2024 18:10:40 +0200 Subject: [PATCH 40/61] bootstrap for internal_data_loadeer objecgs --- .../multiclosure_inconsistent.py | 31 +++++++++ .../src/validphys/closuretest/multiclosure.py | 67 ++++++++++++++----- 2 files changed, 80 insertions(+), 18 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 126b71d8b3..3a8d057bc8 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -195,6 +195,37 @@ def bootstrapped_internal_multiclosure_dataset_loader_pca( return tuple(bootstrap_imdl_pca) +def bootstrapped_internal_multiclosure_data_loader_pca( + internal_multiclosure_data_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats=True, + explained_variance_ratio=0.99, + _internal_max_reps=None, + _internal_min_reps=20, +): + """ + Same as bootstrapped_internal_multiclosure_dataset_loader_pca but for all the data. + """ + return bootstrapped_internal_multiclosure_dataset_loader_pca( + internal_multiclosure_data_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats, + explained_variance_ratio, + _internal_max_reps, + _internal_min_reps, + ) + + def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca): """ Compute the bias and variance for one datasets diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index b4681cc988..28b87baaea 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -414,7 +414,14 @@ def _bootstrap_multiclosure_fits( def bootstrapped_internal_multiclosure_dataset_loader( - internal_multiclosure_dataset_loader, n_fit_max, n_fit, n_rep_max, n_rep, n_boot_multiclosure, rng_seed_mct_boot, use_repeats=True, + internal_multiclosure_dataset_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats=True, ): """ Returns a tuple of internal_multiclosure_dataset_loader objects @@ -429,19 +436,19 @@ def bootstrapped_internal_multiclosure_dataset_loader( sqrt covariance matrix n_fit_max: int - maximum number of fits, should be smaller or equal to number of + maximum number of fits, should be smaller or equal to number of multiclosure fits n_fit: int number of fits to draw for each resample - + n_rep_max: int - maximum number of replicas, should be smaller or equal to number of + maximum number of replicas, should be smaller or equal to number of replicas in each fit - + n_rep: int number of replicas to draw for each resample - + n_boot_multiclosure: int number of bootstrap resamples to perform @@ -459,18 +466,42 @@ def bootstrapped_internal_multiclosure_dataset_loader( """ rng = np.random.RandomState(seed=rng_seed_mct_boot) - return tuple([ - _bootstrap_multiclosure_fits( - internal_multiclosure_dataset_loader, - rng=rng, - n_fit_max=n_fit_max, - n_fit=n_fit, - n_rep_max=n_rep_max, - n_rep=n_rep, - use_repeats=use_repeats, - ) - for _ in range(n_boot_multiclosure) - ] + return tuple( + [ + _bootstrap_multiclosure_fits( + internal_multiclosure_dataset_loader, + rng=rng, + n_fit_max=n_fit_max, + n_fit=n_fit, + n_rep_max=n_rep_max, + n_rep=n_rep, + use_repeats=use_repeats, + ) + for _ in range(n_boot_multiclosure) + ] + ) + + +def bootstrapped_internal_multiclosure_data_loader( + internal_multiclosure_data_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats=True, +): + """Like bootstrapped_internal_multiclosure_dataset_loader except for all data""" + return bootstrapped_internal_multiclosure_dataset_loader( + internal_multiclosure_data_loader, + n_fit_max, + n_fit, + n_rep_max, + n_rep, + n_boot_multiclosure, + rng_seed_mct_boot, + use_repeats, ) From cc77ed9ea2a9be3c2a34163796e58ae15152a4fe Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 12 Jun 2024 09:57:34 +0200 Subject: [PATCH 41/61] added bootstrapped_principal_components_bias_variance_dataset --- .../multiclosure_inconsistent.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 3a8d057bc8..42c3d6423a 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -10,6 +10,7 @@ import numpy as np import dataclasses +import pandas as pd from validphys import covmats from validphys.calcutils import calc_chi2 @@ -280,3 +281,33 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa principal_components_bias_variance_datasets = collect( "principal_components_bias_variance_dataset", ("data",) ) + + +def bootstrapped_principal_components_bias_variance_dataset( + bootstrapped_internal_multiclosure_dataset_loader_pca, dataset +): + """ + Computes Bias and Variance for each bootstrap sample. + Returns a DataFrame with the results. + """ + boot_bias_var_samples = [] + for i, boot_imdl_pca in enumerate(bootstrapped_internal_multiclosure_dataset_loader_pca): + bias, var, n_comp = principal_components_bias_variance_dataset(boot_imdl_pca) + boot_bias_var_samples.append( + { + "bias": np.mean(bias), + "variance": np.mean(var), + "n_comp": n_comp, + "dataset": str(dataset), + "bootstrap_index": i, + } + ) + + df = pd.DataFrame.from_records( + boot_bias_var_samples, + index="bootstrap_index", + columns=("bootstrap_index", "dataset", "n_comp", "bias", "variance"), + ) + + df.columns = ["dataset", "n_comp", "bias", "variance"] + return df From 493e769da61e1ba7d0d4d02adefb28d74040f4b8 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 12 Jun 2024 14:35:28 +0200 Subject: [PATCH 42/61] added bootstrap table to report --- .../multiclosure_inconsistent.py | 5 ++ .../multiclosure_inconsistent_output.py | 63 +++++++++++++++++++ .../multiclosure_analysis.yaml | 9 +++ .../pca_template.md | 3 + 4 files changed, 80 insertions(+) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 42c3d6423a..3728b3d7cb 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -311,3 +311,8 @@ def bootstrapped_principal_components_bias_variance_dataset( df.columns = ["dataset", "n_comp", "bias", "variance"] return df + + +bootstrapped_principal_components_bias_variance_data = collect( + "bootstrapped_principal_components_bias_variance_dataset", ("data",) +) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 9d923def5d..eef98cdca5 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -99,6 +99,69 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea ) +@table +def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_bias_variance_data): + """ + Compute the bias, variance, ratio and sqrt(ratio) for each dataset + and return a DataFrame with the results. + Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation + of the bootstrap uncertainty on bias and variance. + """ + records = [] + for boot_ds in bootstrapped_principal_components_bias_variance_data: + df = boot_ds + mean_bias = df["bias"].mean() + mean_variance = df["variance"].mean() + mean_ratio = mean_bias / mean_variance + bootstrap_unc_bias = df["bias"].std() + + # gaussian error propagation for the ratio of the means uncertainty + # only consider bias as source of uncertainty for the ratio (variance is almost constant) + bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2) + sqrt_ratio = np.sqrt(mean_ratio) + # gaussian error propagation for the sqrt of the ratio + bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio) + + records.append( + dict( + dataset=df["dataset"].iloc[0], + mean_dof=df.n_comp.mean(), + bias=mean_bias, + variance=mean_variance, + ratio=mean_ratio, + error_ratio=bootstrap_unc_ratio, + ratio_sqrt=sqrt_ratio, + error_ratio_sqrt=bootstrap_unc_sqrt_ratio, + ) + ) + + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=( + "dataset", + "mean_dof", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ), + ) + df.columns = [ + "mean_dof", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ] + + return df + + @figuregen def plot_lambdavalues_bias_variance_values( lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index cf27fa9796..e46f686c99 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -154,6 +154,15 @@ theoryid: from_: theory use_cuts: internal +# bootstrap settings +n_fit_max: 2 +n_fit: 10 + +n_rep_max: 100 +n_rep: 60 +n_boot_multiclosure: 60 +rng_seed_mct_boot: 1234 + lambdavalues: - label: "LAMBDA02" lambda_value: 0.2 diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md index 2f308bd701..f873f9053f 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/pca_template.md @@ -3,5 +3,8 @@ ## Table of bias and variance {@table_bias_variance_datasets@} +## Bootstrap Table of bias and variance +{@bootstrapped_table_bias_variance_datasets@} + ## L2 condition number {@plot_l2_condition_number@} From fb4fb4e73b529ad7f3c14f26de50ace111cbbb4a Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 12 Jun 2024 14:36:46 +0200 Subject: [PATCH 43/61] changed defaults of bootstrap --- .../multiclosure_analysis.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml index e46f686c99..a068bcc003 100644 --- a/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/multiclosure_analysis.yaml @@ -155,12 +155,12 @@ theoryid: use_cuts: internal # bootstrap settings -n_fit_max: 2 -n_fit: 10 +n_fit_max: 25 +n_fit: 20 n_rep_max: 100 n_rep: 60 -n_boot_multiclosure: 60 +n_boot_multiclosure: 100 rng_seed_mct_boot: 1234 lambdavalues: From b15427311661826f4337f42da748191c182b2826 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Thu, 13 Jun 2024 14:51:08 +0200 Subject: [PATCH 44/61] added title for single data point in latex mode --- .../validphys/closuretest/multiclosure_output.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/closuretest/multiclosure_output.py b/validphys2/src/validphys/closuretest/multiclosure_output.py index 54a92579bc..75a6d006c2 100644 --- a/validphys2/src/validphys/closuretest/multiclosure_output.py +++ b/validphys2/src/validphys/closuretest/multiclosure_output.py @@ -855,19 +855,23 @@ def plot_bias_variance_distributions( yield fig @figuregen -def xq2_data_prcs_maps(xq2_data_map): +def xq2_data_prcs_maps(xq2_data_map,each_dataset): keys = ["std_devs","xi"] - for elem in xq2_data_map: + for j,elem in enumerate(xq2_data_map): + for k in keys: + if k == "std_devs": + title = r"$R_{bv}$" + if k == "xi": + title = r"$\xi$" fig, ax = plotutils.subplots() im = ax.scatter(elem['x_coords'],elem['Q_coords'], c=(np.asarray(elem[k])), - cmap='viridis', - label = elem['name']) - fig.colorbar(im,label=k) + cmap='viridis') + fig.colorbar(im,label=title) ax.set_xscale('log') # Set x-axis to log scale ax.set_yscale('log') # Set y-axis to log scale ax.set_xlabel('x') ax.set_ylabel('Q2') - ax.set_title(elem["name"]+" "+k) + ax.set_title(each_dataset[j].commondata.metadata.plotting.dataset_label) yield fig From a1d6a08e09473a4a989a0bb947daed06aebc365c Mon Sep 17 00:00:00 2001 From: andreab1997 Date: Tue, 18 Jun 2024 15:11:50 +0200 Subject: [PATCH 45/61] Add PCA on corr matrix on full dataset --- .../multiclosure_inconsistent.py | 149 ++++++++++++++++++ .../multiclosure_inconsistent_output.py | 30 +++- 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 3728b3d7cb..4037315c60 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -153,6 +153,108 @@ def internal_multiclosure_dataset_loader_pca( covmat_pca=covmat_pca, sqrt_covmat_pca=sqrt_covmat_pca, ) + +@check_multifit_replicas +def internal_multiclosure_data_loader_pca( + internal_multiclosure_data_loader, + explained_variance_ratio=0.99, + _internal_max_reps=None, + _internal_min_reps=20, +): + """ + Like multiclosure.internal_multiclosure_dataset_loader_pca except for all data + + Parameters + ---------- + internal_multiclosure_data_loader: tuple + closure fits theory predictions, + underlying law theory predictions, + covariance matrix, + sqrt covariance matrix + + explained_variance_ratio: float, default is 0.99 + + _internal_max_reps: int, default is None + Maximum number of replicas used in the fits + this is needed to check that the number of replicas is the same for all fits + + _internal_min_reps: int, default is 20 + Minimum number of replicas used in the fits + this is needed to check that the number of replicas is the same for all fits + + Returns + ------- + PCAInternalMulticlosureLoader + """ + closures_th, law_th, _, _ = internal_multiclosure_data_loader + reps = np.asarray([th.error_members for th in closures_th]) + + # compute the covariance matrix of the theory predictions for each fit + _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps] + # compute the mean covariance matrix + _covmat_mean = np.mean(_covmats, axis=0) + # Keep the sqrt of the diagonals to reconstruct the covmat later + D = np.sqrt(np.diag(_covmat_mean)) + + # compute the correlation matrix of the theory predictions for each fit + _corrmats = [np.corrcoef(rep, rowvar=True, bias=True) for rep in reps] + _corrmat_mean = np.mean(_corrmats, axis=0) + # diagonalize the mean correlation matrix and only keep the principal components + # that explain the required variance + + if _covmat_mean.shape == (): + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=1, + n_comp=1, + covmat_pca=_covmat_mean, + sqrt_covmat_pca=np.sqrt(_covmat_mean), + ) + + eighvals, eigvecs = np.linalg.eigh(_corrmat_mean) + idx = np.argsort(eighvals)[::-1] + # sort eigenvalues from largest to smallest + eigvecs = eigvecs[:, idx] + eighvals = eighvals[idx] + eighvals_norm = eighvals / eighvals.sum() + + # choose components to keep based on EVR + n_comp = 1 + for _ in range(eighvals.shape[0]): + if np.sum(eighvals_norm[:n_comp]) >= explained_variance_ratio: + break + n_comp += 1 + # get the principal components + pc_basis = eigvecs[:, :n_comp] + + # compute the (PCA) regularized correlation matrix + corrmat_pca = pc_basis.T @ _corrmat_mean @ pc_basis + # project the diagonal matrix into the PCA space + D_pca = pc_basis.T @ np.diag(D) @ pc_basis + # compute the (PCA) regularized covariance matrix + covmat_pca = D_pca @ corrmat_pca @ D_pca + if n_comp == 1: + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=pc_basis, + n_comp=1, + covmat_pca=covmat_pca, + sqrt_covmat_pca=np.sqrt(covmat_pca), + ) + + # compute sqrt of pdf covariance matrix + sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca) + + return PCAInternalMulticlosureLoader( + closures_th=closures_th, + law_th=law_th, + pc_basis=pc_basis, + n_comp=n_comp, + covmat_pca=covmat_pca, + sqrt_covmat_pca=sqrt_covmat_pca, + ) def bootstrapped_internal_multiclosure_dataset_loader_pca( @@ -277,6 +379,53 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa return biases, np.asarray(variances), pca_loader.n_comp +def principal_components_bias_variance_data(internal_multiclosure_data_loader_pca): + """ + Like principal_components_bias_variance_datasets but for all data + + Parameters + ---------- + internal_multiclosure_data_loader_pca : tuple + Tuple containing the results of multiclosure fits after pca regularization + + + Returns + ------- + tuple + 3D tuple: + - biases: 1-D array of shape (Nfits,) + - variances: 1-D array of shape (Nfits, ) + - n_comp: number of principal components kept + """ + + pca_loader = internal_multiclosure_data_loader_pca + + reps = np.asarray([th.error_members for th in pca_loader.closures_th]) + + # compute bias diff and project it onto space spanned by PCs + delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis] + + if pca_loader.n_comp == 1: + delta_bias = pca_loader.pc_basis * delta_bias + biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2 + variances = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis * ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2)) + else: + delta_bias = pca_loader.pc_basis.T @ delta_bias + biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias) + variances = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis.T @ ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs))) + + return biases, np.asarray(variances), pca_loader.n_comp + principal_components_bias_variance_datasets = collect( "principal_components_bias_variance_dataset", ("data",) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index eef98cdca5..892dd090c3 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -21,7 +21,7 @@ @table -def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): +def table_bias_variance_datasets(principal_components_bias_variance_datasets, principal_components_bias_variance_data, each_dataset): """ Compute the bias, variance, ratio and sqrt(ratio) for each dataset and return a DataFrame with the results. @@ -31,6 +31,9 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea principal_components_bias_variance_datasets: list List of tuples containing the values of bias, variance and number of degrees of freedom + + principal_components_bias_variance_data: list + Same of principal_components_bias_variance_datasets but for all the data each_dataset: list List of validphys.core.DataSetSpec @@ -41,6 +44,31 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] + + # First let's do the total + biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data + bias_tot = np.mean(biases_tot) + variance_tot = np.mean(variances_tot) + rbv_tot = bias_tot / variance_tot + # use gaussian uncertainty propagation + delta_rbv_tot = np.sqrt( + ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 + ) + sqrt_rbv_tot = np.sqrt(rbv_tot) + delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot) + records.append( + dict( + dataset="Total", + dof=n_comp_tot, + bias=bias_tot, + variance=variance_tot, + ratio=rbv_tot, + error_ratio=delta_rbv_tot, + ratio_sqrt=sqrt_rbv_tot, + error_ratio_sqrt=delta_sqrt_rbv_tot, + ) + ) + # Now we do dataset per dataset for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset bias = np.mean(biases) From d1808ec1a52d2a20fa93b7b4d3fb523e778d7c58 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Wed, 19 Jun 2024 12:09:00 +0200 Subject: [PATCH 46/61] fixed inconsistency with single data point and suggest different way to compute error bands after bootstrap --- .../multiclosure_inconsistent_output.py | 13 ++++++++++--- .../src/validphys/closuretest/multiclosure.py | 3 ++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 892dd090c3..e25cdc5d8e 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -142,14 +142,15 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ mean_variance = df["variance"].mean() mean_ratio = mean_bias / mean_variance bootstrap_unc_bias = df["bias"].std() - + # gaussian error propagation for the ratio of the means uncertainty # only consider bias as source of uncertainty for the ratio (variance is almost constant) bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2) - sqrt_ratio = np.sqrt(mean_ratio) + bootstrap_unc_ratio_alt = np.std(df["bias"]/df["variance"]) + sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"])) # gaussian error propagation for the sqrt of the ratio bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio) - + bootstrap_unc_sqrt_ratio_alt = np.std(np.sqrt(df["bias"]/df["variance"])) records.append( dict( dataset=df["dataset"].iloc[0], @@ -158,8 +159,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ variance=mean_variance, ratio=mean_ratio, error_ratio=bootstrap_unc_ratio, + error_ratio_alt=bootstrap_unc_ratio_alt, ratio_sqrt=sqrt_ratio, error_ratio_sqrt=bootstrap_unc_sqrt_ratio, + error_ratio_sqrt_alt=bootstrap_unc_sqrt_ratio_alt ) ) @@ -173,8 +176,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ "variance", "ratio", "error_ratio", + "error_ratio_alt", "ratio_sqrt", "error_ratio_sqrt", + "error_ratio_sqrt_alt" ), ) df.columns = [ @@ -183,8 +188,10 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ "variance", "ratio", "error_ratio", + "error_ratio_alt", "ratio_sqrt", "error_ratio_sqrt", + "error_ratio_sqrt_alt" ] return df diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index 28b87baaea..a9158b8c42 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -980,7 +980,8 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, xq2_map_obj = xq2map_with_cuts(commondata, cuts) coords = xq2_map_obj[2] central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader) - std_devs = np.std(central_deltas, axis = 0) + #std_devs = np.std(central_deltas, axis = 0) + std_devs = np.sqrt(np.mean(central_deltas**2, axis = 0)) means = np.mean(central_deltas, axis = 0) xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) From 7c20ec808927fd292c17a2beeee258918d5673b8 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Wed, 19 Jun 2024 15:19:21 +0200 Subject: [PATCH 47/61] added delta plots --- .../multiclosure_inconsistent.py | 42 ++++++++++++++++++- .../multiclosure_inconsistent_output.py | 35 +++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 4037315c60..164fae9e6c 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -246,7 +246,6 @@ def internal_multiclosure_data_loader_pca( # compute sqrt of pdf covariance matrix sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca) - return PCAInternalMulticlosureLoader( closures_th=closures_th, law_th=law_th, @@ -426,6 +425,47 @@ def principal_components_bias_variance_data(internal_multiclosure_data_loader_pc return biases, np.asarray(variances), pca_loader.n_comp +def principal_components_normalized_delta_data(internal_multiclosure_data_loader_pca): + """ + Compute for all data only the normalized delta after PCA regularization + + Parameters + ---------- + internal_multiclosure_data_loader_pca : tuple + Tuple containing the results of multiclosure fits after pca regularization + + + Returns + ------- + nd.array: deltas + """ + + pca_loader = internal_multiclosure_data_loader_pca + + reps = np.asarray([th.error_members for th in pca_loader.closures_th]) + + # compute bias diff and project it onto space spanned by PCs + delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis] + if pca_loader.n_comp == 1: + delta_bias = pca_loader.pc_basis * delta_bias + standard_deviations = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis * ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + standard_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))) + else: + delta_bias = pca_loader.pc_basis.T @ delta_bias + standard_deviations = [] + for i in range(reps.shape[0]): + diffs = pca_loader.pc_basis.T @ ( + reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) + ) + standard_deviations.append(np.std(diffs,axis=1)) + + + return (delta_bias/np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp + principal_components_bias_variance_datasets = collect( "principal_components_bias_variance_dataset", ("data",) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index e25cdc5d8e..55aa40c186 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -10,9 +10,10 @@ import pandas as pd import numpy as np -from reportengine.figure import figuregen +from reportengine.figure import figuregen, figure from reportengine.table import table from reportengine import collect +from scipy.stats import norm from validphys import plotutils from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import ( @@ -325,3 +326,35 @@ def plot_l2_condition_number( ax1.legend(lines, labels, loc='upper left') yield fig + +@figure +def delta_histogram(principal_components_normalized_delta_data,title, label_hist=None): + """ + Plot histogram of normalized delta regularized with PCA. + + Parameters + ---------- + principal_components_normalized_delta_data: tuple + + title: str + description of multiclosure + + label_hist: str + summary description of multiclosure + + + + Yields + ------ + fig + Figure object + """ + fig, ax = plotutils.subplots() + size = np.shape(principal_components_normalized_delta_data[0])[0] + ax.hist(principal_components_normalized_delta_data[0],density=True,bins=int(np.sqrt(size)),label=label_hist) + ax.set_title(r"$\delta$ distribution, "+ title +f", dof={principal_components_normalized_delta_data[1]}") + x = np.linspace(-3,3,100) + y = norm.pdf(x,loc=0,scale=1) + ax.plot(x,y,label="Standard gaussian") + ax.legend() + return fig From 6f2a8ef45bff37c6f6046e97c90503adfeec2369 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Fri, 21 Jun 2024 14:23:12 +0100 Subject: [PATCH 48/61] use consistent bootstrap def and separate table datasets from table data --- .../examples/pca_bias_variance_report.yaml | 2 + .../multiclosure_inconsistent_output.py | 120 ++++++++++++------ 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 428a684754..024fe417e9 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -42,6 +42,8 @@ template_text: | ## Table of bias and variance {@table_bias_variance_datasets@} + {@table_bias_variance_data@} + ## L2 condition number {@plot_l2_condition_number@} diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 55aa40c186..3e9687fc5a 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -20,9 +20,8 @@ internal_multiclosure_dataset_loader_pca, ) - @table -def table_bias_variance_datasets(principal_components_bias_variance_datasets, principal_components_bias_variance_data, each_dataset): +def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ Compute the bias, variance, ratio and sqrt(ratio) for each dataset and return a DataFrame with the results. @@ -32,9 +31,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr principal_components_bias_variance_datasets: list List of tuples containing the values of bias, variance and number of degrees of freedom - - principal_components_bias_variance_data: list - Same of principal_components_bias_variance_datasets but for all the data each_dataset: list List of validphys.core.DataSetSpec @@ -45,31 +41,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] - - # First let's do the total - biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data - bias_tot = np.mean(biases_tot) - variance_tot = np.mean(variances_tot) - rbv_tot = bias_tot / variance_tot - # use gaussian uncertainty propagation - delta_rbv_tot = np.sqrt( - ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 - ) - sqrt_rbv_tot = np.sqrt(rbv_tot) - delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot) - records.append( - dict( - dataset="Total", - dof=n_comp_tot, - bias=bias_tot, - variance=variance_tot, - ratio=rbv_tot, - error_ratio=delta_rbv_tot, - ratio_sqrt=sqrt_rbv_tot, - error_ratio_sqrt=delta_sqrt_rbv_tot, - ) - ) - # Now we do dataset per dataset for pc_bias_var_dataset, ds in zip(principal_components_bias_variance_datasets, each_dataset): biases, variances, n_comp = pc_bias_var_dataset bias = np.mean(biases) @@ -123,6 +94,79 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, pr return df + +@table +def table_bias_variance_data(principal_components_bias_variance_data): + """ + Same as table_bias_variance_datasets but for all the data, meaning that + the correlations between the datasets are taken into account. + + Parameters + ---------- + principal_components_bias_variance_data: list + Same of principal_components_bias_variance_datasets but for all the data + + each_dataset: list + List of validphys.core.DataSetSpec + + Returns + ------- + pd.DataFrame + DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset + """ + records = [] + + # First let's do the total + biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data + bias_tot = np.mean(biases_tot) + variance_tot = np.mean(variances_tot) + rbv_tot = bias_tot / variance_tot + # use gaussian uncertainty propagation + delta_rbv_tot = np.sqrt( + ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 + ) + sqrt_rbv_tot = np.sqrt(rbv_tot) + delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot) + records.append( + dict( + dataset="Total", + dof=n_comp_tot, + bias=bias_tot, + variance=variance_tot, + ratio=rbv_tot, + error_ratio=delta_rbv_tot, + ratio_sqrt=sqrt_rbv_tot, + error_ratio_sqrt=delta_sqrt_rbv_tot, + ) + ) + + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=( + "dataset", + "dof", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ), + ) + df.columns = [ + "dof", + "bias", + "variance", + "ratio", + "error ratio", + "sqrt(ratio)", + "error sqrt(ratio)", + ] + + return df + + lambdavalues_table_bias_variance_datasets = collect( "table_bias_variance_datasets", ("lambdavalues",) ) @@ -141,17 +185,15 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ df = boot_ds mean_bias = df["bias"].mean() mean_variance = df["variance"].mean() - mean_ratio = mean_bias / mean_variance - bootstrap_unc_bias = df["bias"].std() + mean_ratio = mean_bias / mean_variance # gaussian error propagation for the ratio of the means uncertainty # only consider bias as source of uncertainty for the ratio (variance is almost constant) - bootstrap_unc_ratio = np.sqrt((1 / mean_variance * bootstrap_unc_bias) ** 2) - bootstrap_unc_ratio_alt = np.std(df["bias"]/df["variance"]) + bootstrap_unc_ratio = np.std(df["bias"]/df["variance"]) sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"])) + # gaussian error propagation for the sqrt of the ratio - bootstrap_unc_sqrt_ratio = 0.5 * bootstrap_unc_ratio / np.sqrt(mean_ratio) - bootstrap_unc_sqrt_ratio_alt = np.std(np.sqrt(df["bias"]/df["variance"])) + bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"]/df["variance"])) records.append( dict( dataset=df["dataset"].iloc[0], @@ -160,10 +202,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ variance=mean_variance, ratio=mean_ratio, error_ratio=bootstrap_unc_ratio, - error_ratio_alt=bootstrap_unc_ratio_alt, ratio_sqrt=sqrt_ratio, error_ratio_sqrt=bootstrap_unc_sqrt_ratio, - error_ratio_sqrt_alt=bootstrap_unc_sqrt_ratio_alt ) ) @@ -177,10 +217,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ "variance", "ratio", "error_ratio", - "error_ratio_alt", "ratio_sqrt", "error_ratio_sqrt", - "error_ratio_sqrt_alt" ), ) df.columns = [ @@ -189,10 +227,8 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ "variance", "ratio", "error_ratio", - "error_ratio_alt", "ratio_sqrt", "error_ratio_sqrt", - "error_ratio_sqrt_alt" ] return df From 75773d40bbe529e7f3209e06c2bfadf7a339ebdb Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Fri, 21 Jun 2024 16:00:26 +0100 Subject: [PATCH 49/61] fixed PCA of correlation matrix for full dataset --- .../multiclosure_inconsistent.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 164fae9e6c..56af67ed0a 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -153,7 +153,8 @@ def internal_multiclosure_dataset_loader_pca( covmat_pca=covmat_pca, sqrt_covmat_pca=sqrt_covmat_pca, ) - + + @check_multifit_replicas def internal_multiclosure_data_loader_pca( internal_multiclosure_data_loader, @@ -188,20 +189,19 @@ def internal_multiclosure_data_loader_pca( """ closures_th, law_th, _, _ = internal_multiclosure_data_loader reps = np.asarray([th.error_members for th in closures_th]) - + # compute the covariance matrix of the theory predictions for each fit _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps] # compute the mean covariance matrix _covmat_mean = np.mean(_covmats, axis=0) # Keep the sqrt of the diagonals to reconstruct the covmat later D = np.sqrt(np.diag(_covmat_mean)) - - # compute the correlation matrix of the theory predictions for each fit - _corrmats = [np.corrcoef(rep, rowvar=True, bias=True) for rep in reps] - _corrmat_mean = np.mean(_corrmats, axis=0) + + # compute the correlation matrix + _corrmat_mean = _covmat_mean / np.outer(D, D) + # diagonalize the mean correlation matrix and only keep the principal components # that explain the required variance - if _covmat_mean.shape == (): return PCAInternalMulticlosureLoader( closures_th=closures_th, @@ -228,12 +228,10 @@ def internal_multiclosure_data_loader_pca( # get the principal components pc_basis = eigvecs[:, :n_comp] - # compute the (PCA) regularized correlation matrix - corrmat_pca = pc_basis.T @ _corrmat_mean @ pc_basis - # project the diagonal matrix into the PCA space - D_pca = pc_basis.T @ np.diag(D) @ pc_basis - # compute the (PCA) regularized covariance matrix - covmat_pca = D_pca @ corrmat_pca @ D_pca + # compute the (PCA) regularized covariance matrix by projecting the mean covariance matrix + # onto the principal components + covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis + if n_comp == 1: return PCAInternalMulticlosureLoader( closures_th=closures_th, @@ -378,6 +376,7 @@ def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loa return biases, np.asarray(variances), pca_loader.n_comp + def principal_components_bias_variance_data(internal_multiclosure_data_loader_pca): """ Like principal_components_bias_variance_datasets but for all data @@ -425,6 +424,7 @@ def principal_components_bias_variance_data(internal_multiclosure_data_loader_pc return biases, np.asarray(variances), pca_loader.n_comp + def principal_components_normalized_delta_data(internal_multiclosure_data_loader_pca): """ Compute for all data only the normalized delta after PCA regularization @@ -461,10 +461,9 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader diffs = pca_loader.pc_basis.T @ ( reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) ) - standard_deviations.append(np.std(diffs,axis=1)) - + standard_deviations.append(np.std(diffs, axis=1)) - return (delta_bias/np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp + return (delta_bias / np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp principal_components_bias_variance_datasets = collect( From cb61d2245d0a09f750f389ccb47ab6a74ac1cf18 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Fri, 21 Jun 2024 16:27:29 +0100 Subject: [PATCH 50/61] compute rbv scan using bootstrap uncertainty quantification --- .../multiclosure_inconsistent_output.py | 16 +++++-- .../lambdavalues_report.yaml | 48 +++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 3e9687fc5a..288647e9bd 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -221,22 +221,28 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ "error_ratio_sqrt", ), ) + df.columns = [ "mean_dof", "bias", "variance", "ratio", - "error_ratio", - "ratio_sqrt", - "error_ratio_sqrt", + "error ratio", + "sqrt(ratio)", + "error sqrt(ratio)", ] return df +lambdavalues_bootstrapped_table_bias_variance_datasets = collect( + "bootstrapped_table_bias_variance_datasets", ("lambdavalues",) +) + + @figuregen def plot_lambdavalues_bias_variance_values( - lambdavalues_table_bias_variance_datasets, lambdavalues, each_dataset + lambdavalues_bootstrapped_table_bias_variance_datasets, lambdavalues, each_dataset ): """ Plot sqrt of ratio bias variance as a function of lambda for each dataset. @@ -260,7 +266,7 @@ def plot_lambdavalues_bias_variance_values( for ds in each_dataset: fig, ax = plotutils.subplots() for i, lambdavalue in enumerate(lambdavalues): - df = lambdavalues_table_bias_variance_datasets[i] + df = lambdavalues_bootstrapped_table_bias_variance_datasets[i] df = df[df.index == str(ds)] ax.errorbar( diff --git a/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml b/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml new file mode 100644 index 0000000000..c3b91f855c --- /dev/null +++ b/validphys2/src/validphys/compareinconsistentclosuretemplates/lambdavalues_report.yaml @@ -0,0 +1,48 @@ +meta: + title: Lambda Values scan multiclosure report + author: Mark N. Costantini + keywords: [inconsistencies, multiclosure] + + +################################################################## +#OTHER ANALYSIS SETTINGS + +fit: 230124_dis_ict_lam02_fs_122996 +theory: + from_: fit +theoryid: + from_: theory + +dataset_inputs: + from_: fit + +use_cuts: internal + +# bootstrap settings +n_fit_max: 2 +n_fit: 20 + +n_rep_max: 100 +n_rep: 60 +n_boot_multiclosure: 100 +rng_seed_mct_boot: 1234 + +use_t0: True +t0pdfset: 210223-mw-000_fakepdf + +lambdavalues: + - label: "LAMBDA02" + lambda_value: 0.2 + use_t0: True + t0pdfset: 210223-mw-000_fakepdf + explained_variance_ratio: 0.99 + variancepdf: 230124_dis_ict_lam02_fs_122996 + fits: + - 230124_dis_ict_lam02_fs_122996 + - 230124_dis_ict_lam02_fs_152326 + + +template: ratio_biasvar_template.md + +actions_: + - report(main=true) From 07b16a6a52349cc3e7427fe9b799c3534820e367 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 25 Jun 2024 15:53:38 +0100 Subject: [PATCH 51/61] added full data bootstrapped table --- .../examples/pca_bias_variance_report.yaml | 15 +- .../multiclosure_inconsistent.py | 66 ++++++-- .../multiclosure_inconsistent_output.py | 142 ++++++++++++++---- 3 files changed, 179 insertions(+), 44 deletions(-) diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml index 024fe417e9..2b53ae59a1 100644 --- a/validphys2/examples/pca_bias_variance_report.yaml +++ b/validphys2/examples/pca_bias_variance_report.yaml @@ -23,6 +23,17 @@ t0pdfset: 210223-mw-000_fakepdf explained_variance_ratio: 0.99 +##### Bootstrap parameters +n_fit_max: 8 +n_fit: 8 + +n_rep_max: 100 +n_rep: 60 + +n_boot_multiclosure: 100 +rng_seed_mct_boot: 1 + + fits: - 25_5_2023_19_47_5_dis_pt1_mnc_commit_4d5d473c_filterseed_415295 - 25_5_2023_19_49_38_dis_pt1_mnc_commit_4d5d473c_filterseed_120750 @@ -42,7 +53,9 @@ template_text: | ## Table of bias and variance {@table_bias_variance_datasets@} - {@table_bias_variance_data@} + + ## Bootstrapped table for full dataset + {@bootstrapped_table_bias_variance_data@} ## L2 condition number {@plot_l2_condition_number@} diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 56af67ed0a..89a1679fe8 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -16,7 +16,10 @@ from validphys.calcutils import calc_chi2 from validphys.results import ThPredictionsResult from validphys.closuretest.closure_checks import check_multifit_replicas -from validphys.closuretest.multiclosure import bootstrapped_internal_multiclosure_dataset_loader +from validphys.closuretest.multiclosure import ( + bootstrapped_internal_multiclosure_dataset_loader, + bootstrapped_internal_multiclosure_data_loader, +) from reportengine import collect @@ -311,20 +314,27 @@ def bootstrapped_internal_multiclosure_data_loader_pca( """ Same as bootstrapped_internal_multiclosure_dataset_loader_pca but for all the data. """ - return bootstrapped_internal_multiclosure_dataset_loader_pca( + # get bootstrapped internal multiclosure dataset loader + bootstrap_imdl = bootstrapped_internal_multiclosure_data_loader( internal_multiclosure_data_loader, - n_fit_max, - n_fit, - n_rep_max, - n_rep, - n_boot_multiclosure, - rng_seed_mct_boot, - use_repeats, - explained_variance_ratio, - _internal_max_reps, - _internal_min_reps, + n_fit_max=n_fit_max, + n_fit=n_fit, + n_rep_max=n_rep_max, + n_rep=n_rep, + n_boot_multiclosure=n_boot_multiclosure, + rng_seed_mct_boot=rng_seed_mct_boot, + use_repeats=use_repeats, ) + # PCA regularise all the bootstrapped internal multiclosure dataset loaders + bootstrap_imdl_pca = [ + internal_multiclosure_data_loader_pca( + imdl, explained_variance_ratio, _internal_max_reps, _internal_min_reps + ) + for imdl in bootstrap_imdl + ] + return tuple(bootstrap_imdl_pca) + def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca): """ @@ -501,6 +511,36 @@ def bootstrapped_principal_components_bias_variance_dataset( return df -bootstrapped_principal_components_bias_variance_data = collect( +bootstrapped_principal_components_bias_variance_datasets = collect( "bootstrapped_principal_components_bias_variance_dataset", ("data",) ) + + +def bootstrapped_principal_components_bias_variance_data( + bootstrapped_internal_multiclosure_data_loader_pca, +): + """ + Computes Bias and Variance for each bootstrap sample. + Returns a DataFrame with the results. + """ + boot_bias_var_samples = [] + for i, boot_imdl_pca in enumerate(bootstrapped_internal_multiclosure_data_loader_pca): + bias, var, n_comp = principal_components_bias_variance_data(boot_imdl_pca) + boot_bias_var_samples.append( + { + "bias": np.mean(bias), + "variance": np.mean(var), + "n_comp": n_comp, + "data": "Full dataset", + "bootstrap_index": i, + } + ) + + df = pd.DataFrame.from_records( + boot_bias_var_samples, + index="bootstrap_index", + columns=("bootstrap_index", "dataset", "n_comp", "bias", "variance"), + ) + + df.columns = ["dataset", "n_comp", "bias", "variance"] + return df diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 288647e9bd..b7aa0f7ed8 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -20,6 +20,7 @@ internal_multiclosure_dataset_loader_pca, ) + @table def table_bias_variance_datasets(principal_components_bias_variance_datasets, each_dataset): """ @@ -94,7 +95,6 @@ def table_bias_variance_datasets(principal_components_bias_variance_datasets, ea return df - @table def table_bias_variance_data(principal_components_bias_variance_data): """ @@ -115,7 +115,7 @@ def table_bias_variance_data(principal_components_bias_variance_data): DataFrame containing the bias, variance, ratio and sqrt(ratio) for each dataset """ records = [] - + # First let's do the total biases_tot, variances_tot, n_comp_tot = principal_components_bias_variance_data bias_tot = np.mean(biases_tot) @@ -123,23 +123,24 @@ def table_bias_variance_data(principal_components_bias_variance_data): rbv_tot = bias_tot / variance_tot # use gaussian uncertainty propagation delta_rbv_tot = np.sqrt( - ((1 / variance_tot) * np.std(biases_tot)) ** 2 + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 - ) + ((1 / variance_tot) * np.std(biases_tot)) ** 2 + + (bias_tot / variance_tot**2 * np.std(variances_tot)) ** 2 + ) sqrt_rbv_tot = np.sqrt(rbv_tot) delta_sqrt_rbv_tot = 0.5 * delta_rbv_tot / np.sqrt(rbv_tot) records.append( - dict( - dataset="Total", - dof=n_comp_tot, - bias=bias_tot, - variance=variance_tot, - ratio=rbv_tot, - error_ratio=delta_rbv_tot, - ratio_sqrt=sqrt_rbv_tot, - error_ratio_sqrt=delta_sqrt_rbv_tot, - ) + dict( + dataset="Total", + dof=n_comp_tot, + bias=bias_tot, + variance=variance_tot, + ratio=rbv_tot, + error_ratio=delta_rbv_tot, + ratio_sqrt=sqrt_rbv_tot, + error_ratio_sqrt=delta_sqrt_rbv_tot, ) - + ) + df = pd.DataFrame.from_records( records, index="dataset", @@ -173,7 +174,9 @@ def table_bias_variance_data(principal_components_bias_variance_data): @table -def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_bias_variance_data): +def bootstrapped_table_bias_variance_datasets( + bootstrapped_principal_components_bias_variance_datasets, +): """ Compute the bias, variance, ratio and sqrt(ratio) for each dataset and return a DataFrame with the results. @@ -181,19 +184,19 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ of the bootstrap uncertainty on bias and variance. """ records = [] - for boot_ds in bootstrapped_principal_components_bias_variance_data: + for boot_ds in bootstrapped_principal_components_bias_variance_datasets: df = boot_ds mean_bias = df["bias"].mean() mean_variance = df["variance"].mean() - mean_ratio = mean_bias / mean_variance - + mean_ratio = mean_bias / mean_variance + # gaussian error propagation for the ratio of the means uncertainty # only consider bias as source of uncertainty for the ratio (variance is almost constant) - bootstrap_unc_ratio = np.std(df["bias"]/df["variance"]) - sqrt_ratio = np.mean(np.sqrt(df["bias"]/df["variance"])) + bootstrap_unc_ratio = np.std(df["bias"] / df["variance"]) + sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"])) # gaussian error propagation for the sqrt of the ratio - bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"]/df["variance"])) + bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"])) records.append( dict( dataset=df["dataset"].iloc[0], @@ -240,6 +243,75 @@ def bootstrapped_table_bias_variance_datasets(bootstrapped_principal_components_ ) +@table +def bootstrapped_table_bias_variance_data(bootstrapped_principal_components_bias_variance_data): + """ + Compute the bias, variance, ratio and sqrt(ratio) for a Datagroup + and return a DataFrame with the results. + Uncertainty on ratio and sqrt ratio is computed by Gaussian error propagation + of the bootstrap uncertainty on bias and variance. + """ + records = [] + + df = bootstrapped_principal_components_bias_variance_data + + mean_bias = df["bias"].mean() + mean_variance = df["variance"].mean() + mean_ratio = mean_bias / mean_variance + + # gaussian error propagation for the ratio of the means uncertainty + # only consider bias as source of uncertainty for the ratio (variance is almost constant) + bootstrap_unc_ratio = np.std(df["bias"] / df["variance"]) + sqrt_ratio = np.mean(np.sqrt(df["bias"] / df["variance"])) + + # gaussian error propagation for the sqrt of the ratio + bootstrap_unc_sqrt_ratio = np.std(np.sqrt(df["bias"] / df["variance"])) + records.append( + dict( + dataset=df["dataset"].iloc[0], + mean_dof=df.n_comp.mean(), + bias=mean_bias, + variance=mean_variance, + ratio=mean_ratio, + error_ratio=bootstrap_unc_ratio, + ratio_sqrt=sqrt_ratio, + error_ratio_sqrt=bootstrap_unc_sqrt_ratio, + ) + ) + + df = pd.DataFrame.from_records( + records, + index="dataset", + columns=( + "dataset", + "mean_dof", + "bias", + "variance", + "ratio", + "error_ratio", + "ratio_sqrt", + "error_ratio_sqrt", + ), + ) + + df.columns = [ + "mean_dof", + "bias", + "variance", + "ratio", + "error ratio", + "sqrt(ratio)", + "error sqrt(ratio)", + ] + + return df + + +lambdavalues_bootstrapped_table_bias_variance_datasets = collect( + "bootstrapped_table_bias_variance_data", ("lambdavalues",) +) + + @figuregen def plot_lambdavalues_bias_variance_values( lambdavalues_bootstrapped_table_bias_variance_datasets, lambdavalues, each_dataset @@ -369,10 +441,11 @@ def plot_l2_condition_number( yield fig + @figure -def delta_histogram(principal_components_normalized_delta_data,title, label_hist=None): +def delta_histogram(principal_components_normalized_delta_data, title, label_hist=None): """ - Plot histogram of normalized delta regularized with PCA. + Plot histogram of normalized delta regularized with PCA. Parameters ---------- @@ -383,7 +456,7 @@ def delta_histogram(principal_components_normalized_delta_data,title, label_hist label_hist: str summary description of multiclosure - + Yields @@ -393,10 +466,19 @@ def delta_histogram(principal_components_normalized_delta_data,title, label_hist """ fig, ax = plotutils.subplots() size = np.shape(principal_components_normalized_delta_data[0])[0] - ax.hist(principal_components_normalized_delta_data[0],density=True,bins=int(np.sqrt(size)),label=label_hist) - ax.set_title(r"$\delta$ distribution, "+ title +f", dof={principal_components_normalized_delta_data[1]}") - x = np.linspace(-3,3,100) - y = norm.pdf(x,loc=0,scale=1) - ax.plot(x,y,label="Standard gaussian") + ax.hist( + principal_components_normalized_delta_data[0], + density=True, + bins=int(np.sqrt(size)), + label=label_hist, + ) + ax.set_title( + r"$\delta$ distribution, " + + title + + f", dof={principal_components_normalized_delta_data[1]}" + ) + x = np.linspace(-3, 3, 100) + y = norm.pdf(x, loc=0, scale=1) + ax.plot(x, y, label="Standard gaussian") ax.legend() return fig From 099af95fb417c47d7fe9f2a104560820ddddb553 Mon Sep 17 00:00:00 2001 From: Giovanni De Crescenzo Date: Wed, 26 Jun 2024 15:21:37 +0200 Subject: [PATCH 52/61] slight change in def of delta hist --- .../multiclosure_inconsistent.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 89a1679fe8..87054e7432 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -458,22 +458,22 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis] if pca_loader.n_comp == 1: delta_bias = pca_loader.pc_basis * delta_bias - standard_deviations = [] + variances = [] for i in range(reps.shape[0]): diffs = pca_loader.pc_basis * ( reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) ) - standard_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))) + variances.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))) else: delta_bias = pca_loader.pc_basis.T @ delta_bias - standard_deviations = [] + variances = [] for i in range(reps.shape[0]): diffs = pca_loader.pc_basis.T @ ( reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) ) - standard_deviations.append(np.std(diffs, axis=1)) - - return (delta_bias / np.asarray(standard_deviations).T).flatten(), pca_loader.n_comp + variances.append(np.var(diffs, axis=1)) + variances = (np.mean(variances)) + return (delta_bias / np.sqrt(variances)).flatten(), pca_loader.n_comp principal_components_bias_variance_datasets = collect( From 89398ef589815ed49a3f238456e9c896faa8c4a2 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 26 Jun 2024 16:24:18 +0100 Subject: [PATCH 53/61] added definition of delta in line with eq. 2.22 --- .../multiclosure_inconsistent.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 87054e7432..ac92d43d0c 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -456,6 +456,10 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader # compute bias diff and project it onto space spanned by PCs delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis] + + # find basis that diagonalise covmat pca + eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca) + if pca_loader.n_comp == 1: delta_bias = pca_loader.pc_basis * delta_bias variances = [] @@ -463,17 +467,12 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader diffs = pca_loader.pc_basis * ( reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) ) - variances.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))) + std_deviations.append(np.sqrt(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))) else: - delta_bias = pca_loader.pc_basis.T @ delta_bias - variances = [] - for i in range(reps.shape[0]): - diffs = pca_loader.pc_basis.T @ ( - reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) - ) - variances.append(np.var(diffs, axis=1)) - variances = (np.mean(variances)) - return (delta_bias / np.sqrt(variances)).flatten(), pca_loader.n_comp + delta_bias = eigenvects.T @ (pca_loader.pc_basis.T @ delta_bias) + std_deviations = np.sqrt(eigvals)[:, None] + + return (delta_bias / std_deviations).flatten(), pca_loader.n_comp principal_components_bias_variance_datasets = collect( From 8d71a2e2cdcfe91d1f811011cdc5c31c476a9570 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 26 Jun 2024 16:27:43 +0100 Subject: [PATCH 54/61] added definition of delta in line with eq. 2.22 --- .../inconsistent_closuretest/multiclosure_inconsistent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index ac92d43d0c..939e4659de 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -462,7 +462,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader if pca_loader.n_comp == 1: delta_bias = pca_loader.pc_basis * delta_bias - variances = [] + std_deviations = [] for i in range(reps.shape[0]): diffs = pca_loader.pc_basis * ( reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True) From 8df13b2081a33e9604f07733b94c353a1f32a444 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 26 Jun 2024 19:11:05 +0100 Subject: [PATCH 55/61] added rbv scan for full dataset --- .../multiclosure_inconsistent_output.py | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index b7aa0f7ed8..08d82e5ca9 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -307,7 +307,7 @@ def bootstrapped_table_bias_variance_data(bootstrapped_principal_components_bias return df -lambdavalues_bootstrapped_table_bias_variance_datasets = collect( +lambdavalues_bootstrapped_table_bias_variance_data = collect( "bootstrapped_table_bias_variance_data", ("lambdavalues",) ) @@ -357,6 +357,45 @@ def plot_lambdavalues_bias_variance_values( yield fig +@figure +def plot_lambdavalues_bias_variance_values_full_data( + lambdavalues_bootstrapped_table_bias_variance_data, lambdavalues +): + """ + Plot sqrt of ratio bias variance as a function of lambda for each dataset. + + Parameters + ---------- + lambdavalues_bootstrapped_table_bias_variance_data: list + list of data frames computed as per table_bias_variance_data. + + lambdavalues: list + list specified in multiclosure_analysis.yaml + + Yields + ------ + figure + """ + fig, ax = plotutils.subplots() + for i, lambdavalue in enumerate(lambdavalues): + df = lambdavalues_bootstrapped_table_bias_variance_data[i] + + ax.errorbar( + lambdavalue["lambda_value"], + df["sqrt(ratio)"].values, + yerr=df["error sqrt(ratio)"].values, + color="blue", + fmt='o', + ) + ax.hlines(1, xmin=0, xmax=1.0, color="red", linestyle="--") + ax.set_ylabel(r"$R_{bv}$") + ax.set_xlabel(r"$\lambda$") + + ax.set_title("Full data") + + return fig + + internal_multiclosure_data_collected_loader = collect( "internal_multiclosure_dataset_loader", ("data",) ) From defcd9675ad14ec89cfd33af09f817d8ed6d5fbc Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Thu, 27 Jun 2024 14:41:22 +0100 Subject: [PATCH 56/61] added bootstrapped xi indicator function for full dataset --- .../multiclosure_inconsistent.py | 57 ++++++++- .../multiclosure_inconsistent_output.py | 34 ++++++ .../src/validphys/closuretest/multiclosure.py | 114 ++++++++++-------- 3 files changed, 153 insertions(+), 52 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 939e4659de..aa9a42f38c 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -19,6 +19,7 @@ from validphys.closuretest.multiclosure import ( bootstrapped_internal_multiclosure_dataset_loader, bootstrapped_internal_multiclosure_data_loader, + standard_indicator_function, ) from reportengine import collect @@ -459,7 +460,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader # find basis that diagonalise covmat pca eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca) - + if pca_loader.n_comp == 1: delta_bias = pca_loader.pc_basis * delta_bias std_deviations = [] @@ -480,6 +481,60 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader ) +def bootstrapped_principal_components_normalized_delta_data( + bootstrapped_internal_multiclosure_data_loader_pca, +): + """ + Compute the normalized deltas for each bootstrap sample. + + Parameters + ---------- + bootstrapped_internal_multiclosure_data_loader_pca: list + list of tuples containing the results of multiclosure fits after pca regularization + + Returns + ------- + list + list of tuples containing the normalized deltas and the number of principal components. + Each tuple corresponds to a bootstrap sample. + """ + normalised_deltas = [] + for boot_imdl_pca in bootstrapped_internal_multiclosure_data_loader_pca: + normalised_deltas.append(principal_components_normalized_delta_data(boot_imdl_pca)) + return normalised_deltas + + +def bootstrapped_indicator_function_data( + bootstrapped_principal_components_normalized_delta_data, nsigma=1 +): + """ + Compute the indicator function for each bootstrap sample. + + Parameters + ---------- + bootstrapped_principal_components_normalized_delta_data: list + list of tuples containing the normalized deltas and the number of principal components. + Each tuple corresponds to a bootstrap sample. + + nsigma: int, default is 1 + + Returns + ------- + 2-D tuple: + list + list of length N_boot and entrances are arrays of dim Npca x Nfits containing the indicator function for each bootstrap sample. + + float + average number of degrees of freedom + """ + indicator_list = [] + ndof_list = [] + for boot, ndof in bootstrapped_principal_components_normalized_delta_data: + indicator_list.append(standard_indicator_function(boot, nsigma)) + ndof_list.append(ndof) + return indicator_list, np.mean(np.asarray(ndof_list)) + + def bootstrapped_principal_components_bias_variance_dataset( bootstrapped_internal_multiclosure_dataset_loader_pca, dataset ): diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py index 08d82e5ca9..74bf7dbe7d 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py @@ -521,3 +521,37 @@ def delta_histogram(principal_components_normalized_delta_data, title, label_his ax.plot(x, y, label="Standard gaussian") ax.legend() return fig + + +@table +def table_xi_indicator_function_data(bootstrapped_indicator_function_data): + """ + Computes the bootstrap average and std of the indicator function for the data. + + Parameters + ---------- + bootstrapped_indicator_function_data: tuple + + + Returns + ------- + pd.DataFrame + DataFrame containing the average and std of the indicator function for the data. + """ + indicator_list, mean_dof = bootstrapped_indicator_function_data + + # average over data and fits within the bootstrap samples + mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list]) + + # take bootstrap expectation and variance + mean_xi = np.mean(mean_boot_vals) + std_xi = np.std(mean_boot_vals) + + records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)] + + df = pd.DataFrame.from_records( + records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi") + ) + + df.columns = ["mean_dof", "mean_xi", "std_xi"] + return df diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index a9158b8c42..ed1190e84a 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -7,6 +7,7 @@ ``multiclosure_output.py`` """ + import numpy as np import scipy.linalg as la import scipy.special as special @@ -116,9 +117,7 @@ def internal_multiclosure_data_loader( @check_multifit_replicas def fits_dataset_bias_variance( - internal_multiclosure_dataset_loader, - _internal_max_reps=None, - _internal_min_reps=20, + internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20 ): """For a single dataset, calculate the bias and variance for each fit and return tuple (bias, variance, n_data), where bias and variance are @@ -149,22 +148,22 @@ def fits_dataset_bias_variance( variances.append(np.mean(calc_chi2(sqrtcov, diffs))) return biases, np.asarray(variances), len(law_th) + @check_multifit_replicas def fits_normed_dataset_central_delta( - internal_multiclosure_dataset_loader, - _internal_max_reps=None, - _internal_min_reps=20): + internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20 +): """ For each fit calculate the difference between central expectation value and true val. Normalize this value by the variance of the differences between replicas and central expectation value (different - for each fit but expected to vary only a little). Each observable central exp value is + for each fit but expected to vary only a little). Each observable central exp value is expected to be gaussianly distributed around the true value set by the fakepdf """ closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader # The dimentions here are (fit, data point, replica) reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) # One could mask here some reps in order to avoid redundancy of information - #TODO + # TODO n_data = len(law_th) n_fits = np.shape(reps)[0] @@ -172,21 +171,19 @@ def fits_normed_dataset_central_delta( # There are n_fits pdf_covariances # flag to see whether to eliminate dataset for i in range(n_fits): - bias_diffs = np.mean(reps[i], axis = 1) - law_th.central_value + bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value # bias diffs in the for loop should have dim = (n_obs) - sigmas = np.sqrt(np.var(reps[i], axis = 1)) + sigmas = np.sqrt(np.var(reps[i], axis=1)) # var diffs has shape (n_obs,reps) - bias_diffs = bias_diffs/sigmas - + bias_diffs = bias_diffs / sigmas + deltas.append(bias_diffs.tolist()) # biases.shape = (n_fits, n_obs_cut/uncut) # variances.shape = (n_fits, n_obs_cut/uncut, reps) return np.asarray(deltas) -fits_datasets_bias_variance = collect( - "fits_dataset_bias_variance", ("data",) -) +fits_datasets_bias_variance = collect("fits_dataset_bias_variance", ("data",)) def expected_dataset_bias_variance(fits_dataset_bias_variance): @@ -200,9 +197,7 @@ def expected_dataset_bias_variance(fits_dataset_bias_variance): @check_multifit_replicas def fits_data_bias_variance( - internal_multiclosure_data_loader, - _internal_max_reps=None, - _internal_min_reps=20, + internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20 ): """Like `fits_dataset_bias_variance` but for all data""" return fits_dataset_bias_variance( @@ -350,11 +345,7 @@ def n_replica_samples(fits_pdf, _internal_max_reps=None, _internal_min_reps=20): _internal_max_reps. """ return list( - range( - _internal_min_reps, - _internal_max_reps + SAMPLING_INTERVAL, - SAMPLING_INTERVAL, - ) + range(_internal_min_reps, _internal_max_reps + SAMPLING_INTERVAL, SAMPLING_INTERVAL) ) @@ -370,13 +361,7 @@ def __init__(self, data): def _bootstrap_multiclosure_fits( - internal_multiclosure_dataset_loader, - rng, - n_fit_max, - n_fit, - n_rep_max, - n_rep, - use_repeats, + internal_multiclosure_dataset_loader, rng, n_fit_max, n_fit, n_rep_max, n_rep, use_repeats ): """Perform a single bootstrap resample of the multiclosure fits and return a proxy of the base internal object used by relevant estimator actions @@ -907,9 +892,7 @@ def total_bootstrap_xi(experiments_bootstrap_xi): def dataset_fits_bias_replicas_variance_samples( - internal_multiclosure_dataset_loader, - _internal_max_reps=None, - _internal_min_reps=20, + internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20 ): """For a single dataset, calculate the samples of chi2-quantities which are used to calculate the bias and variance for each fit. The output of this @@ -950,14 +933,10 @@ def dataset_fits_bias_replicas_variance_samples( def dataset_inputs_fits_bias_replicas_variance_samples( - internal_multiclosure_data_loader, - _internal_max_reps=None, - _internal_min_reps=20, + internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20 ): return dataset_fits_bias_replicas_variance_samples( - internal_multiclosure_data_loader, - _internal_max_reps=None, - _internal_min_reps=20, + internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20 ) @@ -966,9 +945,13 @@ def dataset_inputs_fits_bias_replicas_variance_samples( ) -def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, - _internal_max_reps=None, - _internal_min_reps=20): +def xq2_dataset_map( + commondata, + cuts, + internal_multiclosure_dataset_loader, + _internal_max_reps=None, + _internal_min_reps=20, +): """ Load in a dictionary all the specs of a dataset meaning: - ds name @@ -980,17 +963,46 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader, xq2_map_obj = xq2map_with_cuts(commondata, cuts) coords = xq2_map_obj[2] central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader) - #std_devs = np.std(central_deltas, axis = 0) - std_devs = np.sqrt(np.mean(central_deltas**2, axis = 0)) - means = np.mean(central_deltas, axis = 0) - xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader,False)) + # std_devs = np.std(central_deltas, axis = 0) + std_devs = np.sqrt(np.mean(central_deltas**2, axis=0)) + means = np.mean(central_deltas, axis=0) + xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False)) # for case of DY observables we have 2 (x,Q) for each experimental point if coords[0].shape[0] != std_devs.shape[0]: - std_devs = np.concatenate((std_devs,std_devs)) - means = np.concatenate((means,means)) - xi = np.concatenate((xi,xi)) - return {'x_coords':coords[0], 'Q_coords':coords[1],'std_devs':std_devs,'name':commondata.name,'process':commondata.process_type, 'means': means, 'xi': xi} + std_devs = np.concatenate((std_devs, std_devs)) + means = np.concatenate((means, means)) + xi = np.concatenate((xi, xi)) + return { + 'x_coords': coords[0], + 'Q_coords': coords[1], + 'std_devs': std_devs, + 'name': commondata.name, + 'process': commondata.process_type, + 'means': means, + 'xi': xi, + } + +xq2_data_map = collect("xq2_dataset_map", ("data",)) -xq2_data_map = collect("xq2_dataset_map",("data",)) \ No newline at end of file + +def standard_indicator_function(standard_variable, nsigma=1): + """ + Calculate the indicator function for a standardised variable. + + Parameters + ---------- + standard_variable: np.array + array of variables that have been standardised: (x - mu)/sigma + + nsigma: float + number of standard deviations to consider + + Returns + ------- + np.array + array of ones and zeros. If 1 then the variable is within nsigma standard deviations + from the mean, otherwise it is 0. + """ + return np.array(abs(standard_variable) < nsigma, dtype=int) From e03c29ee24590b800a506d0fcec182f975d8f40a Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 24 Jul 2024 11:08:57 +0100 Subject: [PATCH 57/61] removed unused import from vp_multiclosure.py script --- validphys2/src/validphys/scripts/vp_multiclosure.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/validphys2/src/validphys/scripts/vp_multiclosure.py b/validphys2/src/validphys/scripts/vp_multiclosure.py index e87a0003d4..904e861068 100644 --- a/validphys2/src/validphys/scripts/vp_multiclosure.py +++ b/validphys2/src/validphys/scripts/vp_multiclosure.py @@ -1,18 +1,10 @@ import sys -import os import logging -#TODO: Look into making these lazy imports -import prompt_toolkit -from prompt_toolkit.completion import WordCompleter - from reportengine.compat import yaml -from reportengine.colors import t from validphys.app import App -from validphys.loader import RemoteLoader from validphys import compareinconsistentclosuretemplates -from validphys.promptutils import confirm, KeywordsWithCache log = logging.getLogger(__name__) From 5b03cf4df5d491064712e91ee29a11f284b801ae Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 24 Jul 2024 11:10:01 +0100 Subject: [PATCH 58/61] removed new lines --- validphys2/src/validphys/kinematics.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/validphys2/src/validphys/kinematics.py b/validphys2/src/validphys/kinematics.py index 554dd401b5..5644771f27 100644 --- a/validphys2/src/validphys/kinematics.py +++ b/validphys2/src/validphys/kinematics.py @@ -12,16 +12,9 @@ from reportengine import collect from reportengine.checks import check_positive from reportengine.table import table - - from validphys.core import CutsPolicy from validphys.plotoptions import core as plotoptions_core - - - - - log = logging.getLogger(__name__) From ddc3c0d1acfa0f73b0a96f843bc6586788ec436b Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 24 Jul 2024 11:11:46 +0100 Subject: [PATCH 59/61] removed unused variables --- validphys2/src/validphys/closuretest/multiclosure.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/validphys2/src/validphys/closuretest/multiclosure.py b/validphys2/src/validphys/closuretest/multiclosure.py index ed1190e84a..93ab6cb06c 100644 --- a/validphys2/src/validphys/closuretest/multiclosure.py +++ b/validphys2/src/validphys/closuretest/multiclosure.py @@ -159,13 +159,12 @@ def fits_normed_dataset_central_delta( for each fit but expected to vary only a little). Each observable central exp value is expected to be gaussianly distributed around the true value set by the fakepdf """ - closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader + closures_th, law_th, _, _ = internal_multiclosure_dataset_loader # The dimentions here are (fit, data point, replica) reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th]) # One could mask here some reps in order to avoid redundancy of information # TODO - n_data = len(law_th) n_fits = np.shape(reps)[0] deltas = [] # There are n_fits pdf_covariances From e4fe43845e3bd3943cf6a2b8953e926bfd5d0743 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 24 Jul 2024 11:23:40 +0100 Subject: [PATCH 60/61] _covmats as array instead of list of arrays --- .../inconsistent_closuretest/multiclosure_inconsistent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index aa9a42f38c..60c4cf20c5 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -98,7 +98,7 @@ def internal_multiclosure_dataset_loader_pca( reps = np.asarray([th.error_members for th in closures_th]) # compute the covariance matrix of the theory predictions for each fit - _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps] + _covmats = np.array([np.cov(rep, rowvar=True, bias=True) for rep in reps]) # compute the mean covariance matrix _covmat_mean = np.mean(_covmats, axis=0) @@ -195,7 +195,7 @@ def internal_multiclosure_data_loader_pca( reps = np.asarray([th.error_members for th in closures_th]) # compute the covariance matrix of the theory predictions for each fit - _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps] + _covmats = np.array([np.cov(rep, rowvar=True, bias=True) for rep in reps]) # compute the mean covariance matrix _covmat_mean = np.mean(_covmats, axis=0) # Keep the sqrt of the diagonals to reconstruct the covmat later From 7cfdd9a10da65af98b51b64475a773fa881623b3 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Wed, 24 Jul 2024 11:28:02 +0100 Subject: [PATCH 61/61] added eigendecomposition function --- .../multiclosure_inconsistent.py | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py index 60c4cf20c5..ad4aab6ea4 100644 --- a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py +++ b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py @@ -25,6 +25,32 @@ from reportengine import collect +def eigendecomposition(covmat): + """ + Compute the eigendecomposition of a covariance matrix. + + Parameters + ---------- + covmat: np.array + covariance matrix + + Returns + ------- + tuple + 3D tuple containing the eigenvalues, eigenvectors and the normalized + eigenvalues. + Note that the eigenvalues are sorted from largest to smallest. + """ + eighvals, eigvecs = np.linalg.eigh(covmat) + idx = np.argsort(eighvals)[::-1] + # sort eigenvalues from largest to smallest + eigvecs = eigvecs[:, idx] + eighvals = eighvals[idx] + eighvals_norm = eighvals / eighvals.sum() + + return eighvals, eigvecs, eighvals_norm + + @dataclasses.dataclass(frozen=True) class PCAInternalMulticlosureLoader: """ @@ -116,12 +142,7 @@ def internal_multiclosure_dataset_loader_pca( sqrt_covmat_pca=np.sqrt(_covmat_mean), ) - eighvals, eigvecs = np.linalg.eigh(_covmat_mean) - idx = np.argsort(eighvals)[::-1] - # sort eigenvalues from largest to smallest - eigvecs = eigvecs[:, idx] - eighvals = eighvals[idx] - eighvals_norm = eighvals / eighvals.sum() + eighvals, eigvecs, eighvals_norm = eigendecomposition(_covmat_mean) # choose components to keep based on EVR n_comp = 1 @@ -216,12 +237,7 @@ def internal_multiclosure_data_loader_pca( sqrt_covmat_pca=np.sqrt(_covmat_mean), ) - eighvals, eigvecs = np.linalg.eigh(_corrmat_mean) - idx = np.argsort(eighvals)[::-1] - # sort eigenvalues from largest to smallest - eigvecs = eigvecs[:, idx] - eighvals = eighvals[idx] - eighvals_norm = eighvals / eighvals.sum() + eighvals, eigvecs, eighvals_norm = eigendecomposition(_corrmat_mean) # choose components to keep based on EVR n_comp = 1