added bootstrapped xi indicator function for full dataset
comane committed Jun 27, 2024
1 parent 2fbe6b8 commit 76001f4
Showing 3 changed files with 153 additions and 52 deletions.
@@ -19,6 +19,7 @@
from validphys.closuretest.multiclosure import (
bootstrapped_internal_multiclosure_dataset_loader,
bootstrapped_internal_multiclosure_data_loader,
standard_indicator_function,
)

from reportengine import collect
@@ -459,7 +460,7 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader

    # find the basis that diagonalises the PCA covmat
eigvals, eigenvects = np.linalg.eigh(pca_loader.covmat_pca)

if pca_loader.n_comp == 1:
delta_bias = pca_loader.pc_basis * delta_bias
std_deviations = []
@@ -480,6 +481,60 @@ def principal_components_normalized_delta_data(internal_multiclosure_data_loader
)


def bootstrapped_principal_components_normalized_delta_data(
    bootstrapped_internal_multiclosure_data_loader_pca,
):
    """
    Compute the normalized deltas for each bootstrap sample.

    Parameters
    ----------
    bootstrapped_internal_multiclosure_data_loader_pca: list
        list of tuples containing the results of multiclosure fits after PCA regularization

    Returns
    -------
    list
        list of tuples containing the normalized deltas and the number of principal components.
        Each tuple corresponds to a bootstrap sample.
    """
    normalised_deltas = []
    for boot_imdl_pca in bootstrapped_internal_multiclosure_data_loader_pca:
        normalised_deltas.append(principal_components_normalized_delta_data(boot_imdl_pca))
    return normalised_deltas


def bootstrapped_indicator_function_data(
    bootstrapped_principal_components_normalized_delta_data, nsigma=1
):
    """
    Compute the indicator function for each bootstrap sample.

    Parameters
    ----------
    bootstrapped_principal_components_normalized_delta_data: list
        list of tuples containing the normalized deltas and the number of principal components.
        Each tuple corresponds to a bootstrap sample.
    nsigma: int, default is 1
        number of standard deviations defining the indicator-function window.

    Returns
    -------
    tuple
        list
            list of length N_boot whose entries are arrays of shape (Npca, Nfits)
            containing the indicator function values for each bootstrap sample.
        float
            average number of degrees of freedom over the bootstrap samples.
    """
    indicator_list = []
    ndof_list = []
    for boot, ndof in bootstrapped_principal_components_normalized_delta_data:
        indicator_list.append(standard_indicator_function(boot, nsigma))
        ndof_list.append(ndof)
    return indicator_list, np.mean(np.asarray(ndof_list))
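
A minimal sketch of how this action behaves on synthetic input (the shapes, seed, and sample count below are illustrative only; the two functions above are assumed importable):

import numpy as np

rng = np.random.default_rng(0)
n_pca, n_fits = 5, 10
# each bootstrap sample mimics a (normalized deltas, ndof) tuple as documented above
boot_samples = [(rng.standard_normal((n_pca, n_fits)), n_pca) for _ in range(2)]

indicators, mean_ndof = bootstrapped_indicator_function_data(boot_samples)
# indicators is a list of two (5, 10) arrays of zeros and ones; mean_ndof == 5.0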


def bootstrapped_principal_components_bias_variance_dataset(
bootstrapped_internal_multiclosure_dataset_loader_pca, dataset
):
@@ -521,3 +521,37 @@ def delta_histogram(principal_components_normalized_delta_data, title, label_his
ax.plot(x, y, label="Standard gaussian")
ax.legend()
return fig


@table
def table_xi_indicator_function_data(bootstrapped_indicator_function_data):
    """
    Computes the bootstrap average and std of the indicator function for the data.

    Parameters
    ----------
    bootstrapped_indicator_function_data: tuple
        tuple of the list of bootstrap indicator-function arrays and the average
        number of degrees of freedom.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the average and std of the indicator function for the data.
    """
    indicator_list, mean_dof = bootstrapped_indicator_function_data

    # average over data and fits within the bootstrap samples
    mean_boot_vals = np.array([np.mean(boot_val) for boot_val in indicator_list])

    # take bootstrap expectation and variance
    mean_xi = np.mean(mean_boot_vals)
    std_xi = np.std(mean_boot_vals)

    records = [dict(data="full data", mean_dof=mean_dof, mean_xi=mean_xi, std_xi=std_xi)]

    df = pd.DataFrame.from_records(
        records, index="data", columns=("data", "mean_dof", "mean_xi", "std_xi")
    )

    df.columns = ["mean_dof", "mean_xi", "std_xi"]
    return df
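
As a sanity check, a sketch under the assumption that the normalized deltas are standard normal: the bootstrap average xi reported by this table should approach the 1-sigma Gaussian coverage, erf(1/sqrt(2)) ≈ 0.683 (the sample counts and shapes below are arbitrary):

import numpy as np
from scipy import special

rng = np.random.default_rng(1)
# fake indicator output for 100 bootstrap samples of standard-normal deltas
indicator_list = [(np.abs(rng.standard_normal((5, 25))) < 1).astype(int) for _ in range(100)]
mean_xi = np.mean([np.mean(boot_val) for boot_val in indicator_list])
print(mean_xi, special.erf(1 / np.sqrt(2)))  # both close to 0.683
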
114 changes: 63 additions & 51 deletions validphys2/src/validphys/closuretest/multiclosure.py
@@ -7,6 +7,7 @@
``multiclosure_output.py``
"""

import numpy as np
import scipy.linalg as la
import scipy.special as special
@@ -116,9 +117,7 @@ def internal_multiclosure_data_loader(

@check_multifit_replicas
def fits_dataset_bias_variance(
    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
):
"""For a single dataset, calculate the bias and variance for each fit
and return tuple (bias, variance, n_data), where bias and variance are
@@ -149,44 +148,42 @@ def fits_dataset_bias_variance(
variances.append(np.mean(calc_chi2(sqrtcov, diffs)))
return biases, np.asarray(variances), len(law_th)


@check_multifit_replicas
def fits_normed_dataset_central_delta(
    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
):
"""
For each fit calculate the difference between central expectation value and true val. Normalize this
value by the variance of the differences between replicas and central expectation value (different
for each fit but expected to vary only a little). Each observable central exp value is
for each fit but expected to vary only a little). Each observable central exp value is
expected to be gaussianly distributed around the true value set by the fakepdf
"""
closures_th, law_th, exp_cov, sqrtcov = internal_multiclosure_dataset_loader
    # The dimensions here are (fit, data point, replica)
reps = np.asarray([th.error_members[:, :_internal_max_reps] for th in closures_th])
# One could mask here some reps in order to avoid redundancy of information
    # TODO

n_data = len(law_th)
n_fits = np.shape(reps)[0]
deltas = []
# There are n_fits pdf_covariances
# flag to see whether to eliminate dataset
for i in range(n_fits):
        bias_diffs = np.mean(reps[i], axis=1) - law_th.central_value
# bias diffs in the for loop should have dim = (n_obs)
        sigmas = np.sqrt(np.var(reps[i], axis=1))
# var diffs has shape (n_obs,reps)
        bias_diffs = bias_diffs / sigmas

deltas.append(bias_diffs.tolist())
# biases.shape = (n_fits, n_obs_cut/uncut)
# variances.shape = (n_fits, n_obs_cut/uncut, reps)
return np.asarray(deltas)
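
A toy illustration of the normalization performed above (all shapes and the zero truth vector are made up for the example; only numpy is assumed):

import numpy as np

rng = np.random.default_rng(2)
n_fits, n_data, n_reps = 3, 4, 50
truth = np.zeros(n_data)
# replica predictions for each fit, scattered around the truth
reps = truth[None, :, None] + rng.standard_normal((n_fits, n_data, n_reps))
# normalized distance of each fit's central prediction from the truth
deltas = (reps.mean(axis=2) - truth) / reps.std(axis=2)
# deltas.shape == (3, 4), one normalized delta per fit and data point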


fits_datasets_bias_variance = collect("fits_dataset_bias_variance", ("data",))


def expected_dataset_bias_variance(fits_dataset_bias_variance):
@@ -200,9 +197,7 @@ def expected_dataset_bias_variance(fits_dataset_bias_variance):

@check_multifit_replicas
def fits_data_bias_variance(
    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
):
"""Like `fits_dataset_bias_variance` but for all data"""
return fits_dataset_bias_variance(
@@ -350,11 +345,7 @@ def n_replica_samples(fits_pdf, _internal_max_reps=None, _internal_min_reps=20):
_internal_max_reps.
"""
    return list(
        range(_internal_min_reps, _internal_max_reps + SAMPLING_INTERVAL, SAMPLING_INTERVAL)
    )


@@ -370,13 +361,7 @@ def __init__(self, data):


def _bootstrap_multiclosure_fits(
    internal_multiclosure_dataset_loader, rng, n_fit_max, n_fit, n_rep_max, n_rep, use_repeats
):
"""Perform a single bootstrap resample of the multiclosure fits and return
a proxy of the base internal object used by relevant estimator actions
@@ -907,9 +892,7 @@ def total_bootstrap_xi(experiments_bootstrap_xi):


def dataset_fits_bias_replicas_variance_samples(
    internal_multiclosure_dataset_loader, _internal_max_reps=None, _internal_min_reps=20
):
"""For a single dataset, calculate the samples of chi2-quantities which
are used to calculate the bias and variance for each fit. The output of this
@@ -950,14 +933,10 @@ def dataset_inputs_fits_bias_replicas_variance_samples(


def dataset_inputs_fits_bias_replicas_variance_samples(
    internal_multiclosure_data_loader, _internal_max_reps=None, _internal_min_reps=20
):
    # forward the replica limits rather than hard-coding their defaults
    return dataset_fits_bias_replicas_variance_samples(
        internal_multiclosure_data_loader, _internal_max_reps, _internal_min_reps
    )


@@ -966,9 +945,13 @@ def dataset_inputs_fits_bias_replicas_variance_samples(
)


def xq2_dataset_map(
    commondata,
    cuts,
    internal_multiclosure_dataset_loader,
    _internal_max_reps=None,
    _internal_min_reps=20,
):
"""
    Load into a dictionary all the specs of a dataset, namely:
- ds name
@@ -980,17 +963,46 @@ def xq2_dataset_map(commondata, cuts,internal_multiclosure_dataset_loader,
xq2_map_obj = xq2map_with_cuts(commondata, cuts)
coords = xq2_map_obj[2]
central_deltas = fits_normed_dataset_central_delta(internal_multiclosure_dataset_loader)
    # std_devs = np.std(central_deltas, axis=0)
    std_devs = np.sqrt(np.mean(central_deltas**2, axis=0))
    means = np.mean(central_deltas, axis=0)
    xi = dataset_xi(dataset_replica_and_central_diff(internal_multiclosure_dataset_loader, False))

# for case of DY observables we have 2 (x,Q) for each experimental point
if coords[0].shape[0] != std_devs.shape[0]:
        std_devs = np.concatenate((std_devs, std_devs))
        means = np.concatenate((means, means))
        xi = np.concatenate((xi, xi))
    return {
        'x_coords': coords[0],
        'Q_coords': coords[1],
        'std_devs': std_devs,
        'name': commondata.name,
        'process': commondata.process_type,
        'means': means,
        'xi': xi,
    }


xq2_data_map = collect("xq2_dataset_map", ("data",))


def standard_indicator_function(standard_variable, nsigma=1):
    """
    Calculate the indicator function for a standardised variable.

    Parameters
    ----------
    standard_variable: np.array
        array of variables that have been standardised: (x - mu) / sigma
    nsigma: float
        number of standard deviations to consider

    Returns
    -------
    np.array
        array of ones and zeros: 1 if the variable lies within nsigma standard
        deviations of the mean, 0 otherwise.
    """
    return np.array(abs(standard_variable) < nsigma, dtype=int)
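
A quick usage sketch (the input values are illustrative; for standard-normal input the mean of the indicator approaches the 1-sigma coverage of roughly 0.683):

import numpy as np

print(standard_indicator_function(np.array([-0.5, 0.9, 2.0])))  # [1 1 0]

rng = np.random.default_rng(3)
print(standard_indicator_function(rng.standard_normal(100_000)).mean())  # ~0.683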
