Skip to content

Commit

Permalink
rewrote functions using new variance definition (covmat computed by a…
Browse files Browse the repository at this point in the history
…veraging over covmats of all fits) and removed sklearn dependency
  • Loading branch information
comane committed Apr 24, 2024
1 parent f86e171 commit aee641c
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 87 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ pandas = "*"
numpy = "*"
validobj = "*"
prompt_toolkit = "*"
scikit-learn = "^1.4.1"
frozendict = "*" # validphys: needed for caching of data loading
# Reportengine needs to be installed from git
reportengine = { git = "https://github.com/NNPDF/reportengine" }
Expand Down
1 change: 0 additions & 1 deletion validphys2/examples/pca_bias_variance_report.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ template_text: |
## L2 condition number
{@plot_l2_condition_number@}
{@bootstrapped_bias_distribution@}
actions_:
- report(main=true)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"""

import numpy as np
from sklearn.decomposition import PCA
import dataclasses

from validphys import covmats
from validphys.calcutils import calc_chi2
Expand All @@ -18,87 +18,68 @@
from reportengine import collect


def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
@dataclasses.dataclass(frozen=True)
class PCAInternalMulticlosureLoader:
"""
Compute the principal components of theory predictions replica matrix
(Ndat x Nrep feature matrix).
Parameters
----------
dataset: (DataSetSpec, DataGroupSpec)
dataset for which the theory predictions and t0 covariance matrix
will be loaded. Note that due to the structure of `validphys` this
function can be overloaded to accept a DataGroupSpec.
closures_th: list
list containing validphys.results.ThPredictionsResult objects
for each fit
fits_pdf: list
list of PDF objects produced from performing multiple closure tests
fits. Each fit should have a different filterseed but the same
underlying law used to generate the pseudodata.
law_th: ThPredictionsResult object
underlying law theory predictions
variancepdf: validphys.core.PDF
PDF object used to estimate the variance of the fits.
pc_basis: np.array
basis of principal components
explained_variance_ratio: float, default is 0.93
n_comp: int
number of principal components kept after regularisation
Returns
-------
tuple
3D tuple:
- matrix of the principal components (PCs) of shape (N_pc, N_dat)
- reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
- N_pc: number of principal components kept
covmat_pca: np.array
regularised covariance matrix computed from replicas
of theory predictions
sqrt_covmat_pca: np.array
cholesky decomposed covariance matrix
"""
# get replicas from variance fit, used to estimate variance
reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members

# something that could be tested: rescale feature matrix
# reps_scaled = reps.preprocessing.scale(reps)

# choose number of principal components (PCs) based on explained variance ratio
n_comp = 1
for _ in range(reps.shape[0]):
pca = PCA(n_comp).fit(reps.T)
if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
break
n_comp += 1
closures_th: list
law_th: ThPredictionsResult
pc_basis: np.array
n_comp: int
covmat_pca: np.array
sqrt_covmat_pca: np.array

# project feature matrix onto PCs
pc_reps = pca.components_ @ reps

return pca.components_, pc_reps, n_comp


def principal_components_bias_variance_dataset(
def internal_multiclosure_dataset_loader_pca(
internal_multiclosure_dataset_loader, explained_variance_ratio=0.99
):
"""
Compute the bias and variance for one datasets
using the principal component reduced covariance matrix.
Similar to multiclosure.internal_multiclosure_dataset_loader but returns
PCA regularised covariance matrix, where the covariance matrix has been computed
from the replicas of the theory predictions.
Parameters
----------
internal_multiclosure_dataset_loader : tuple
Tuple containing the results of multiclosure fits
internal_multiclosure_dataset_loader: tuple
closure fits theory predictions,
underlying law theory predictions,
covariance matrix,
sqrt covariance matrix
principal_components_dataset : tuple
3D tuple containing the principal components of the theory predictions
explained_variance_ratio: float, default is 0.99
Returns
-------
tuple
3D tuple:
- biases: 1-D array of shape (Nfits,)
- variances: 1-D array of shape (Nfits, )
- n_comp: number of principal components kept
PCAInternalMulticlosureLoader
"""

closures_th, law_th, _, _ = internal_multiclosure_dataset_loader

reps = np.asarray([th.error_members for th in closures_th])

# compute the covariance matrix of the theory predictions for each fit
_covmats = [np.cov(rep, rowvar=True) for rep in reps]
_covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]

# compute the mean covariance matrix
_covmat_mean = np.mean(_covmats, axis=0)
Expand All @@ -107,7 +88,14 @@ def principal_components_bias_variance_dataset(
# that explain the required variance

if _covmat_mean.shape == ():
return np.nan, np.nan, 1
return PCAInternalMulticlosureLoader(
closures_th=closures_th,
law_th=law_th,
pc_basis=1,
n_comp=1,
covmat_pca=_covmat_mean,
sqrt_covmat_pca=np.sqrt(_covmat_mean),
)

eighvals, eigvecs = np.linalg.eigh(_covmat_mean)
idx = np.argsort(eighvals)[::-1]
Expand All @@ -127,28 +115,80 @@ def principal_components_bias_variance_dataset(
pc_basis = eigvecs[:, :n_comp]

# compute the (PCA) regularized covariance matrix
covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis

if n_comp <= 1:
return np.nan, np.nan, n_comp
covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis

if n_comp == 1:
return PCAInternalMulticlosureLoader(
closures_th=closures_th,
law_th=law_th,
pc_basis=pc_basis,
n_comp=1,
covmat_pca=covmat_pca,
sqrt_covmat_pca=np.sqrt(covmat_pca),
)

# compute sqrt of pdf covariance matrix
sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca)

# compute bias diff and project it onto space spanned by PCs
delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
# shape here is (Npc, Nfits)
delta_bias = pc_basis.T @ delta_bias
return PCAInternalMulticlosureLoader(
closures_th=closures_th,
law_th=law_th,
pc_basis=pc_basis,
n_comp=n_comp,
covmat_pca=covmat_pca,
sqrt_covmat_pca=sqrt_covmat_pca,
)


def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
"""
Compute the bias and variance for one datasets
using the principal component reduced covariance matrix.
# compute biases, shape of biases is (Nfits)
biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
Parameters
----------
internal_multiclosure_dataset_loader : tuple
Tuple containing the results of multiclosure fits
variances = []
for i in range(reps.shape[0]):
diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
explained_variance_ratio : float, default is 0.99
3D tuple containing the principal components of the theory predictions
return biases, np.asarray(variances), n_comp
Returns
-------
tuple
3D tuple:
- biases: 1-D array of shape (Nfits,)
- variances: 1-D array of shape (Nfits, )
- n_comp: number of principal components kept
"""

pca_loader = internal_multiclosure_dataset_loader_pca

reps = np.asarray([th.error_members for th in pca_loader.closures_th])

# compute bias diff and project it onto space spanned by PCs
delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]

if pca_loader.n_comp == 1:
delta_bias = pca_loader.pc_basis * delta_bias
biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2
variances = []
for i in range(reps.shape[0]):
diffs = pca_loader.pc_basis * (
reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
)
variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))
else:
delta_bias = pca_loader.pc_basis.T @ delta_bias
biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias)
variances = []
for i in range(reps.shape[0]):
diffs = pca_loader.pc_basis.T @ (
reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
)
variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs)))

return biases, np.asarray(variances), pca_loader.n_comp


principal_components_bias_variance_datasets = collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from validphys import plotutils
from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
principal_components_dataset,
internal_multiclosure_dataset_loader_pca,
)


Expand Down Expand Up @@ -142,9 +142,14 @@ def plot_lambdavalues_bias_variance_values(
yield fig


internal_multiclosure_data_collected_loader = collect(
"internal_multiclosure_dataset_loader", ("data",)
)


@figuregen
def plot_l2_condition_number(
each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20
each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20
):
"""
Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
Expand All @@ -162,11 +167,9 @@ def plot_l2_condition_number(
each_dataset : list
List of datasets
fits_pdf: list
list of validphys.core.PDF objects
internal_multiclosure_data_loader: list
list of internal_multiclosure_dataset_loader objects
variancepdf: validphys.core.PDF
PDF object for the variance
Yields
------
Expand All @@ -177,25 +180,26 @@ def plot_l2_condition_number(
# Explained variance ratio range
evr_range = np.linspace(evr_min, evr_max, evr_n)

for ds in each_dataset:
for internal_multiclosure_dataset_loader, ds in zip(
internal_multiclosure_data_collected_loader, each_dataset
):
l2_cond = []
dof = []

for evr in evr_range:
_, pc_reps, n_comp = principal_components_dataset(
ds, fits_pdf, variancepdf, explained_variance_ratio=evr
)

covmat_pdf = np.cov(pc_reps)
pca_loader = internal_multiclosure_dataset_loader_pca(
internal_multiclosure_dataset_loader, explained_variance_ratio=evr
)

# if the number of principal components is 1 then the covariance matrix is a scalar
# and the condition number is not computed
if n_comp <= 1:
if pca_loader.n_comp == 1:
l2_cond.append(np.nan)
else:
l2_cond.append(np.linalg.cond(covmat_pdf))
l2_cond.append(np.linalg.cond(pca_loader.covmat_pca))

dof.append(n_comp)
dof.append(pca_loader.n_comp)

fig, ax1 = plotutils.subplots()
ax1.plot(evr_range, l2_cond, label="L2 Condition Number")
Expand Down

0 comments on commit aee641c

Please sign in to comment.