rewrote functions using new variance definition (covmat computed by a…

…veraging over covmats of all fits) and removed sklearn dependency
NNPDF · Apr 24, 2024 · aee641c · aee641c
1 parent f86e171
commit aee641c
Show file tree

Hide file tree

Showing 4 changed files with 129 additions and 87 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,7 +68,6 @@ pandas = "*"
 numpy = "*"
 validobj = "*"
 prompt_toolkit = "*"
-scikit-learn = "^1.4.1"
 frozendict = "*"  # validphys: needed for caching of data loading
 # Reportengine needs to be installed from git
 reportengine = { git = "https://github.com/NNPDF/reportengine" }

diff --git a/validphys2/examples/pca_bias_variance_report.yaml b/validphys2/examples/pca_bias_variance_report.yaml
@@ -47,7 +47,6 @@ template_text: |
   ## L2 condition number
   {@plot_l2_condition_number@}
   
-  {@bootstrapped_bias_distribution@}
 
 actions_:
   - report(main=true)
diff --git a/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py b/validphys2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent.py
@@ -9,7 +9,7 @@
 """
 
 import numpy as np
-from sklearn.decomposition import PCA
+import dataclasses
 
 from validphys import covmats
 from validphys.calcutils import calc_chi2
@@ -18,87 +18,68 @@
 from reportengine import collect
 
 
-def principal_components_dataset(dataset, fits_pdf, variancepdf, explained_variance_ratio=0.99):
+@dataclasses.dataclass(frozen=True)
+class PCAInternalMulticlosureLoader:
     """
-    Compute the principal components of theory predictions replica matrix
-    (Ndat x Nrep feature matrix).
-
     Parameters
     ----------
-    dataset: (DataSetSpec, DataGroupSpec)
-        dataset for which the theory predictions and t0 covariance matrix
-        will be loaded. Note that due to the structure of `validphys` this
-        function can be overloaded to accept a DataGroupSpec.
+    closures_th: list
+        list containing validphys.results.ThPredictionsResult objects
+        for each fit
 
-    fits_pdf: list
-        list of PDF objects produced from performing multiple closure tests
-        fits. Each fit should have a different filterseed but the same
-        underlying law used to generate the pseudodata.
+    law_th: ThPredictionsResult object
+        underlying law theory predictions
 
-    variancepdf: validphys.core.PDF
-            PDF object used to estimate the variance of the fits.
+    pc_basis: np.array
+        basis of principal components
 
-    explained_variance_ratio: float, default is 0.93
+    n_comp: int
+        number of principal components kept after regularisation
 
-    Returns
-    -------
-    tuple
-        3D tuple:
-        - matrix of the principal components (PCs) of shape (N_pc, N_dat)
-        - reduced feature matrix, i.e., feature matrix projected onto PCs of shape (N_pc, N_rep)
-        - N_pc: number of principal components kept
+    covmat_pca: np.array
+        regularised covariance matrix computed from replicas
+        of theory predictions
 
+    sqrt_covmat_pca: np.array
+        cholesky decomposed covariance matrix
     """
-    # get replicas from variance fit, used to estimate variance
-    reps = ThPredictionsResult.from_convolution(variancepdf, dataset).error_members
-
-    # something that could be tested: rescale feature matrix
-    # reps_scaled = reps.preprocessing.scale(reps)
 
-    # choose number of principal components (PCs) based on explained variance ratio
-    n_comp = 1
-    for _ in range(reps.shape[0]):
-        pca = PCA(n_comp).fit(reps.T)
-        if np.sum(pca.explained_variance_ratio_) >= explained_variance_ratio:
-            break
-        n_comp += 1
+    closures_th: list
+    law_th: ThPredictionsResult
+    pc_basis: np.array
+    n_comp: int
+    covmat_pca: np.array
+    sqrt_covmat_pca: np.array
 
-    # project feature matrix onto PCs
-    pc_reps = pca.components_ @ reps
 
-    return pca.components_, pc_reps, n_comp
-
-
-def principal_components_bias_variance_dataset(
+def internal_multiclosure_dataset_loader_pca(
     internal_multiclosure_dataset_loader, explained_variance_ratio=0.99
 ):
     """
-    Compute the bias and variance for one datasets
-    using the principal component reduced covariance matrix.
+    Similar to multiclosure.internal_multiclosure_dataset_loader but returns
+    PCA regularised covariance matrix, where the covariance matrix has been computed
+    from the replicas of the theory predictions.
 
     Parameters
     ----------
-    internal_multiclosure_dataset_loader : tuple
-        Tuple containing the results of multiclosure fits
+    internal_multiclosure_dataset_loader: tuple
+        closure fits theory predictions,
+        underlying law theory predictions,
+        covariance matrix,
+        sqrt covariance matrix
 
-    principal_components_dataset : tuple
-        3D tuple containing the principal components of the theory predictions
+    explained_variance_ratio: float, default is 0.99
 
     Returns
     -------
-    tuple
-        3D tuple:
-        - biases: 1-D array of shape (Nfits,)
-        - variances: 1-D array of shape (Nfits, )
-        - n_comp: number of principal components kept
+    PCAInternalMulticlosureLoader
     """
-
     closures_th, law_th, _, _ = internal_multiclosure_dataset_loader
 
     reps = np.asarray([th.error_members for th in closures_th])
 
     # compute the covariance matrix of the theory predictions for each fit
-    _covmats = [np.cov(rep, rowvar=True) for rep in reps]
+    _covmats = [np.cov(rep, rowvar=True, bias=True) for rep in reps]
 
     # compute the mean covariance matrix
     _covmat_mean = np.mean(_covmats, axis=0)
@@ -107,7 +88,14 @@ def principal_components_bias_variance_dataset(
     # that explain the required variance
 
     if _covmat_mean.shape == ():
-        return np.nan, np.nan, 1
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=1,
+            n_comp=1,
+            covmat_pca=_covmat_mean,
+            sqrt_covmat_pca=np.sqrt(_covmat_mean),
+        )
 
     eighvals, eigvecs = np.linalg.eigh(_covmat_mean)
     idx = np.argsort(eighvals)[::-1]
@@ -127,28 +115,80 @@ def principal_components_bias_variance_dataset(
     pc_basis = eigvecs[:, :n_comp]
 
     # compute the (PCA) regularized covariance matrix
-    covmat_pdf = pc_basis.T @ _covmat_mean @ pc_basis
-
-    if n_comp <= 1:
-        return np.nan, np.nan, n_comp
+    covmat_pca = pc_basis.T @ _covmat_mean @ pc_basis
+
+    if n_comp == 1:
+        return PCAInternalMulticlosureLoader(
+            closures_th=closures_th,
+            law_th=law_th,
+            pc_basis=pc_basis,
+            n_comp=1,
+            covmat_pca=covmat_pca,
+            sqrt_covmat_pca=np.sqrt(covmat_pca),
+        )
 
     # compute sqrt of pdf covariance matrix
-    sqrt_covmat_pdf = covmats.sqrt_covmat(covmat_pdf)
+    sqrt_covmat_pca = covmats.sqrt_covmat(covmat_pca)
 
-    # compute bias diff and project it onto space spanned by PCs
-    delta_bias = reps.mean(axis=2).T - law_th.central_value[:, np.newaxis]
-    # shape here is (Npc, Nfits)
-    delta_bias = pc_basis.T @ delta_bias
+    return PCAInternalMulticlosureLoader(
+        closures_th=closures_th,
+        law_th=law_th,
+        pc_basis=pc_basis,
+        n_comp=n_comp,
+        covmat_pca=covmat_pca,
+        sqrt_covmat_pca=sqrt_covmat_pca,
+    )
+
+
+def principal_components_bias_variance_dataset(internal_multiclosure_dataset_loader_pca):
+    """
+    Compute the bias and variance for one datasets
+    using the principal component reduced covariance matrix.
 
-    # compute biases, shape of biases is (Nfits)
-    biases = calc_chi2(sqrt_covmat_pdf, delta_bias)
+    Parameters
+    ----------
+    internal_multiclosure_dataset_loader : tuple
+        Tuple containing the results of multiclosure fits
 
-    variances = []
-    for i in range(reps.shape[0]):
-        diffs = pc_basis.T @ (reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True))
-        variances.append(np.mean(calc_chi2(sqrt_covmat_pdf, diffs)))
+    explained_variance_ratio : float, default is 0.99
+        3D tuple containing the principal components of the theory predictions
 
-    return biases, np.asarray(variances), n_comp
+    Returns
+    -------
+    tuple
+        3D tuple:
+        - biases: 1-D array of shape (Nfits,)
+        - variances: 1-D array of shape (Nfits, )
+        - n_comp: number of principal components kept
+    """
+
+    pca_loader = internal_multiclosure_dataset_loader_pca
+
+    reps = np.asarray([th.error_members for th in pca_loader.closures_th])
+
+    # compute bias diff and project it onto space spanned by PCs
+    delta_bias = reps.mean(axis=2).T - pca_loader.law_th.central_value[:, np.newaxis]
+
+    if pca_loader.n_comp == 1:
+        delta_bias = pca_loader.pc_basis * delta_bias
+        biases = (delta_bias / pca_loader.sqrt_covmat_pca) ** 2
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis * (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean((diffs / pca_loader.sqrt_covmat_pca) ** 2))
+    else:
+        delta_bias = pca_loader.pc_basis.T @ delta_bias
+        biases = calc_chi2(pca_loader.sqrt_covmat_pca, delta_bias)
+        variances = []
+        for i in range(reps.shape[0]):
+            diffs = pca_loader.pc_basis.T @ (
+                reps[i, :, :] - reps[i, :, :].mean(axis=1, keepdims=True)
+            )
+            variances.append(np.mean(calc_chi2(pca_loader.sqrt_covmat_pca, diffs)))
+
+    return biases, np.asarray(variances), pca_loader.n_comp
 
 
 principal_components_bias_variance_datasets = collect(

diff --git a/...s2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py b/...s2/src/validphys/closuretest/inconsistent_closuretest/multiclosure_inconsistent_output.py
@@ -16,7 +16,7 @@
 
 from validphys import plotutils
 from validphys.closuretest.inconsistent_closuretest.multiclosure_inconsistent import (
-    principal_components_dataset,
+    internal_multiclosure_dataset_loader_pca,
 )
 
 
@@ -142,9 +142,14 @@ def plot_lambdavalues_bias_variance_values(
         yield fig
 
 
+internal_multiclosure_data_collected_loader = collect(
+    "internal_multiclosure_dataset_loader", ("data",)
+)
+
+
 @figuregen
 def plot_l2_condition_number(
-    each_dataset, fits_pdf, variancepdf, evr_min=0.90, evr_max=0.995, evr_n=20
+    each_dataset, internal_multiclosure_data_collected_loader, evr_min=0.90, evr_max=0.995, evr_n=20
 ):
     """
     Plot the L2 condition number of the covariance matrix as a function of the explained variance ratio.
@@ -162,11 +167,9 @@ def plot_l2_condition_number(
     each_dataset : list
         List of datasets
 
-    fits_pdf: list
-        list of validphys.core.PDF objects
+    internal_multiclosure_data_loader: list
+        list of internal_multiclosure_dataset_loader objects
 
-    variancepdf: validphys.core.PDF
-        PDF object for the variance
 
     Yields
     ------
@@ -177,25 +180,26 @@ def plot_l2_condition_number(
     # Explained variance ratio range
     evr_range = np.linspace(evr_min, evr_max, evr_n)
 
-    for ds in each_dataset:
+    for internal_multiclosure_dataset_loader, ds in zip(
+        internal_multiclosure_data_collected_loader, each_dataset
+    ):
         l2_cond = []
         dof = []
 
         for evr in evr_range:
-            _, pc_reps, n_comp = principal_components_dataset(
-                ds, fits_pdf, variancepdf, explained_variance_ratio=evr
-            )
 
-            covmat_pdf = np.cov(pc_reps)
+            pca_loader = internal_multiclosure_dataset_loader_pca(
+                internal_multiclosure_dataset_loader, explained_variance_ratio=evr
+            )
 
             # if the number of principal components is 1 then the covariance matrix is a scalar
             # and the condition number is not computed
-            if n_comp <= 1:
+            if pca_loader.n_comp == 1:
                 l2_cond.append(np.nan)
             else:
-                l2_cond.append(np.linalg.cond(covmat_pdf))
+                l2_cond.append(np.linalg.cond(pca_loader.covmat_pca))
 
-            dof.append(n_comp)
+            dof.append(pca_loader.n_comp)
 
         fig, ax1 = plotutils.subplots()
         ax1.plot(evr_range, l2_cond, label="L2 Condition Number")