Merge pull request #3 from Ciela-Institute/bootstrapping

Update to version 0.1
Ciela-Institute · Nov 21, 2023 · 8de0760 · 8de0760
2 parents ebbffce + ad55f6d
commit 8de0760
Show file tree

Hide file tree

Showing 6 changed files with 377 additions and 17 deletions.
diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tarp"
-version = "0.0.2"
+version = "0.1.0"
 authors = [
     {name = "Adam Coogan", email = "dr.adam.coogan@gmail.com"},
     {name = "Pablo Lemos", email = "plemos91@gmail.com"},

diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,5 @@ Sphinx==5.3.0
 sphinx_autodoc_typehints==1.22
 sphinx-pyproject==0.1.0
 torch==1.13.1
+tqdm==4.65.0
+deprecation==2.1.0
diff --git a/src/tarp/__init__.py b/src/tarp/__init__.py
@@ -1 +1 @@
-from .drp import *
+from .drp import get_drp_coverage, get_tarp_coverage
diff --git a/src/tarp/drp.py b/src/tarp/drp.py
@@ -1,18 +1,43 @@
 from typing import Tuple, Union
-
 import numpy as np
+from tqdm import tqdm
+import deprecation
+
 
-__all__ = ("get_drp_coverage",)
+__all__ = ("get_tarp_coverage", "get_drp_coverage")
 
 
+@deprecation.deprecated(
+    deprecated_in="0.1.0",
+    removed_in="0.2.0",
+    current_version="0.1.0",
+    details="Use get_tarp_coverage instead",
+)
 def get_drp_coverage(
     samples: np.ndarray,
     theta: np.ndarray,
     references: Union[str, np.ndarray] = "random",
     metric: str = "euclidean",
+) -> Tuple[np.ndarray, np.ndarray]:
+    return get_tarp_coverage(samples=samples,
+                             theta=theta,
+                             references=references,
+                             metric=metric,
+                             norm=True,
+                             bootstrap=False,
+                             seed=None)
+
+
+def _get_tarp_coverage_single(
+    samples: np.ndarray,
+    theta: np.ndarray,
+    references: Union[str, np.ndarray] = "random",
+    metric: str = "euclidean",
+    norm: bool = True,
+    seed: Union[int, None] = None
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
-    Estimates coverage with the distance to random point method.
+    Estimates coverage with the TARP method a single time.
 
     Reference: `Lemos, Coogan et al 2023 <https://arxiv.org/abs/2302.03026>`_
 
@@ -25,10 +50,13 @@ def get_drp_coverage(
             the parameter space.
         metric: the metric to use when computing the distance. Can be ``"euclidean"`` or
             ``"manhattan"``.
+        norm : whether to apply or not the normalization (Default = True)
+        seed: the seed to use for the random number generator. If ``None``, then no seed
 
     Returns:
-        Credibility values (``alpha``) and expected coverage probability (``ecp``).
+        Expected coverage probability (``ecp``) and credibility values (``alpha``) 
     """
+    np.random.seed(seed)
 
     # Check that shapes are correct
     if samples.ndim != 3:
@@ -51,10 +79,11 @@ def get_drp_coverage(
     theta = theta[np.newaxis, :, :]
 
     # Normalize
-    low = np.min(theta, axis=1, keepdims=True)
-    high = np.max(theta, axis=1, keepdims=True)
-    samples = (samples - low) / (high - low + 1e-10)
-    theta = (theta - low) / (high - low + 1e-10)
+    if norm:
+        low = np.min(theta, axis=1, keepdims=True)
+        high = np.max(theta, axis=1, keepdims=True)
+        samples = (samples - low) / (high - low + 1e-10)
+        theta = (theta - low) / (high - low + 1e-10)
 
     # Generate reference points
     if isinstance(references, str) and references == "random":
@@ -92,3 +121,94 @@ def get_drp_coverage(
     dx = alpha[1] - alpha[0]
     ecp = np.cumsum(h) * dx
     return ecp, alpha[1:]
+
+
+def _get_tarp_coverage_bootstrap(samples: np.ndarray,
+    theta: np.ndarray,
+    references: Union[str, np.ndarray] = "random",
+    metric: str = "euclidean",
+    norm: bool = True,
+    seed: Union[int, None] = None
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Estimates uncertainties on the expected probability and credibility values calculated with the
+    _get_tarp_coverage_single function using the bootstrapping method
+
+    Args:
+        samples: the samples to compute the coverage of, with shape ``(n_samples, n_sims, n_dims)``.
+        theta: the true parameter values for each samples, with shape ``(n_sims, n_dims)``.
+        references: the reference points to use for the DRP regions, with shape
+            ``(n_references, n_sims)``, or the string ``"random"``. If the later, then
+            the reference points are chosen randomly from the unit hypercube over
+            the parameter space.
+        metric: the metric to use when computing the distance. Can be ``"euclidean"`` or
+            ``"manhattan"``.
+        norm : whether to apply or not the normalization (Default = True)
+        seed: the seed to use for the random number generator. If ``None``, then no seed
+
+    Returns:
+        Mean and std of the expected coverage probability, and mean of the credibility
+    """
+    num_sims = samples.shape[1]
+
+    boot_ecp = np.empty(shape=(num_sims, num_sims//10))
+    boot_alpha = np.empty(shape=(num_sims, num_sims//10))
+    for i in tqdm(range(num_sims)):
+        idx_remove = np.random.randint(num_sims)
+        idx_add = np.random.randint(num_sims)
+
+        # Replacing one simulation and its samples by another and its associated samples
+        samples[:, idx_remove, :] = samples[:, idx_add, :] 
+        theta[idx_remove, :] = theta[idx_add, :]
+
+        boot_ecp[i, :], boot_alpha[i, :] = _get_tarp_coverage_single(samples,
+                                                                     theta,
+                                                                     references=references,
+                                                                     metric=metric,
+                                                                     norm=norm,
+                                                                     seed=seed)
+
+    # ecp_mean = boot_ecp.mean(axis=0)
+    # ecp_std = boot_ecp.std(axis=0)
+    # alpha_mean = boot_alpha.mean(axis=0)
+    # return ecp_mean, ecp_std, alpha_mean
+    return boot_ecp, boot_alpha
+
+
+def get_tarp_coverage(
+    samples: np.ndarray,
+    theta: np.ndarray,
+    references: Union[str, np.ndarray] = "random",
+    metric: str = "euclidean",
+    norm: bool = False,
+    bootstrap: bool = False,
+    seed: Union[int, None] = None
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Estimates coverage with the TARP method.
+
+    Reference: `Lemos, Coogan et al 2023 <https://arxiv.org/abs/2302.03026>`_
+
+    Args:
+        samples: the samples to compute the coverage of, with shape ``(n_samples, n_sims, n_dims)``.
+        theta: the true parameter values for each samples, with shape ``(n_sims, n_dims)``.
+        references: the reference points to use for the DRP regions, with shape
+            ``(n_references, n_sims)``, or the string ``"random"``. If the later, then
+            the reference points are chosen randomly from the unit hypercube over
+            the parameter space.
+        metric: the metric to use when computing the distance. Can be ``"euclidean"`` or
+            ``"manhattan"``.
+        norm : whether to apply or not the normalization (Default = False)
+        bootstrap : whether to use bootstrap to estimate uncertainties (Default = False)
+        seed: the seed to use for the random number generator. If ``None``, then no seed
+
+    Returns:
+        Expected coverage probability (``ecp``) and credibility values (``alpha``).
+        If bootstrap is True, both arrays have an extra dimension corresponding to the number of bootstrap iterations
+    """
+    if bootstrap:
+        ecp, alpha = _get_tarp_coverage_bootstrap(samples, theta, references, metric, norm, seed)
+    else:
+        ecp, alpha = _get_tarp_coverage_single(samples, theta, references, metric, norm, seed)
+    return ecp, alpha
+
diff --git a/tests/test_drp.py b/tests/test_drp.py
@@ -1,11 +1,11 @@
+import unittest
 import numpy as np
+from tarp import get_tarp_coverage
 
-from tarp import get_drp_coverage
 
-
-def test():
-    num_samples = 10
-    num_sims = 10
+def get_test_data():
+    num_samples = 100
+    num_sims = 100
     num_dims = 5
     theta = np.random.uniform(low=-5, high=5, size=(num_sims, num_dims))
     log_sigma = np.random.uniform(low=-5, high=-1, size=(num_sims, num_dims))
@@ -14,8 +14,43 @@ def test():
         loc=theta, scale=sigma, size=(num_samples, num_sims, num_dims)
     )
     theta = np.random.normal(loc=theta, scale=sigma, size=(num_sims, num_dims))
-    get_drp_coverage(samples, theta, references="random", metric="euclidean")
+    return samples, theta
+
+
+class TarpTest(unittest.TestCase):
+    def test_single(self):
+        samples, theta = get_test_data()
+        ecp, alpha = get_tarp_coverage(samples,
+                                       theta,
+                                       references="random",
+                                       metric="euclidean",
+                                       norm=False,
+                                       bootstrap=False)
+        self.assertAlmostEqual(np.max(np.abs(ecp - alpha)), 0., delta=0.25)
+
+    def test_norm(self):
+        samples, theta = get_test_data()
+        ecp, alpha = get_tarp_coverage(samples,
+                                       theta,
+                                       references="random",
+                                       metric="euclidean",
+                                       norm=True,
+                                       bootstrap=False)
+        self.assertAlmostEqual(np.max(np.abs(ecp - alpha)), 0., delta=0.25)
+
+    def test_bootstrap(self):
+        samples, theta = get_test_data()
+        ecp, alpha = get_tarp_coverage(samples,
+                                       theta,
+                                       references="random",
+                                       metric="euclidean",
+                                       norm=False,
+                                       bootstrap=True)
+        ecp_mean = np.mean(ecp, axis=0)
+        ecp_std = np.std(ecp, axis=0)
+        alpha = np.mean(alpha, axis=0)
+        self.assertAlmostEqual(np.max(np.abs(ecp_mean - alpha)/ecp_std), 0., delta=10.)
 
 
 if __name__ == "__main__":
-    test()
+    unittest.main()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from .drp import *
		from .drp import get_drp_coverage, get_tarp_coverage