Final touches

giacomov · Jan 23, 2018 · 39cda8f · 39cda8f
1 parent ba18e22
commit 39cda8f
Show file tree

Hide file tree

Showing 9 changed files with 251 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -24,6 +24,9 @@ or directly the `setup.py` file:
 # Examples
 See Vianello (2018) for details about these methods.
 
+NOTE: all functions accept either single values or arrays as inputs, even 
+though the following examples deal with single values only.
+
 ## Ideal case
 Compute the significance of a measurement of 100 counts with an expected 
 background of 90 in the ideal case of no uncertainty on the background:
@@ -32,7 +35,16 @@ background of 90 in the ideal case of no uncertainty on the background:
 from gv_significance import ideal_case
 
 ideal_case.significance(n=100, b=90)
+```
 
+Compute the the counts that a source must generate in order to be detected 
+at 5 sigma with an efficiency of 90% given a background of 100 counts:
+
+```python
+from gv_significance import ideal_case
+
+# Note: detection_efficiency must be either 0.5, 0.9 or 0.99
+ideal_case.five_sigma_threshold(B=100, detection_efficiency=0.9)
 ```
 
 ## Poisson observation, Poisson background with or without systematics
@@ -45,7 +57,15 @@ on-source (`alpha=0.1`):
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1)
+```
 
+Alternative, one can use the $Z_Bi$ estimator described in Cousins 2008 (but be 
+careful about the range of usability, see Vianello 2018) by doing:
+
+```python
+from gv_significance import poisson_poisson
+
+poisson_poisson.z_bi_significance(n=100, b=900, alpha=0.1)
 ```
 
 If there is an additional systematic uncertainty on the background measurement of 
@@ -55,7 +75,6 @@ at most 10%, then:
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1, k=0.1)
-
 ```
 
 If there is an additional systematic uncertainty on the background measurement
@@ -65,7 +84,6 @@ that is Gaussian-distributed with a sigma of 10%, then:
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1, sigma=0.1)
-
 ```
 
 ## Poisson observation, Gaussian background
@@ -76,5 +94,5 @@ background estimation procedure that gives us an expected background of
 ```python
 from gv_significance import poisson_gaussian
 
-poisson_gaussian.significance(n=100, b=90, sigma=0.4)
+poisson_gaussian.significance(n=100, b=90, sigma=2.4)
 ```
diff --git a/gv_significance/poisson_gaussian.py b/gv_significance/poisson_gaussian.py
@@ -1,6 +1,8 @@
 import numpy as np
 from numpy import sqrt, squeeze
 from xlogy import xlogyv
+from z_bi import z_bi_vectorized
+from size_one_or_n import size_one_or_n
 
 
 def significance(n, b, sigma):
@@ -17,11 +19,13 @@ def significance(n, b, sigma):
     n_ = np.array(n, dtype=float, ndmin=1)
     b_ = np.array(b, dtype=float, ndmin=1)
 
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
     # Assign sign depending on whether n_ > b_
 
     sign = np.where(n_ >= b_, 1, -1)
 
-    B0_mle = 0.5 * (b_ - sigma ** 2 + sqrt(b_ ** 2 - 2 * b_ * sigma ** 2 + 4 * n_ * sigma ** 2 + sigma ** 4))
+    B0_mle = 0.5 * (b_ - sigma_ ** 2 + sqrt(b_ ** 2 - 2 * b_ * sigma_ ** 2 + 4 * n_ * sigma_ ** 2 + sigma_ ** 4))
 
     # B0_mle could be slightly < 0 (even though it shouldn't) because of the
     # limited numerical precision of the calculator. let's accept as negative as 0.01, and clip
@@ -30,4 +34,33 @@ def significance(n, b, sigma):
 
     B0_mle = np.clip(B0_mle, 0, None)
 
-    return squeeze(sqrt(2) * sqrt(xlogyv(n_, n_ / B0_mle) + (b_ - B0_mle)**2 / (2 * sigma**2) + B0_mle - n_) * sign)
+    return squeeze(sqrt(2) * sqrt(xlogyv(n_, n_ / B0_mle) + (b_ - B0_mle)**2 / (2 * sigma_**2) + B0_mle - n_) * sign)
+
+
+def z_bi_significance(n, b, sigma):
+    """
+    Use the estimator Z_Bi from Cousins et al. 2008 to compute the significance
+
+    :param n: observed counts (can be an array)
+    :param b: expected background counts (can be an array)
+    :param sigma: error on the measurement of b
+    :return: the significance (z score) for the measurement(s)
+    """
+
+    n_ = np.array(n, dtype=float, ndmin=1)
+    b_ = np.array(b, dtype=float, ndmin=1)
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
+    res = np.zeros(n_.shape[0], dtype=float)
+
+    idx = (sigma_ != 0)
+
+    # When sigma is zero there is no computation possible
+    res[~idx] = np.nan
+
+    tau = b_[idx] / sigma_[idx]**2  # Eq. 8 of Cousins et al. 2008
+    n_off = b_[idx] * tau
+
+    res[idx] = z_bi_vectorized(n_[idx], n_off, 1.0 / np.abs(tau))
+
+    return np.squeeze(res)
diff --git a/gv_significance/poisson_poisson.py b/gv_significance/poisson_poisson.py
@@ -1,14 +1,16 @@
 import numpy as np
-from xlogy import xlogy
+from xlogy import xlogy, xlogyv
 import scipy.optimize
+from z_bi import z_bi_vectorized
+from size_one_or_n import size_one_or_n
 
 
 def _li_and_ma(n_, b_, alpha):
 
     # In order to avoid numerical problem with 0 * log(0), we add a tiny number to n_ and b_, inconsequential
     # for the computation
-    n_ += 1e-15  # type: np.ndarray
-    b_ += 1e-15  # type: np.ndarray
+    n_ += 1E-25  # type: np.ndarray
+    b_ += 1E-25  # type: np.ndarray
 
     # Pre-compute for speed
     n_plus_b = n_ + b_
@@ -34,7 +36,7 @@ def _likelihood_with_sys(o, b, a, s, k, B, M):
     Ba = B * a
     Bak = B * a * k
 
-    res = -Bak - Ba - B - M + xlogy(b, B) - k ** 2 / (2 * s ** 2) + xlogy(o, Bak + Ba + M)
+    res = -Bak - Ba - B - M + xlogyv(b, B) - k ** 2 / (2 * s ** 2) + xlogyv(o, Bak + Ba + M)
 
     return res
 
@@ -68,6 +70,9 @@ def _get_TS_by_numerical_optimization(n_, b_, alpha, sigma):
     return TS
 
 
+_get_TS_by_numerical_optimization_v = np.vectorize(_get_TS_by_numerical_optimization)
+
+
 def significance(n, b, alpha, sigma=0, k=0):
     """
     Returns the significance for detecting n counts when alpha * B are expected.
@@ -89,10 +94,10 @@ def significance(n, b, alpha, sigma=0, k=0):
 
     :param n: observed counts (can be an array)
     :param b: expected background counts (can be an array)
-    :param alpha: ratio of the source observation efficiency and background observation efficiency (must be the same for
-    all items in n and b)
-    :param sigma: standard deviation for the Gaussian case (must be the same for all items in n and b)
-    :param k: maximum fractional systematic uncertainty expected. (must be the same for all items in n and b)
+    :param alpha: ratio of the source observation efficiency and background observation efficiency
+    (either a float, or an array of the same shape of n)
+    :param sigma: standard deviation for the Gaussian case (either a float, or an array of the same shape of n)
+    :param k: maximum fractional systematic uncertainty expected (either a float, or an array of the same shape of n)
     :return: the significance (z score) for the measurement(s)
     """
 
@@ -102,27 +107,53 @@ def significance(n, b, alpha, sigma=0, k=0):
     n_ = np.array(n, dtype=float, ndmin=1)
     b_ = np.array(b, dtype=float, ndmin=1)
 
+    k_ = size_one_or_n(k, n_, "k")
+
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
+    alpha_ = size_one_or_n(alpha, n_, "alpha")
+
     # Assign sign depending on whether n_ > b_
 
-    sign = np.where(n_ >= alpha * b_, 1, -1)
+    sign = np.where(n_ >= alpha_ * b_, 1, -1)
+
+    # Prepare vector for results
+    res = np.zeros(n_.shape[0], dtype=float)
+
+    # Select elements where we need to apply Li & Ma and apply it
+    idx_lima = (sigma_ == 0) & (k_ == 0)
+
+    res[idx_lima] = _li_and_ma(n_[idx_lima], b_[idx_lima], alpha_[idx_lima])
 
-    if sigma == 0 and k == 0:
+    # Select elements where we need to apply eq. 7 from Vianello (2018),
+    # which is simply Li & Ma with alpha -> alpha * (k+1)
 
-        # Li & Ma
-        return np.squeeze(sign * _li_and_ma(n_, b_, alpha))
+    idx_eq7 = (k_ > 0)
+    res[idx_eq7] = _li_and_ma(n_[idx_eq7], b_[idx_eq7], alpha_[idx_eq7] * (k_[idx_eq7] + 1))
 
-    if k > 0:
+    # Select elements where we need to apply eq. 9 from Vianello (2018)
+    idx_eq9 = (sigma_ > 0)
 
-        # Need to use eq. 7 from Vianello (2018), which is simply Li & Ma with alpha -> alpha * (k+1)
-        return np.squeeze(sign * _li_and_ma(n_, b_, alpha * (k+1)))
+    if np.any(idx_eq9):
 
-    if sigma > 0:
+        TS = _get_TS_by_numerical_optimization_v(n_[idx_eq9], b_[idx_eq9], alpha_[idx_eq9], sigma_[idx_eq9])
 
-        # Need to use eq. 9 from Vianello (2018)
+        res[idx_eq9] = np.sqrt(TS)
 
-        # The computation is not vectorized because there is an optimization involved, so we can only loop...
-        TS = np.array([_get_TS_by_numerical_optimization(n_[i], b_[i], alpha, sigma) for i in range(n_.shape[0])])
+    # Return significance
 
-        # Return significance
+    return np.squeeze(sign * res)
+
+
+def z_bi_significance(n, b, alpha):
+    """
+    Use the estimator Z_Bi from Cousins et al. 2008 to compute the significance
+
+    :param n: observed counts (can be an array)
+    :param b: expected background counts (can be an array)
+    :param alpha: ratio of the source observation efficiency and background observation efficiency (must be the same for
+    all items in n and b)
+    :return: the significance (z score) for the measurement(s)
+    """
 
-        return np.squeeze(sign * np.sqrt(TS))
+    return z_bi_vectorized(n, b, alpha)
diff --git a/gv_significance/significance_from_pvalue.py b/gv_significance/significance_from_pvalue.py
@@ -20,4 +20,6 @@ def significance_from_pvalue(pvalue):
 
         raise ArithmeticError("One or more pvalues are too small for a significance computation.")
 
+    # This is equivalent to sqrt(2) * erfinv(1 - 2 * pvalue) but faster
+
     return -scipy.special.ndtri(pvalue)
diff --git a/gv_significance/size_one_or_n.py b/gv_significance/size_one_or_n.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+
+def size_one_or_n(value, other_array, name):
+
+    value_ = np.array(value, dtype=float, ndmin=1)
+
+    if value_.shape[0] == 1:
+
+        value_ = np.zeros(other_array.shape[0], dtype=float) + value
+
+    else:
+
+        assert value_.shape[0] == other_array.shape[0], "The size of %s must be either 1 or the same size of n" % name
+
+    return value_
diff --git a/gv_significance/test_formulae.py b/gv_significance/test_formulae.py
@@ -0,0 +1,50 @@
+import pytest
+import numpy as np
+
+from poisson_poisson import significance as poi_poi_significance
+from poisson_poisson import z_bi_significance as poi_poi_zbi
+
+from poisson_gaussian import significance as poi_gau_significance
+from poisson_gaussian import z_bi_significance as poi_gau_zbi
+
+from ideal_case import significance as ideal_significance
+
+
+def test_poi_poi_significance():
+
+    # Test with one number
+    s = poi_poi_significance(20, 80, 0.1, k=0, sigma=0)
+
+    s = poi_poi_significance(20, 80, 0.1, k=0.1, sigma=0)
+
+    s = poi_poi_significance(20, 80, 0.1, k=0, sigma=0.1)
+
+    assert np.isclose(poi_poi_zbi(140, 100, 1 / 1.2), 3.93, rtol=0.01)
+
+    # Test with arrays
+    s = poi_poi_significance([20, 30], [80, 90], 0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2])
+
+    s = poi_poi_significance([20, 30], [80, 90], 0.1, k=0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2], k=[0.1, 0.1])
+
+    s = poi_poi_significance([20, 30], [80, 90], 0.1, sigma=0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2], sigma=[0.1, 0.1])
+
+    assert np.allclose(poi_poi_zbi([140, 140], [100, 100], [1 / 1.2, 1/1.2]), [3.93, 3.93], rtol=0.01)
+
+    # Test mixed
+    s = poi_poi_significance([20, 30, 40], [80, 90, 100], alpha=[0.1, 0.1, 0.1],
+                             k=[0, 0.1, 0], sigma=[0, 0, 10])
+
+def test_poi_gau_significance():
+
+    # Test with one number
+    s = poi_gau_significance(120, 80, 5.3)
+
+    assert np.isclose(poi_gau_zbi(140, 83.33, 8.333), 3.93, rtol=0.01)
+
+    # Test with vectors
+    s = poi_gau_significance([120, 130], [80, 90], [5.3, 5.3])
+
+    assert np.allclose(poi_gau_zbi([140, 140], [83.33, 83.33], [8.333, 8.333]), [3.93, 3.93], rtol=0.01)
diff --git a/gv_significance/xlogy.py b/gv_significance/xlogy.py
@@ -1,6 +1,7 @@
 from math import log
 import numpy as np
 
+
 def xlogy(x, y):
     """
     This function implements x * log(y) so that if both x and y are 0, the results is zero (and not infinite or nan
@@ -33,6 +34,6 @@ def xlogyv(x, y):
 
     idx = (x != 0)
 
-    results[idx] = x[idx] * log(y[idx])
+    results[idx] = x[idx] * np.log(y[idx])
 
     return np.squeeze(results)