diff --git a/README.md b/README.md
index d7d0577..387c9e5 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,9 @@ or directly the `setup.py` file:
 # Examples
 
 See Vianello (2018) for details about these methods.
 
+NOTE: all functions accept either single values or arrays as inputs, even
+though the following examples deal with single values only.
+
 ## Ideal case
 
 Compute the significance of a measurement of 100 counts with an expected
 background of 90 in the ideal case of no uncertainty on the background:
@@ -32,7 +35,16 @@ background of 90 in the ideal case of no uncertainty on the background:
 from gv_significance import ideal_case
 
 ideal_case.significance(n=100, b=90)
+```
+Compute the counts that a source must generate in order to be detected
+at 5 sigma with an efficiency of 90% given a background of 100 counts:
+
+```python
+from gv_significance import ideal_case
+
+# Note: detection_efficiency must be either 0.5, 0.9 or 0.99
+ideal_case.five_sigma_threshold(B=100, detection_efficiency=0.9)
 ```
 
 ## Poisson observation, Poisson background with or without systematics
@@ -45,7 +57,15 @@ on-source (`alpha=0.1`):
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1)
+```
+Alternatively, one can use the $Z_{Bi}$ estimator described in Cousins et al. 2008 (but be
+careful about its range of usability, see Vianello 2018) by doing:
+
+```python
+from gv_significance import poisson_poisson
+
+poisson_poisson.z_bi_significance(n=100, b=900, alpha=0.1)
 ```
 
 If there is an additional systematic uncertainty on the background measurement of
@@ -55,7 +75,6 @@ at most 10%, then:
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1, k=0.1)
-
 ```
 
 If there is an additional systematic uncertainty on the background measurement
@@ -65,7 +84,6 @@ that is Gaussian-distributed with a sigma of 10%, then:
 from gv_significance import poisson_poisson
 
 poisson_poisson.significance(n=100, b=900, alpha=0.1, sigma=0.1)
-
 ```
 
 ## Poisson observation, Gaussian background
@@ -76,5 +94,5 @@ background estimation procedure that gives us an expected background of
 ```python
 from gv_significance import poisson_gaussian
 
-poisson_gaussian.significance(n=100, b=90, sigma=0.4)
+poisson_gaussian.significance(n=100, b=90, sigma=2.4)
 ```
\ No newline at end of file
diff --git a/gv_significance/poisson_gaussian.py b/gv_significance/poisson_gaussian.py
index 7a36730..283bb95 100644
--- a/gv_significance/poisson_gaussian.py
+++ b/gv_significance/poisson_gaussian.py
@@ -1,6 +1,8 @@
 import numpy as np
 from numpy import sqrt, squeeze
 from xlogy import xlogyv
+from z_bi import z_bi_vectorized
+from size_one_or_n import size_one_or_n
 
 
 def significance(n, b, sigma):
@@ -17,11 +19,13 @@ def significance(n, b, sigma):
 
     n_ = np.array(n, dtype=float, ndmin=1)
     b_ = np.array(b, dtype=float, ndmin=1)
 
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
     # Assign sign depending on whether n_ > b_
     sign = np.where(n_ >= b_, 1, -1)
 
-    B0_mle = 0.5 * (b_ - sigma ** 2 + sqrt(b_ ** 2 - 2 * b_ * sigma ** 2 + 4 * n_ * sigma ** 2 + sigma ** 4))
+    B0_mle = 0.5 * (b_ - sigma_ ** 2 + sqrt(b_ ** 2 - 2 * b_ * sigma_ ** 2 + 4 * n_ * sigma_ ** 2 + sigma_ ** 4))
 
     # B0_mle could be slightly < 0 (even though it shouldn't) because of the
     # limited numerical precision of the computation. Let's accept values as negative as -0.01, and clip
@@ -30,4 +34,33 @@ def significance(n, b, sigma):
 
     B0_mle = np.clip(B0_mle, 0, None)
 
-    return squeeze(sqrt(2) * sqrt(xlogyv(n_, n_ / B0_mle) + (b_ - B0_mle)**2 / (2 * sigma**2) + B0_mle - n_) * sign)
\ No newline at end of file
+    return squeeze(sqrt(2) * sqrt(xlogyv(n_, n_ / B0_mle) + (b_ - B0_mle)**2 / (2 * sigma_**2) + B0_mle - n_) * sign)
+
+
+def z_bi_significance(n, b, sigma):
+    """
+    Use the estimator Z_Bi from Cousins et al. 2008 to compute the significance
+
+    :param n: observed counts (can be an array)
+    :param b: expected background counts (can be an array)
+    :param sigma: error on the measurement of b (either a float, or an array of the same shape as n)
+    :return: the significance (z score) for the measurement(s)
+    """
+
+    n_ = np.array(n, dtype=float, ndmin=1)
+    b_ = np.array(b, dtype=float, ndmin=1)
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
+    res = np.zeros(n_.shape[0], dtype=float)
+
+    idx = (sigma_ != 0)
+
+    # When sigma is zero there is no computation possible
+    res[~idx] = np.nan
+
+    tau = b_[idx] / sigma_[idx]**2  # Eq. 8 of Cousins et al. 2008
+    n_off = b_[idx] * tau
+
+    res[idx] = z_bi_vectorized(n_[idx], n_off, 1.0 / np.abs(tau))
+
+    return np.squeeze(res)
diff --git a/gv_significance/poisson_poisson.py b/gv_significance/poisson_poisson.py
index 3437fe2..15f9121 100644
--- a/gv_significance/poisson_poisson.py
+++ b/gv_significance/poisson_poisson.py
@@ -1,14 +1,16 @@
 import numpy as np
-from xlogy import xlogy
+from xlogy import xlogy, xlogyv
 import scipy.optimize
+from z_bi import z_bi_vectorized
+from size_one_or_n import size_one_or_n
 
 
 def _li_and_ma(n_, b_, alpha):
 
     # In order to avoid numerical problem with 0 * log(0), we add a tiny number to n_ and b_, inconsequential
     # for the computation
-    n_ += 1e-15  # type: np.ndarray
-    b_ += 1e-15  # type: np.ndarray
+    n_ += 1E-25  # type: np.ndarray
+    b_ += 1E-25  # type: np.ndarray
 
     # Pre-compute for speed
     n_plus_b = n_ + b_
@@ -34,7 +36,7 @@ def _likelihood_with_sys(o, b, a, s, k, B, M):
 
     Ba = B * a
     Bak = B * a * k
 
-    res = -Bak - Ba - B - M + xlogy(b, B) - k ** 2 / (2 * s ** 2) + xlogy(o, Bak + Ba + M)
+    res = -Bak - Ba - B - M + xlogyv(b, B) - k ** 2 / (2 * s ** 2) + xlogyv(o, Bak + Ba + M)
 
     return res
@@ -68,6 +70,9 @@ def _get_TS_by_numerical_optimization(n_, b_, alpha, sigma):
 
     return TS
 
 
+_get_TS_by_numerical_optimization_v = np.vectorize(_get_TS_by_numerical_optimization)
+
+
 def significance(n, b, alpha, sigma=0, k=0):
     """
     Returns the significance for detecting n counts when alpha * B are expected.
@@ -89,10 +94,10 @@
 
     :param n: observed counts (can be an array)
     :param b: expected background counts (can be an array)
-    :param alpha: ratio of the source observation efficiency and background observation efficiency (must be the same for
-    all items in n and b)
-    :param sigma: standard deviation for the Gaussian case (must be the same for all items in n and b)
-    :param k: maximum fractional systematic uncertainty expected. (must be the same for all items in n and b)
+    :param alpha: ratio of the source observation efficiency and background observation efficiency
+    (either a float, or an array of the same shape as n)
+    :param sigma: standard deviation for the Gaussian case (either a float, or an array of the same shape as n)
+    :param k: maximum fractional systematic uncertainty expected (either a float, or an array of the same shape as n)
     :return: the significance (z score) for the measurement(s)
     """
@@ -102,27 +107,53 @@
 
     n_ = np.array(n, dtype=float, ndmin=1)
     b_ = np.array(b, dtype=float, ndmin=1)
 
+    k_ = size_one_or_n(k, n_, "k")
+
+    sigma_ = size_one_or_n(sigma, n_, "sigma")
+
+    alpha_ = size_one_or_n(alpha, n_, "alpha")
+
     # Assign sign depending on whether n_ > b_
-    sign = np.where(n_ >= alpha * b_, 1, -1)
+    sign = np.where(n_ >= alpha_ * b_, 1, -1)
+
+    # Prepare vector for results
+    res = np.zeros(n_.shape[0], dtype=float)
+
+    # Select elements where we need to apply Li & Ma and apply it
+    idx_lima = (sigma_ == 0) & (k_ == 0)
+
+    res[idx_lima] = _li_and_ma(n_[idx_lima], b_[idx_lima], alpha_[idx_lima])
 
-    if sigma == 0 and k == 0:
+    # Select elements where we need to apply eq. 7 from Vianello (2018),
+    # which is simply Li & Ma with alpha -> alpha * (k+1)
 
-        # Li & Ma
-        return np.squeeze(sign * _li_and_ma(n_, b_, alpha))
+    idx_eq7 = (k_ > 0)
 
+    res[idx_eq7] = _li_and_ma(n_[idx_eq7], b_[idx_eq7], alpha_[idx_eq7] * (k_[idx_eq7] + 1))
 
-    if k > 0:
+    # Select elements where we need to apply eq. 9 from Vianello (2018)
+    idx_eq9 = (sigma_ > 0)
 
-        # Need to use eq. 7 from Vianello (2018), which is simply Li & Ma with alpha -> alpha * (k+1)
-        return np.squeeze(sign * _li_and_ma(n_, b_, alpha * (k+1)))
+    if np.any(idx_eq9):
 
-    if sigma > 0:
+        TS = _get_TS_by_numerical_optimization_v(n_[idx_eq9], b_[idx_eq9], alpha_[idx_eq9], sigma_[idx_eq9])
 
-        # Need to use eq. 9 from Vianello (2018)
+        res[idx_eq9] = np.sqrt(TS)
 
-    # The computation is not vectorized because there is an optimization involved, so we can only loop...
-    TS = np.array([_get_TS_by_numerical_optimization(n_[i], b_[i], alpha, sigma) for i in range(n_.shape[0])])
+    # Return significance
 
-    # Return significance
+    return np.squeeze(sign * res)
+
+
+def z_bi_significance(n, b, alpha):
+    """
+    Use the estimator Z_Bi from Cousins et al. 2008 to compute the significance
+
+    :param n: observed counts (can be an array)
+    :param b: expected background counts (can be an array)
+    :param alpha: ratio of the source observation efficiency and background observation efficiency (either a float,
+    or an array of the same shape as n)
+    :return: the significance (z score) for the measurement(s)
+    """
 
-    return np.squeeze(sign * np.sqrt(TS))
\ No newline at end of file
+    return z_bi_vectorized(n, b, alpha)
diff --git a/gv_significance/significance_from_pvalue.py b/gv_significance/significance_from_pvalue.py
index 60a4742..a7db387 100644
--- a/gv_significance/significance_from_pvalue.py
+++ b/gv_significance/significance_from_pvalue.py
@@ -20,4 +20,6 @@ def significance_from_pvalue(pvalue):
 
         raise ArithmeticError("One or more pvalues are too small for a significance computation.")
 
+    # This is equivalent to sqrt(2) * erfinv(1 - 2 * pvalue) but faster
+
     return -scipy.special.ndtri(pvalue)
\ No newline at end of file
diff --git a/gv_significance/size_one_or_n.py b/gv_significance/size_one_or_n.py
new file mode 100644
index 0000000..ee1574e
--- /dev/null
+++ b/gv_significance/size_one_or_n.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+
+def size_one_or_n(value, other_array, name):
+
+    value_ = np.array(value, dtype=float, ndmin=1)
+
+    if value_.shape[0] == 1:
+
+        value_ = np.zeros(other_array.shape[0], dtype=float) + value
+
+    else:
+
+        assert value_.shape[0] == other_array.shape[0], "The size of %s must be either 1 or the same size as n" % name
+
+    return value_
diff --git a/gv_significance/test_formulae.py b/gv_significance/test_formulae.py
new file mode 100644
index 0000000..9cda7c0
--- /dev/null
+++ b/gv_significance/test_formulae.py
@@ -0,0 +1,50 @@
+import pytest
+import numpy as np
+
+from poisson_poisson import significance as poi_poi_significance
+from poisson_poisson import z_bi_significance as poi_poi_zbi
+
+from poisson_gaussian import significance as poi_gau_significance
+from poisson_gaussian import z_bi_significance as poi_gau_zbi
+
+from ideal_case import significance as ideal_significance
+
+
+def test_poi_poi_significance():
+
+    # Test with one number
+    s = poi_poi_significance(20, 80, 0.1, k=0, sigma=0)
+
+    s = poi_poi_significance(20, 80, 0.1, k=0.1, sigma=0)
+
+    s = poi_poi_significance(20, 80, 0.1, k=0, sigma=0.1)
+
+    assert np.isclose(poi_poi_zbi(140, 100, 1 / 1.2), 3.93, rtol=0.01)
+
+    # Test with arrays
+    s = poi_poi_significance([20, 30], [80, 90], 0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2])
+
+    s = poi_poi_significance([20, 30], [80, 90], 0.1, k=0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2], k=[0.1, 0.1])
+
+    s = poi_poi_significance([20, 30], [80, 90], 0.1, sigma=0.1)
+    s = poi_poi_significance([20, 30], [80, 90], [0.1, 0.2], sigma=[0.1, 0.1])
+
+    assert np.allclose(poi_poi_zbi([140, 140], [100, 100], [1 / 1.2, 1 / 1.2]), [3.93, 3.93], rtol=0.01)
+
+    # Test mixed
+    s = poi_poi_significance([20, 30, 40], [80, 90, 100], alpha=[0.1, 0.1, 0.1],
+                             k=[0, 0.1, 0], sigma=[0, 0, 10])
+
+
+def test_poi_gau_significance():
+
+    # Test with one number
+    s = poi_gau_significance(120, 80, 5.3)
+
+    assert np.isclose(poi_gau_zbi(140, 83.33, 8.333), 3.93, rtol=0.01)
+
+    # Test with vectors
+    s = poi_gau_significance([120, 130], [80, 90], [5.3, 5.3])
+
+    assert np.allclose(poi_gau_zbi([140, 140], [83.33, 83.33], [8.333, 8.333]), [3.93, 3.93], rtol=0.01)
diff --git a/gv_significance/xlogy.py b/gv_significance/xlogy.py
index c874192..a0c39b6 100644
--- a/gv_significance/xlogy.py
+++ b/gv_significance/xlogy.py
@@ -1,6 +1,7 @@
 from math import log
 import numpy as np
 
+
 def xlogy(x, y):
     """
     This function implements x * log(y) so that if both x and y are 0, the results is zero (and not infinite or nan
@@ -33,6 +34,6 @@ def xlogyv(x, y):
 
     idx = (x != 0)
 
-    results[idx] = x[idx] * log(y[idx])
+    results[idx] = x[idx] * np.log(y[idx])
 
     return np.squeeze(results)
diff --git a/gv_significance/z_bi.py b/gv_significance/z_bi.py
new file mode 100644
index 0000000..fedaf1f
--- /dev/null
+++ b/gv_significance/z_bi.py
@@ -0,0 +1,70 @@
+import numpy as np
+from ncephes.cprob import incbet
+from numba import vectorize, float64
+from significance_from_pvalue import significance_from_pvalue
+
+
+# This decorator vectorizes the function for fast execution
+@vectorize([float64(float64, float64, float64)])
+def z_bi_cephes(n_on, n_off, alpha):
+
+    tau = 1.0 / alpha
+
+    aa = n_on
+    bb = n_off + 1
+    xx = 1.0 / (1+tau)
+
+    # Checks to avoid NaN in some cases
+    if aa <= 0.0 or bb <= 0.0:
+
+        return 0.0
+
+    if xx <= 0.0:
+
+        return 0.0
+
+    if xx >= 1.0:
+
+        return 1.0
+
+    # I use the incbet from cephes instead of the scipy.special.betainc function because the latter has numerical
+    # problems in some instances and returns NaNs, while the incbet from Cephes is more robust
+
+    P_Bi = incbet(aa, bb, xx)
+
+    return significance_from_pvalue(P_Bi)
+
+
+def z_bi_vectorized(n, b, alpha):
+    """
+    Use the estimator Z_Bi from Cousins et al. 2008 to compute the significance
+
+    :param n: observed counts (can be an array)
+    :param b: expected background counts (can be an array)
+    :param alpha: ratio of the source observation efficiency and background observation efficiency (either a float,
+    or an array of the same shape as n)
+    :return: the significance (z score) for the measurement(s)
+    """
+
+    n_ = np.array(n, dtype=float, ndmin=1)
+    b_ = np.array(b, dtype=float, ndmin=1)
+
+    assert n_.shape[0] == b_.shape[0], "n and b must have the same size"
+
+    alpha_ = np.array(alpha, dtype=float, ndmin=1)
+
+    if alpha_.shape[0] == 1:
+
+        alpha_ = np.zeros(n_.shape[0], dtype=float) + alpha_[0]
+
+    else:
+
+        assert alpha_.shape[0] == n_.shape[0], "Alpha must be either a scalar or an array of the same length as n"
+
+    # Assign sign depending on whether n_ >= alpha_ * b_
+
+    sign = np.where(n_ >= alpha_ * b_, 1, -1)
+
+    res = z_bi_cephes(n_, b_, alpha_)
+
+    return np.squeeze(sign * res)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 9142c7a..f60699d 100644
--- a/setup.py
+++ b/setup.py
@@ -10,5 +10,7 @@
     author_email='giacomov@stanford.edu',
     description='Implement the formulae from Vianello (2018)',
     install_requires=['scipy >= 0.18',
-                      'numpy']
+                      'numpy',
+                      'ncephes',
+                      'numba']
 )
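For a quick end-to-end check of the new `z_bi_significance` entry points, here is a minimal sketch (assuming the package is installed as in the README; the expected values of roughly 3.93 mirror the assertions in `test_formulae.py`, which come from the worked example in Cousins et al. 2008):

```python
from gv_significance import poisson_poisson, poisson_gaussian

# Z_Bi for an on/off Poisson measurement: 140 counts on-source, 100 counts
# off-source, with the off-source exposure 1.2 times the on-source one
print(poisson_poisson.z_bi_significance(n=140, b=100, alpha=1 / 1.2))  # ~3.93

# Z_Bi for a Poisson observation with a Gaussian background estimate
# (the same measurement expressed as b +/- sigma, as in the tests)
print(poisson_gaussian.z_bi_significance(n=140, b=83.33, sigma=8.333))  # ~3.93

# As with the other functions, arrays work too
print(poisson_poisson.z_bi_significance(n=[140, 150], b=[100, 100], alpha=1 / 1.2))
```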