From 69dfc83f91c16d13e1160c22fec7c075c4d6428f Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 21:47:23 +0200 Subject: [PATCH 01/17] Add cross-validation-based bandwidth selection --- arviz/stats/density_utils.py | 64 +++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 47b30db3a8..93505b2340 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -4,7 +4,7 @@ import numpy as np from scipy.fftpack import fft -from scipy.optimize import brentq +from scipy.optimize import brentq, minimize_scalar from scipy.signal import convolve, convolve2d from scipy.signal.windows import gaussian from scipy.sparse import coo_matrix @@ -34,6 +34,66 @@ def _bw_silverman(x, x_std=None, **kwargs): # pylint: disable=unused-argument return bw +def _bw_oversmoothed(x, x_std=None, **kwargs): # pylint: disable=unused-argument + """Oversmoothed bandwidth estimation.""" + if x_std is None: + x_std = np.std(x) + bw = 1.144 * x_std * len(x) ** (-0.2) + return bw + + +def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwargs): + """Cross-validation bandwidth estimation.""" + if x_std is None: + x_std = np.std(x) + + if bin_width is None or grid_counts is None: + x_min = x.min() + x_max = x.max() + grid_len = 256 + grid_min = x_min - 0.5 * x_std + grid_max = x_max + 0.5 * x_std + grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max)) + bin_width = grid_edges[1] - grid_edges[0] + + x_len = len(x) + grid_len = len(grid_counts) + ks = np.arange(1, grid_len) + + # entry j is the sum over i of grid_counts[i] * grid_counts[i + j + 1] + grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-2::-1] + + def _compute_cv_score(bw): + if unbiased: + summand = np.exp(ks * -((0.5 * bin_width / bw) ** 2)) + summand -= np.sqrt(8) * summand**2 + else: + deltas = ks * (bin_width / bw) + summand = (deltas**4 - 12 * deltas**2 + 12) * np.exp(-0.25 * deltas**2) / 64 + score = (0.5 / x_len + bin_width**2 * np.inner(grid_counts_comb, summand)) / ( + bw * np.sqrt(np.pi) + ) + return score + + bw_max = _bw_oversmoothed(x, x_std=x_std) + bw_min = bin_width / (2 * np.pi) + + result = minimize_scalar(_compute_cv_score, bounds=(bw_min, bw_max), method="bounded") + if not result.success: + warnings.warn("Optimizing the bandwidth using cross-validation did not converge.") + bw_opt = result.x + + return bw_opt + + +def _bw_ucv(x, **kwargs): + return _bw_cv(x, unbiased=True, **kwargs) + + +def _bw_bcv(x, **kwargs): + return _bw_cv(x, unbiased=False, **kwargs) + + def _bw_isj(x, grid_counts=None, x_std=None, x_range=None): """Improved Sheather-Jones bandwidth estimation. @@ -111,6 +171,8 @@ def _bw_taylor(x): "silverman": _bw_silverman, "isj": _bw_isj, "experimental": _bw_experimental, + "ucv": _bw_ucv, + "bcv": _bw_bcv, } From 28317806683fbf848e44f8007806fa78c9a38779 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 21:48:43 +0200 Subject: [PATCH 02/17] Pass bin width to bandwidth functions --- arviz/stats/density_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 93505b2340..954e73cc85 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -94,7 +94,7 @@ def _bw_bcv(x, **kwargs): return _bw_cv(x, unbiased=False, **kwargs) -def _bw_isj(x, grid_counts=None, x_std=None, x_range=None): +def _bw_isj(x, grid_counts=None, x_std=None, x_range=None, **kwargs): # pylint: disable=unused-argument """Improved Sheather-Jones bandwidth estimation. Improved Sheather and Jones method as explained in [1]_. This method is used internally by the @@ -136,7 +136,7 @@ def _bw_isj(x, grid_counts=None, x_std=None, x_range=None): return h -def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None): +def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None, **kwargs): # pylint: disable=unused-argument """Experimental bandwidth estimator.""" bw_silverman = _bw_silverman(x, x_std=x_std) bw_isj = _bw_isj(x, grid_counts=grid_counts, x_range=x_range) @@ -176,7 +176,7 @@ def _bw_taylor(x): } -def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None): +def _get_bw(x, bw, grid_counts=None, bin_width=None, x_std=None, x_range=None): """Compute bandwidth for a given data `x` and `bw`. Also checks `bw` is correctly specified. @@ -217,7 +217,7 @@ def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None): ) bw_fun = _BW_METHODS_LINEAR[bw_lower] - bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range) + bw = bw_fun(x, grid_counts=grid_counts, bin_width=bin_width, x_std=x_std, x_range=x_range) else: raise ValueError( "Unrecognized `bw` argument.\n" @@ -642,9 +642,10 @@ def _kde_linear( x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction ) grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max)) + bin_width = grid_edges[1] - grid_edges[0] # Bandwidth estimation - bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range) + bw = bw_fct * _get_bw(x, bw, grid_counts, bin_width, x_std, x_range) # Density estimation if adaptive: From b7b1df0d2f2cca355186ac3a8c4e4665d0462194 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 22:08:27 +0200 Subject: [PATCH 03/17] Make function more modular --- arviz/stats/density_utils.py | 44 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 954e73cc85..b59f0ad0c0 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -57,28 +57,15 @@ def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwa bin_width = grid_edges[1] - grid_edges[0] x_len = len(x) - grid_len = len(grid_counts) - ks = np.arange(1, grid_len) - - # entry j is the sum over i of grid_counts[i] * grid_counts[i + j + 1] - grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-2::-1] - - def _compute_cv_score(bw): - if unbiased: - summand = np.exp(ks * -((0.5 * bin_width / bw) ** 2)) - summand -= np.sqrt(8) * summand**2 - else: - deltas = ks * (bin_width / bw) - summand = (deltas**4 - 12 * deltas**2 + 12) * np.exp(-0.25 * deltas**2) / 64 - score = (0.5 / x_len + bin_width**2 * np.inner(grid_counts_comb, summand)) / ( - bw * np.sqrt(np.pi) - ) - return score + grid_counts_comb, ks = _prepare_cv_score_inputs(grid_counts) bw_max = _bw_oversmoothed(x, x_std=x_std) bw_min = bin_width / (2 * np.pi) - result = minimize_scalar(_compute_cv_score, bounds=(bw_min, bw_max), method="bounded") + def _compute_score(bw): + return _compute_cv_score(bw, x_len, grid_counts_comb, bin_width, ks, unbiased) + + result = minimize_scalar(_compute_score, bounds=(bw_min, bw_max), method="bounded") if not result.success: warnings.warn("Optimizing the bandwidth using cross-validation did not converge.") bw_opt = result.x @@ -86,6 +73,27 @@ def _compute_cv_score(bw): return bw_opt +def _prepare_cv_score_inputs(grid_counts): + grid_len = len(grid_counts) + # entry j is the sum over i of grid_counts[i] * grid_counts[i + j + 1] + grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-2::-1] + ks = np.arange(1, grid_len) + return grid_counts_comb, ks + + +def _compute_cv_score(bw, x_len, grid_counts_comb, bin_width, ks, unbiased): + if unbiased: + summand = np.exp(ks * -((0.5 * bin_width / bw) ** 2)) + summand -= np.sqrt(8) * summand**2 + else: + deltas = ks * (bin_width / bw) + summand = (deltas**4 - 12 * deltas**2 + 12) * np.exp(-0.25 * deltas**2) / 64 + score = (0.5 / x_len + bin_width**2 * np.inner(grid_counts_comb, summand)) / ( + bw * np.sqrt(np.pi) + ) + return score + + def _bw_ucv(x, **kwargs): return _bw_cv(x, unbiased=True, **kwargs) From d2eae902f93ad6412bcba90dc74f9a64e492f292 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 22:08:44 +0200 Subject: [PATCH 04/17] Document cv bandwidth methods --- arviz/stats/density_utils.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index b59f0ad0c0..eecb14ec3c 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -95,10 +95,36 @@ def _compute_cv_score(bw, x_len, grid_counts_comb, bin_width, ks, unbiased): def _bw_ucv(x, **kwargs): + """Unbiased cross-validation bandwidth estimation. + + This method optimizes the bandwidth to minimize the integrated squared error of the kernel + density estimate as explained in [1]_. This implementation has been modified to operate on + binned data, which is more efficient. + + References + ---------- + .. [1] Multivariate Density Estimation: Theory, Practice, and Visualization. + D. Scott. + Wiley, 2015. + Section 6.5.1.3 + """ return _bw_cv(x, unbiased=True, **kwargs) def _bw_bcv(x, **kwargs): + """Biased cross-validation bandwidth estimation. + + This method optimizes the bandwidth to minimize the adjusted mean integrated squared error of + the kernel density estimate as explained in [1]_. This implementation has been modified to + operate on binned data, which is more efficient. + + References + ---------- + .. [1] Multivariate Density Estimation: Theory, Practice, and Visualization. + D. Scott. + Wiley, 2015. + Section 6.5.1.3 + """ return _bw_cv(x, unbiased=False, **kwargs) From 2ea80f11bb3824039f123b8ddc166d4bbf8c2cd5 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 22:12:22 +0200 Subject: [PATCH 05/17] Add example using UCV bandwidth --- arviz/stats/density_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index eecb14ec3c..463186d6b5 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -546,6 +546,17 @@ def kde(x, circular=False, **kwargs): >>> grid, pdf = kde(rvs, bound_correction=False, custom_lims=(0, 11)) >>> plt.plot(grid, pdf) + Density estimation for well-separated modes with bandwidth chosen using unbiased + cross-validation + + .. plot:: + :context: close-figs + + >>> rvs = np.concatenate([np.random.normal(0, 1, 500), np.random.normal(30, 1, 500)]) + >>> grid, pdf = kde(rvs, bw='ucv') + >>> plt.plot(grid, pdf) + + Default density estimation for circular data .. plot:: From 276479d81d497b0fa8fcdedc88e6f724f2170685 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Thu, 19 Sep 2024 22:14:12 +0200 Subject: [PATCH 06/17] Document that ucv and bcv are acceptable values for bw --- arviz/plots/kdeplot.py | 4 ++-- arviz/stats/density_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arviz/plots/kdeplot.py b/arviz/plots/kdeplot.py index 522f192053..b2fd8c85d2 100644 --- a/arviz/plots/kdeplot.py +++ b/arviz/plots/kdeplot.py @@ -58,8 +58,8 @@ def plot_kde( bw : float or str, optional If numeric, indicates the bandwidth and must be positive. If str, indicates the method to estimate the bandwidth and must be - one of "scott", "silverman", "isj" or "experimental" when ``is_circular`` is False - and "taylor" (for now) when ``is_circular`` is True. + one of "scott", "silverman", "isj", "experimental", "ucv", or "bcv" when ``is_circular`` is + False and "taylor" (for now) when ``is_circular`` is True. Defaults to "default" which means "experimental" when variable is not circular and "taylor" when it is. adaptive : bool, default False diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 463186d6b5..04d9a0b9ef 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -632,7 +632,7 @@ def _kde_linear( bw: int, float or str, optional If numeric, indicates the bandwidth and must be positive. If str, indicates the method to estimate the bandwidth and must be one of "scott", - "silverman", "isj" or "experimental". Defaults to "experimental". + "silverman", "isj", "experimental", "ucv", or "bcv". Defaults to "experimental". adaptive: boolean, optional Indicates if the bandwidth is adaptive or not. It is the recommended approach when there are multiple modes with different spread. From 0207f1481be7db344236290dbfb43449e5c56c98 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 10:22:53 +0200 Subject: [PATCH 07/17] Fix bugs in cv score computation --- arviz/stats/density_utils.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 04d9a0b9ef..160c189efb 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -53,17 +53,17 @@ def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwa grid_len = 256 grid_min = x_min - 0.5 * x_std grid_max = x_max + 0.5 * x_std - grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max)) + grid_counts, grid_edges = np.histogram(x, bins=grid_len, range=(grid_min, grid_max)) bin_width = grid_edges[1] - grid_edges[0] x_len = len(x) - grid_counts_comb, ks = _prepare_cv_score_inputs(grid_counts) + grid_counts_comb, ks = _prepare_cv_score_inputs(grid_counts, x_len) bw_max = _bw_oversmoothed(x, x_std=x_std) bw_min = bin_width / (2 * np.pi) def _compute_score(bw): - return _compute_cv_score(bw, x_len, grid_counts_comb, bin_width, ks, unbiased) + return _compute_cv_score(bw, x_len, bin_width, unbiased, grid_counts_comb, ks) result = minimize_scalar(_compute_score, bounds=(bw_min, bw_max), method="bounded") if not result.success: @@ -73,23 +73,24 @@ def _compute_score(bw): return bw_opt -def _prepare_cv_score_inputs(grid_counts): +def _prepare_cv_score_inputs(grid_counts, x_len): grid_len = len(grid_counts) - # entry j is the sum over i of grid_counts[i] * grid_counts[i + j + 1] - grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-2::-1] - ks = np.arange(1, grid_len) + # entry j is the sum over i of grid_counts[i] * grid_counts[i + j] + grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-1::-1] + # correct for within-bin counts + grid_counts_comb[0] = 0.5 * (grid_counts_comb[0] - x_len) + ks = np.arange(0, grid_len) return grid_counts_comb, ks -def _compute_cv_score(bw, x_len, grid_counts_comb, bin_width, ks, unbiased): +def _compute_cv_score(bw, x_len, bin_width, unbiased, grid_counts_comb, ks): + deltas = ks * (bin_width / bw) if unbiased: - summand = np.exp(ks * -((0.5 * bin_width / bw) ** 2)) - summand -= np.sqrt(8) * summand**2 + summand = np.exp(-0.25 * deltas**2) - np.sqrt(8) * np.exp(-0.5 * deltas**2) else: - deltas = ks * (bin_width / bw) summand = (deltas**4 - 12 * deltas**2 + 12) * np.exp(-0.25 * deltas**2) / 64 - score = (0.5 / x_len + bin_width**2 * np.inner(grid_counts_comb, summand)) / ( - bw * np.sqrt(np.pi) + score = (0.5 + np.inner(grid_counts_comb, summand) / x_len) / ( + x_len * bw * np.sqrt(np.pi) ) return score From ca30ec7aae4f2a6c31607282efa82ae04493416b Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 11:29:24 +0200 Subject: [PATCH 08/17] Use histogram utility function --- arviz/stats/density_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index 160c189efb..ede7ef63da 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -53,7 +53,7 @@ def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwa grid_len = 256 grid_min = x_min - 0.5 * x_std grid_max = x_max + 0.5 * x_std - grid_counts, grid_edges = np.histogram(x, bins=grid_len, range=(grid_min, grid_max)) + grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max)) bin_width = grid_edges[1] - grid_edges[0] x_len = len(x) From c2515da8bc96cd80b96e07d471029e52182a7755 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 11:29:47 +0200 Subject: [PATCH 09/17] Add tests for density utils --- .../base_tests/test_stats_density_utils.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 arviz/tests/base_tests/test_stats_density_utils.py diff --git a/arviz/tests/base_tests/test_stats_density_utils.py b/arviz/tests/base_tests/test_stats_density_utils.py new file mode 100644 index 0000000000..66cb8b901d --- /dev/null +++ b/arviz/tests/base_tests/test_stats_density_utils.py @@ -0,0 +1,70 @@ +import pytest + +import numpy as np +import scipy.stats +from ...stats.density_utils import ( + _prepare_cv_score_inputs, + _compute_cv_score, + _bw_cv, + _bw_oversmoothed, + _bw_scott, + histogram, + kde, +) + + +def compute_cv_score_explicit(bw, x, unbiased): + """Explicit computation of the CV score for a 1D dataset.""" + n = len(x) + score = 0.0 + for i in range(n): + for j in range(i + 1, n): + delta = (x[i] - x[j]) / bw + if unbiased: + score += np.exp(-0.25 * delta**2) - np.sqrt(8) * np.exp(-0.5 * delta**2) + else: + score += (delta**4 - 12 * delta**2 + 12) * np.exp(-0.25 * delta**2) + if not unbiased: + score /= 64 + score = 0.5 / n / bw / np.sqrt(np.pi) + score / n**2 / bw / np.sqrt(np.pi) + return score + + +@pytest.mark.parametrize("unbiased", [True, False]) +@pytest.mark.parametrize("bw", [0.1, 0.5, 2.0]) +@pytest.mark.parametrize("n", [100, 1_000]) +def test_compute_cv_score(bw, unbiased, n, seed=42): + """Test that the histogram-based CV score matches the explicit CV score.""" + rng = np.random.default_rng(seed) + x = rng.normal(size=n) + x_std = x.std() + grid_counts, grid_edges = np.histogram( + x, bins=100, range=(x.min() - 0.5 * x_std, x.max() + 0.5 * x_std) + ) + bin_width = grid_edges[1] - grid_edges[0] + grid = grid_edges[:-1] + 0.5 * bin_width + + # if data is discretized to regularly-spaced bins, then explicit CV score should match + # the histogram-based CV score + x_discrete = np.repeat(grid, grid_counts) + rng.shuffle(x_discrete) + score_inputs = _prepare_cv_score_inputs(grid_counts, n) + score = _compute_cv_score(bw, n, bin_width, unbiased, *score_inputs) + score_explicit = compute_cv_score_explicit(bw, x_discrete, unbiased) + assert np.isclose(score, score_explicit) + + +@pytest.mark.parametrize("unbiased", [True, False]) +def test_bw_cv_normal(unbiased, seed=42, bins=512, n=100_000): + """Test that for normal target, selected CV bandwidth converges to known optimum.""" + rng = np.random.default_rng(seed) + x = rng.normal(size=n) + x_std = x.std() + grid_counts, grid_edges = np.histogram( + x, bins=bins, range=(x.min() - 0.5 * x_std, x.max() + 0.5 * x_std) + ) + bin_width = grid_edges[1] - grid_edges[0] + bw = _bw_cv(x, unbiased=unbiased, bin_width=bin_width, grid_counts=grid_counts) + assert bw > bin_width / (2 * np.pi) + assert bw < _bw_oversmoothed(x) + assert np.isclose(bw, _bw_scott(x), rtol=0.2) From d26eb17b29252fe45930e8171aac833c843ef0b4 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 11:31:56 +0200 Subject: [PATCH 10/17] Reorganizing existing histogram test with other density utils tests --- arviz/tests/base_tests/test_stats_density_utils.py | 9 +++++++++ arviz/tests/base_tests/test_stats_utils.py | 10 ---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arviz/tests/base_tests/test_stats_density_utils.py b/arviz/tests/base_tests/test_stats_density_utils.py index 66cb8b901d..2c6c2dfd70 100644 --- a/arviz/tests/base_tests/test_stats_density_utils.py +++ b/arviz/tests/base_tests/test_stats_density_utils.py @@ -30,6 +30,15 @@ def compute_cv_score_explicit(bw, x, unbiased): return score +def test_histogram(): + school = load_arviz_data("non_centered_eight").posterior["mu"].values + k_count_az, k_dens_az, _ = histogram(school, bins=np.asarray([-np.inf, 0.5, 0.7, 1, np.inf])) + k_dens_np, *_ = np.histogram(school, bins=[-np.inf, 0.5, 0.7, 1, np.inf], density=True) + k_count_np, *_ = np.histogram(school, bins=[-np.inf, 0.5, 0.7, 1, np.inf], density=False) + assert np.allclose(k_count_az, k_count_np) + assert np.allclose(k_dens_az, k_dens_np) + + @pytest.mark.parametrize("unbiased", [True, False]) @pytest.mark.parametrize("bw", [0.1, 0.5, 2.0]) @pytest.mark.parametrize("n", [100, 1_000]) diff --git a/arviz/tests/base_tests/test_stats_utils.py b/arviz/tests/base_tests/test_stats_utils.py index b16169c55b..83b599407a 100644 --- a/arviz/tests/base_tests/test_stats_utils.py +++ b/arviz/tests/base_tests/test_stats_utils.py @@ -8,7 +8,6 @@ from scipy.stats import circstd from ...data import from_dict, load_arviz_data -from ...stats.density_utils import histogram from ...stats.stats_utils import ( ELPDData, _angle, @@ -343,15 +342,6 @@ def test_variance_bad_data(): assert not np.allclose(stats_variance_2d(data), np.var(data, ddof=1)) -def test_histogram(): - school = load_arviz_data("non_centered_eight").posterior["mu"].values - k_count_az, k_dens_az, _ = histogram(school, bins=np.asarray([-np.inf, 0.5, 0.7, 1, np.inf])) - k_dens_np, *_ = np.histogram(school, bins=[-np.inf, 0.5, 0.7, 1, np.inf], density=True) - k_count_np, *_ = np.histogram(school, bins=[-np.inf, 0.5, 0.7, 1, np.inf], density=False) - assert np.allclose(k_count_az, k_count_np) - assert np.allclose(k_dens_az, k_dens_np) - - def test_sqrt(): x = np.random.rand(100) y = np.random.rand(100) From 76ac63c9950c314335eb98b4b0fc7f8929d1869e Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 17:58:23 +0200 Subject: [PATCH 11/17] Make corrections to docstrings --- arviz/stats/density_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index ede7ef63da..fc24b8ad74 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -98,7 +98,7 @@ def _compute_cv_score(bw, x_len, bin_width, unbiased, grid_counts_comb, ks): def _bw_ucv(x, **kwargs): """Unbiased cross-validation bandwidth estimation. - This method optimizes the bandwidth to minimize the integrated squared error of the kernel + This method optimizes the bandwidth to minimize the mean integrated squared error of the kernel density estimate as explained in [1]_. This implementation has been modified to operate on binned data, which is more efficient. @@ -115,7 +115,7 @@ def _bw_ucv(x, **kwargs): def _bw_bcv(x, **kwargs): """Biased cross-validation bandwidth estimation. - This method optimizes the bandwidth to minimize the adjusted mean integrated squared error of + This method optimizes the bandwidth to minimize the asymptotic mean integrated squared error of the kernel density estimate as explained in [1]_. This implementation has been modified to operate on binned data, which is more efficient. From 1332338e03878ff922c3387b5e330e34196b8aea Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 17:59:51 +0200 Subject: [PATCH 12/17] Add missing test import --- arviz/tests/base_tests/test_stats_density_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arviz/tests/base_tests/test_stats_density_utils.py b/arviz/tests/base_tests/test_stats_density_utils.py index 2c6c2dfd70..a141d3c0d0 100644 --- a/arviz/tests/base_tests/test_stats_density_utils.py +++ b/arviz/tests/base_tests/test_stats_density_utils.py @@ -2,6 +2,7 @@ import numpy as np import scipy.stats +from ...data import load_arviz_data from ...stats.density_utils import ( _prepare_cv_score_inputs, _compute_cv_score, From 3588378e63351b3686aeacee7b63af52bda12787 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 18:00:44 +0200 Subject: [PATCH 13/17] Fix pylint error --- arviz/stats/density_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index fc24b8ad74..c43978f28d 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -42,7 +42,7 @@ def _bw_oversmoothed(x, x_std=None, **kwargs): # pylint: disable=unused-argumen return bw -def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwargs): +def _bw_cv(x, unbiased=True, bin_width=None, grid_counts=None, x_std=None, **kwargs): # pylint: disable=unused-argument """Cross-validation bandwidth estimation.""" if x_std is None: x_std = np.std(x) From 485b38af8ccec070479f4525a7fdbc5c0912175f Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 20:24:00 +0200 Subject: [PATCH 14/17] Remove unused imports --- arviz/tests/base_tests/test_stats_density_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/arviz/tests/base_tests/test_stats_density_utils.py b/arviz/tests/base_tests/test_stats_density_utils.py index a141d3c0d0..88093249e3 100644 --- a/arviz/tests/base_tests/test_stats_density_utils.py +++ b/arviz/tests/base_tests/test_stats_density_utils.py @@ -1,7 +1,6 @@ import pytest import numpy as np -import scipy.stats from ...data import load_arviz_data from ...stats.density_utils import ( _prepare_cv_score_inputs, @@ -10,7 +9,6 @@ _bw_oversmoothed, _bw_scott, histogram, - kde, ) From 51626c04916fa38bf7a36f9259ce2659d4079aaa Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 21:15:11 +0200 Subject: [PATCH 15/17] Disable pylint checks --- arviz/stats/density_utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index c43978f28d..cb96566420 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -83,7 +83,7 @@ def _prepare_cv_score_inputs(grid_counts, x_len): return grid_counts_comb, ks -def _compute_cv_score(bw, x_len, bin_width, unbiased, grid_counts_comb, ks): +def _compute_cv_score(bw, x_len, bin_width, unbiased, grid_counts_comb, ks): # pylint: disable=too-many-positional-arguments deltas = ks * (bin_width / bw) if unbiased: summand = np.exp(-0.25 * deltas**2) - np.sqrt(8) * np.exp(-0.5 * deltas**2) @@ -211,7 +211,7 @@ def _bw_taylor(x): } -def _get_bw(x, bw, grid_counts=None, bin_width=None, x_std=None, x_range=None): +def _get_bw(x, bw, grid_counts=None, bin_width=None, x_std=None, x_range=None): # pylint: disable=too-many-positional-arguments """Compute bandwidth for a given data `x` and `bw`. Also checks `bw` is correctly specified. @@ -418,7 +418,7 @@ def _check_custom_lims(custom_lims, x_min, x_max): def _get_grid( x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False -): +): # pylint: disable=too-many-positional-arguments """Compute the grid that bins the data used to estimate the density function. Parameters @@ -607,7 +607,7 @@ def kde(x, circular=False, **kwargs): return kde_fun(x, **kwargs) -def _kde_linear( +def _kde_linear( # pylint: disable=too-many-positional-arguments x, bw="experimental", adaptive=False, @@ -708,7 +708,7 @@ def _kde_linear( return grid, pdf -def _kde_circular( +def _kde_circular( # pylint: disable=too-many-positional-arguments x, bw="taylor", bw_fct=1, @@ -798,7 +798,7 @@ def _kde_circular( # pylint: disable=unused-argument -def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): +def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): # pylint: disable=too-many-positional-arguments """Kernel density with convolution. One dimensional Gaussian kernel density estimation via convolution of the binned relative @@ -832,7 +832,7 @@ def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, return grid, pdf -def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): +def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): # pylint: disable=too-many-positional-arguments """Compute Adaptive Kernel Density Estimation. One dimensional adaptive Gaussian kernel density estimation. The implementation uses the binning From 8947d60b830baf5bf76af2f3c28ce4db6b62fdd2 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Fri, 20 Sep 2024 21:16:55 +0200 Subject: [PATCH 16/17] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ddfa4e5dae..884e8fea72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New features - Add optimized simultaneous ECDF confidence bands ([2368](https://github.com/arviz-devs/arviz/pull/2368)) - Add support for setting groups with `idata[group]` ([2374](https://github.com/arviz-devs/arviz/pull/2374)) +- Add cross-validation-based KDE bandwidth selection methods. ([2384](https://github.com/arviz-devs/arviz/pull/2384)) ### Maintenance and fixes From c85b86d113d8b7b53fb8efe6344867de8c4ced20 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Sat, 21 Sep 2024 19:07:01 +0200 Subject: [PATCH 17/17] Simplify code by using correlate --- arviz/stats/density_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arviz/stats/density_utils.py b/arviz/stats/density_utils.py index cb96566420..663b6057e4 100644 --- a/arviz/stats/density_utils.py +++ b/arviz/stats/density_utils.py @@ -5,7 +5,7 @@ import numpy as np from scipy.fftpack import fft from scipy.optimize import brentq, minimize_scalar -from scipy.signal import convolve, convolve2d +from scipy.signal import convolve, convolve2d, correlate from scipy.signal.windows import gaussian from scipy.sparse import coo_matrix from scipy.special import ive # pylint: disable=no-name-in-module @@ -76,7 +76,7 @@ def _compute_score(bw): def _prepare_cv_score_inputs(grid_counts, x_len): grid_len = len(grid_counts) # entry j is the sum over i of grid_counts[i] * grid_counts[i + j] - grid_counts_comb = convolve(grid_counts[:-1], grid_counts[:0:-1], mode="full")[grid_len-1::-1] + grid_counts_comb = correlate(grid_counts[1:], grid_counts[:-1], mode="full")[-grid_len:] # correct for within-bin counts grid_counts_comb[0] = 0.5 * (grid_counts_comb[0] - x_len) ks = np.arange(0, grid_len)