removed default value for bandwidth argument of KDE calibrator (#68)

* deprecated default bandwidth argument of KDE calibrator
* renamed DistanceFunctionTransformer to ComparisonFunctionTransformer
NetherlandsForensicInstitute · Jan 19, 2023 · 1f59d24
1 parent 08eb48f
commit 1f59d24
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 87 deletions.
diff --git a/lir/calibration.py b/lir/calibration.py
@@ -10,7 +10,7 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
 from sklearn.neighbors import KernelDensity
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, Iterable, Callable, Sized
 
 from .bayeserror import elub
 from .loss_functions import negative_log_likelihood_balanced
@@ -153,43 +153,51 @@ class KDECalibrator(BaseEstimator, TransformerMixin):
     two distributions. Uses kernel density estimation (KDE) for interpolation.
     """
 
-    def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optional[float]]]] = None):
+    def __init__(self, bandwidth: Union[Callable, str, float, Tuple[float, float]] = None):
         """
 
         :param bandwidth:
-            * If None is provided the Silverman's rule of thumb is
-            used to calculate the bandwidth for both distributions (independently)
-            * If a single float is provided this is used as the bandwith for both
-            distributions
-            * If a tuple is provided, the first entry is used for the bandwidth
-            of the first distribution (kde0) and the second entry for the second
-            distribution (if value is None: Silverman's rule of thumb is used)
+            * If bandwidth has a float value, this value is used as the bandwidth for both distributions.
+            * If bandwidth is a tuple, it should contain two floating point values: the bandwidth for the distribution
+              of the classes with labels 0 and 1, respectively.
+            * If bandwidth has the str value "silverman", Silverman's rule of thumb is used as the bandwidth for both
+              distributions separately.
+            * If bandwidth is callable, it should accept two arguments, `X` and `y`, and return a tuple of two values
+              which are the bandwidths for the two distributions.
         """
-        self.bandwidth: Tuple[Optional[float], Optional[float]] = self._parse_bandwidth(bandwidth)
+        if bandwidth is None:
+            warnings.warn("missing bandwidth argument for KDE, defaulting to silverman (default argument will be removed in the future)")
+            bandwidth = "silverman"
+        self.bandwidth: Callable = self._parse_bandwidth(bandwidth)
         self._kde0: Optional[KernelDensity] = None
         self._kde1: Optional[KernelDensity] = None
         self.numerator, self.denominator = None, None
 
     @staticmethod
-    def bandwidth_silverman(X):
+    def bandwidth_silverman(X, y):
         """
         Estimates the optimal bandwidth parameter using Silverman's rule of
         thumb.
         """
         assert len(X) > 0
 
-        std = np.std(X)
-        if std == 0:
-            # can happen eg if std(X) = 0
-            warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
-            LOG.info('found a silverman bandwidth of 0 (using dummy value)')
-            std = 1
+        bandwidth = []
+        for label in np.unique(y):
+            values = X[y == label]
+            std = np.std(values)
+            if std == 0:
+                # can happen eg if std(values) = 0
+                warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
+                LOG.info('found a silverman bandwidth of 0 (using dummy value)')
+                std = 1
 
-        v = math.pow(std, 5) / len(X) * 4. / 3
-        return math.pow(v, .2)
+            v = math.pow(std, 5) / len(values) * 4. / 3
+            bandwidth.append(math.pow(v, .2))
+
+        return bandwidth
 
     @staticmethod
-    def bandwidth_scott(X):
+    def bandwidth_scott(X, y):
         """
         Not implemented.
         """
@@ -207,8 +215,7 @@ def fit(self, X, y):
         X0 = X0.reshape(-1, 1)
         X1 = X1.reshape(-1, 1)
 
-        bandwidth0 = self.bandwidth[0] or self.bandwidth_silverman(X0)
-        bandwidth1 = self.bandwidth[1] or self.bandwidth_silverman(X1)
+        bandwidth0, bandwidth1 = self.bandwidth(X, y)
         self._kde0 = KernelDensity(kernel='gaussian', bandwidth=bandwidth0).fit(X0)
         self._kde1 = KernelDensity(kernel='gaussian', bandwidth=bandwidth1).fit(X1)
         return self
@@ -253,22 +260,28 @@ def transform(self, X):
         return np.float_power(10, LLRs_output)
 
     @staticmethod
-    def _parse_bandwidth(bandwidth: Optional[Union[float, Tuple[float, float]]]) \
-            -> Tuple[Optional[float], Optional[float]]:
+    def _parse_bandwidth(bandwidth: Union[Callable, float, Tuple[float, float]]) \
+            -> Callable:
         """
         Returns bandwidth as a tuple of two (optional) floats.
         Extrapolates a single bandwidth
         :param bandwidth: provided bandwidth
         :return: bandwidth used for kde0, bandwidth used for kde1
         """
-        if bandwidth is None:
-            return None, None
-        elif isinstance(bandwidth, float):
-            return bandwidth, bandwidth
-        elif len(bandwidth) == 2:
+        assert bandwidth is not None, "KDE requires a bandwidth argument"
+        if callable(bandwidth):
             return bandwidth
+        elif bandwidth == "silverman":
+            return KDECalibrator.bandwidth_silverman
+        elif bandwidth == "scott":
+            return KDECalibrator.bandwidth_scott
+        elif isinstance(bandwidth, str):
+            raise ValueError(f"invalid input for bandwidth: {bandwidth}")
+        elif isinstance(bandwidth, Sized):
+            assert len(bandwidth) == 2, f"bandwidth should have two elements; found {len(bandwidth)}; bandwidth = {bandwidth}"
+            return lambda X, y: bandwidth
         else:
-            raise ValueError('Invalid input for bandwidth')
+            return lambda X, y: (0+bandwidth, bandwidth)
 
 
 class KDECalibratorInProbabilityDomain(BaseEstimator, TransformerMixin):
@@ -277,7 +290,7 @@ class KDECalibratorInProbabilityDomain(BaseEstimator, TransformerMixin):
     two distributions. Uses kernel density estimation (KDE) for interpolation.
     """
 
-    def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optional[float]]]] = None):
+    def __init__(self, bandwidth: Union[Callable, str, float, Tuple[float, float]] = None):
         """
 
         :param bandwidth:
@@ -289,45 +302,21 @@ def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optio
             of the first distribution (kde0) and the second entry for the second
             distribution (if value is None: Silverman's rule of thumb is used)
         """
+
         warnings.warn(f"the class {type(self).__name__} will be removed in the future")
-        self.bandwidth: Tuple[Optional[float], Optional[float]] = \
-            self._parse_bandwidth(bandwidth)
+        if bandwidth is None:
+            warnings.warn("missing bandwidth argument for KDE, defaulting to 1 (default argument will be removed in the future)")
+            bandwidth = (1, 1)
+        self.bandwidth: Callable = KDECalibrator._parse_bandwidth(bandwidth)
         self._kde0: Optional[KernelDensity] = None
         self._kde1: Optional[KernelDensity] = None
 
-    @staticmethod
-    def bandwidth_silverman(X):
-        """
-        Estimates the optimal bandwidth parameter using Silverman's rule of
-        thumb.
-        """
-        assert len(X) > 0
-
-        std = np.std(X)
-        if std == 0:
-            # can happen eg if std(X) = 0
-            warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
-            LOG.info('found a silverman bandwidth of 0 (using dummy value)')
-            std = 1
-
-        v = math.pow(std, 5) / len(X) * 4. / 3
-        return math.pow(v, .2)
-
-    @staticmethod
-    def bandwidth_scott(X):
-        """
-        Not implemented.
-        """
-        raise
-
     def fit(self, X, y):
         X0, X1 = Xy_to_Xn(X, y)
         X0 = X0.reshape(-1, 1)
         X1 = X1.reshape(-1, 1)
 
-        bandwidth0 = self.bandwidth[0] or self.bandwidth_silverman(X0)
-        bandwidth1 = self.bandwidth[1] or self.bandwidth_silverman(X1)
-
+        bandwidth0, bandwidth1 = self.bandwidth(X, y)
         self._kde0 = KernelDensity(kernel='gaussian', bandwidth=bandwidth0).fit(X0)
         self._kde1 = KernelDensity(kernel='gaussian', bandwidth=bandwidth1).fit(X1)
         return self
@@ -342,24 +331,6 @@ def transform(self, X):
         with np.errstate(divide='ignore'):
             return self.p1 / self.p0
 
-    @staticmethod
-    def _parse_bandwidth(bandwidth: Optional[Union[float, Tuple[float, float]]]) \
-            -> Tuple[Optional[float], Optional[float]]:
-        """
-        Returns bandwidth as a tuple of two (optional) floats.
-        Extrapolates a single bandwidth
-        :param bandwidth: provided bandwidth
-        :return: bandwidth used for kde0, bandwidth used for kde1
-        """
-        if bandwidth is None:
-            return None, None
-        elif isinstance(bandwidth, float):
-            return bandwidth, bandwidth
-        elif len(bandwidth) == 2:
-            return bandwidth
-        else:
-            raise ValueError('Invalid input for bandwidth')
-
 
 class LogitCalibrator(BaseEstimator, TransformerMixin):
     """

diff --git a/lir/lr.py b/lir/lr.py
@@ -1,13 +1,12 @@
 import logging
-from typing import Callable, Optional
 
 import numpy as np
 import sklearn
 import sklearn.mixture
 from sklearn.pipeline import Pipeline
 
 from .metrics import calculate_lr_statistics, LrStats
-from .transformers import EstimatorTransformer, DistanceFunctionTransformer
+from .transformers import EstimatorTransformer, ComparisonFunctionTransformer
 
 from .util import Xn_to_Xy, LR
 
@@ -20,7 +19,7 @@ def _create_transformer(scorer):
     elif hasattr(scorer, "predict_proba"):
         return EstimatorTransformer(scorer)
     elif callable(scorer):
-        return DistanceFunctionTransformer(scorer)
+        return ComparisonFunctionTransformer(scorer)
     else:
         raise NotImplementedError("`scorer` argument must either be callable or implement at least one of `transform`, `predict_proba`")
 

diff --git a/lir/transformers.py b/lir/transformers.py
@@ -35,7 +35,7 @@ def __getattr__(self, item):
         return getattr(self.estimator, item)
 
 
-class DistanceFunctionTransformer(FunctionTransformer):
+class ComparisonFunctionTransformer(FunctionTransformer):
     """
     A wrapper for a distance function to make it behave like a transformer.
 

diff --git a/tests/test_calibration.py b/tests/test_calibration.py
@@ -90,7 +90,7 @@ def test_kde_calibrator(self):
         X = to_probability(X)
         X = to_log_odds(X)
         desired = [3.59562799e-02, 1.75942116e-11, 2.59633540e-12, 1.36799721e-12, 8.15673411e-03, 2.10030624e-02, 3.70456430e-05, 1.40710861e-18, 1.04459592e-10, 3.14589737e+03, 2.59568527e+02, 1.08519904e+02, 8.56459139e+01, 3.81243702e+00, 6.23873841e+01, 1.43844114e+02, 2.64913149e+02, 1.49097168e+05]
-        calibrator = KDECalibrator()
+        calibrator = KDECalibrator(bandwidth="silverman")
         calibrator.fit(X, y)
         lrs_cal = calibrator.transform(X)
         np.testing.assert_allclose(lrs_cal, desired)
@@ -100,7 +100,7 @@ def test_on_extreme_values(self):
         X = to_log_odds(X)
         y = np.concatenate((np.zeros(12), np.ones(12)))
         desired = [6.148510640582358, 0.10548096579142373, 0.07571171879632102, 0.06774859414831141, 4.408883097248305, 5.446103603204983, 1.4258427450086562, 0.006102474459494191, 0.14360453961912525, 0.0, 0.0, 0.0, 17.786943105214274, 21.248067409078676, 21.10676921763807, 20.955468109356307, 16.029054988277238, 20.689727349181517, 21.22851434841379, 21.24246276550688, 11.31919250180751, math.inf, 2.846712755553574, math.inf]
-        calibrator = KDECalibrator()
+        calibrator = KDECalibrator(bandwidth="silverman")
         calibrator.fit(X, y)
         lrs_cal = calibrator.transform(X)
         np.testing.assert_allclose(lrs_cal, desired)

diff --git a/tests/test_lr.py b/tests/test_lr.py
@@ -190,11 +190,11 @@ def test_calibrated_scorer_with_distance_function(self):
         X = np.concatenate([np.random.normal(loc=i, scale=.1, size=(1, 3)) for i in y])
         X, y = InstancePairing(ratio_limit=1, seed=0).transform(X, y)
 
-        calibrated_scorer = CalibratedScorer(paired_manhattan_distances, KDECalibrator())
+        calibrated_scorer = CalibratedScorer(paired_manhattan_distances, KDECalibrator(bandwidth="silverman"))
         calibrated_scorer.fit(X, y)
         self.assertAlmostEqual(0, metrics.cllr(calibrated_scorer.predict_lr(X), y), places=3)
 
-        calibrated_scorer = CalibratedScorerCV(paired_manhattan_distances, KDECalibrator(), n_splits=5)
+        calibrated_scorer = CalibratedScorerCV(paired_manhattan_distances, KDECalibrator(bandwidth="silverman"), n_splits=5)
         calibrated_scorer.fit(X, y)
         self.assertAlmostEqual(0, metrics.cllr(calibrated_scorer.predict_lr(X), y), places=3)