Skip to content

Commit

Permalink
removed default value for bandwidth argument of KDE calibrator (#68)
Browse files Browse the repository at this point in the history
* deprecated default bandwidth argument of KDE calibrator
* renamed DistanceFunctionTransformer
  • Loading branch information
wowtor authored Jan 19, 2023
1 parent 08eb48f commit 1f59d24
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 87 deletions.
129 changes: 50 additions & 79 deletions lir/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from typing import Optional, Tuple, Union
from typing import Optional, Tuple, Union, Iterable, Callable, Sized

from .bayeserror import elub
from .loss_functions import negative_log_likelihood_balanced
Expand Down Expand Up @@ -153,43 +153,51 @@ class KDECalibrator(BaseEstimator, TransformerMixin):
two distributions. Uses kernel density estimation (KDE) for interpolation.
"""

def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optional[float]]]] = None):
def __init__(self, bandwidth: Union[Callable, str, float, Tuple[float, float]] = None):
"""
:param bandwidth:
* If None is provided, Silverman's rule of thumb is
used to calculate the bandwidth for both distributions (independently)
* If a single float is provided, this is used as the bandwidth for both
distributions
* If a tuple is provided, the first entry is used for the bandwidth
of the first distribution (kde0) and the second entry for the second
distribution (if value is None: Silverman's rule of thumb is used)
* If bandwidth has a float value, this value is used as the bandwidth for both distributions.
* If bandwidth is a tuple, it should contain two floating point values: the bandwidth for the distribution
of the classes with labels 0 and 1, respectively.
* If bandwidth has the str value "silverman", Silverman's rule of thumb is used as the bandwidth for both
distributions separately.
* If bandwidth is callable, it should accept two arguments, `X` and `y`, and return a tuple of two values
which are the bandwidths for the two distributions.
"""
self.bandwidth: Tuple[Optional[float], Optional[float]] = self._parse_bandwidth(bandwidth)
if bandwidth is None:
warnings.warn("missing bandwidth argument for KDE, defaulting to silverman (default argument will be removed in the future)")
bandwidth = "silverman"
self.bandwidth: Callable = self._parse_bandwidth(bandwidth)
self._kde0: Optional[KernelDensity] = None
self._kde1: Optional[KernelDensity] = None
self.numerator, self.denominator = None, None

@staticmethod
def bandwidth_silverman(X):
def bandwidth_silverman(X, y):
"""
Estimates the optimal bandwidth parameter using Silverman's rule of
thumb.
"""
assert len(X) > 0

std = np.std(X)
if std == 0:
# can happen eg if std(X) = 0
warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
LOG.info('found a silverman bandwidth of 0 (using dummy value)')
std = 1
bandwidth = []
for label in np.unique(y):
values = X[y == label]
std = np.std(values)
if std == 0:
# can happen e.g. if all values of this class are identical
warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
LOG.info('found a silverman bandwidth of 0 (using dummy value)')
std = 1

v = math.pow(std, 5) / len(X) * 4. / 3
return math.pow(v, .2)
v = math.pow(std, 5) / len(values) * 4. / 3
bandwidth.append(math.pow(v, .2))

return bandwidth

@staticmethod
def bandwidth_scott(X):
def bandwidth_scott(X, y):
"""
Not implemented.
"""
Expand All @@ -207,8 +215,7 @@ def fit(self, X, y):
X0 = X0.reshape(-1, 1)
X1 = X1.reshape(-1, 1)

bandwidth0 = self.bandwidth[0] or self.bandwidth_silverman(X0)
bandwidth1 = self.bandwidth[1] or self.bandwidth_silverman(X1)
bandwidth0, bandwidth1 = self.bandwidth(X, y)
self._kde0 = KernelDensity(kernel='gaussian', bandwidth=bandwidth0).fit(X0)
self._kde1 = KernelDensity(kernel='gaussian', bandwidth=bandwidth1).fit(X1)
return self
Expand Down Expand Up @@ -253,22 +260,28 @@ def transform(self, X):
return np.float_power(10, LLRs_output)

@staticmethod
def _parse_bandwidth(bandwidth: Optional[Union[float, Tuple[float, float]]]) \
-> Tuple[Optional[float], Optional[float]]:
def _parse_bandwidth(bandwidth: Union[Callable, float, Tuple[float, float]]) \
-> Callable:
"""
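Converts the bandwidth argument into a callable that yields the bandwidths.
A single value is used for both distributions.
:param bandwidth: provided bandwidth (a callable, "silverman", "scott", a single value, or a sequence of two values)
:return: callable which accepts `X` and `y` and returns the bandwidth used for kde0 and the bandwidth used for kde1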
"""
if bandwidth is None:
return None, None
elif isinstance(bandwidth, float):
return bandwidth, bandwidth
elif len(bandwidth) == 2:
assert bandwidth is not None, "KDE requires a bandwidth argument"
if callable(bandwidth):
return bandwidth
elif bandwidth == "silverman":
return KDECalibrator.bandwidth_silverman
elif bandwidth == "scott":
return KDECalibrator.bandwidth_scott
elif isinstance(bandwidth, str):
raise ValueError(f"invalid input for bandwidth: {bandwidth}")
elif isinstance(bandwidth, Sized):
assert len(bandwidth) == 2, f"bandwidth should have two elements; found {len(bandwidth)}; bandwidth = {bandwidth}"
return lambda X, y: bandwidth
else:
raise ValueError('Invalid input for bandwidth')
return lambda X, y: (0+bandwidth, bandwidth)


class KDECalibratorInProbabilityDomain(BaseEstimator, TransformerMixin):
Expand All @@ -277,7 +290,7 @@ class KDECalibratorInProbabilityDomain(BaseEstimator, TransformerMixin):
two distributions. Uses kernel density estimation (KDE) for interpolation.
"""

def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optional[float]]]] = None):
def __init__(self, bandwidth: Union[Callable, str, float, Tuple[float, float]] = None):
"""
:param bandwidth:
Expand All @@ -289,45 +302,21 @@ def __init__(self, bandwidth: Optional[Union[float, Tuple[Optional[float], Optio
of the first distribution (kde0) and the second entry for the second
distribution (if value is None: Silverman's rule of thumb is used)
"""

warnings.warn(f"the class {type(self).__name__} will be removed in the future")
self.bandwidth: Tuple[Optional[float], Optional[float]] = \
self._parse_bandwidth(bandwidth)
if bandwidth is None:
warnings.warn("missing bandwidth argument for KDE, defaulting to 1 (default argument will be removed in the future)")
bandwidth = (1, 1)
self.bandwidth: Callable = KDECalibrator._parse_bandwidth(bandwidth)
self._kde0: Optional[KernelDensity] = None
self._kde1: Optional[KernelDensity] = None

@staticmethod
def bandwidth_silverman(X):
"""
Estimates the optimal bandwidth parameter using Silverman's rule of
thumb.
"""
assert len(X) > 0

std = np.std(X)
if std == 0:
# can happen eg if std(X) = 0
warnings.warn('silverman bandwidth cannot be calculated if standard deviation is 0', RuntimeWarning)
LOG.info('found a silverman bandwidth of 0 (using dummy value)')
std = 1

v = math.pow(std, 5) / len(X) * 4. / 3
return math.pow(v, .2)

@staticmethod
def bandwidth_scott(X):
"""
Not implemented.
"""
raise

def fit(self, X, y):
X0, X1 = Xy_to_Xn(X, y)
X0 = X0.reshape(-1, 1)
X1 = X1.reshape(-1, 1)

bandwidth0 = self.bandwidth[0] or self.bandwidth_silverman(X0)
bandwidth1 = self.bandwidth[1] or self.bandwidth_silverman(X1)

bandwidth0, bandwidth1 = self.bandwidth(X, y)
self._kde0 = KernelDensity(kernel='gaussian', bandwidth=bandwidth0).fit(X0)
self._kde1 = KernelDensity(kernel='gaussian', bandwidth=bandwidth1).fit(X1)
return self
Expand All @@ -342,24 +331,6 @@ def transform(self, X):
with np.errstate(divide='ignore'):
return self.p1 / self.p0

@staticmethod
def _parse_bandwidth(bandwidth: Optional[Union[float, Tuple[float, float]]]) \
-> Tuple[Optional[float], Optional[float]]:
"""
Returns bandwidth as a tuple of two (optional) floats.
Extrapolates a single bandwidth
:param bandwidth: provided bandwidth
:return: bandwidth used for kde0, bandwidth used for kde1
"""
if bandwidth is None:
return None, None
elif isinstance(bandwidth, float):
return bandwidth, bandwidth
elif len(bandwidth) == 2:
return bandwidth
else:
raise ValueError('Invalid input for bandwidth')


class LogitCalibrator(BaseEstimator, TransformerMixin):
"""
Expand Down
5 changes: 2 additions & 3 deletions lir/lr.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import logging
from typing import Callable, Optional

import numpy as np
import sklearn
import sklearn.mixture
from sklearn.pipeline import Pipeline

from .metrics import calculate_lr_statistics, LrStats
from .transformers import EstimatorTransformer, DistanceFunctionTransformer
from .transformers import EstimatorTransformer, ComparisonFunctionTransformer

from .util import Xn_to_Xy, LR

Expand All @@ -20,7 +19,7 @@ def _create_transformer(scorer):
elif hasattr(scorer, "predict_proba"):
return EstimatorTransformer(scorer)
elif callable(scorer):
return DistanceFunctionTransformer(scorer)
return ComparisonFunctionTransformer(scorer)
else:
raise NotImplementedError("`scorer` argument must either be callable or implement at least one of `transform`, `predict_proba`")

Expand Down
2 changes: 1 addition & 1 deletion lir/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __getattr__(self, item):
return getattr(self.estimator, item)


class DistanceFunctionTransformer(FunctionTransformer):
class ComparisonFunctionTransformer(FunctionTransformer):
"""
A wrapper for a distance function to make it behave like a transformer.
Expand Down
4 changes: 2 additions & 2 deletions tests/test_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_kde_calibrator(self):
X = to_probability(X)
X = to_log_odds(X)
desired = [3.59562799e-02, 1.75942116e-11, 2.59633540e-12, 1.36799721e-12, 8.15673411e-03, 2.10030624e-02, 3.70456430e-05, 1.40710861e-18, 1.04459592e-10, 3.14589737e+03, 2.59568527e+02, 1.08519904e+02, 8.56459139e+01, 3.81243702e+00, 6.23873841e+01, 1.43844114e+02, 2.64913149e+02, 1.49097168e+05]
calibrator = KDECalibrator()
calibrator = KDECalibrator(bandwidth="silverman")
calibrator.fit(X, y)
lrs_cal = calibrator.transform(X)
np.testing.assert_allclose(lrs_cal, desired)
Expand All @@ -100,7 +100,7 @@ def test_on_extreme_values(self):
X = to_log_odds(X)
y = np.concatenate((np.zeros(12), np.ones(12)))
desired = [6.148510640582358, 0.10548096579142373, 0.07571171879632102, 0.06774859414831141, 4.408883097248305, 5.446103603204983, 1.4258427450086562, 0.006102474459494191, 0.14360453961912525, 0.0, 0.0, 0.0, 17.786943105214274, 21.248067409078676, 21.10676921763807, 20.955468109356307, 16.029054988277238, 20.689727349181517, 21.22851434841379, 21.24246276550688, 11.31919250180751, math.inf, 2.846712755553574, math.inf]
calibrator = KDECalibrator()
calibrator = KDECalibrator(bandwidth="silverman")
calibrator.fit(X, y)
lrs_cal = calibrator.transform(X)
np.testing.assert_allclose(lrs_cal, desired)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_lr.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,11 @@ def test_calibrated_scorer_with_distance_function(self):
X = np.concatenate([np.random.normal(loc=i, scale=.1, size=(1, 3)) for i in y])
X, y = InstancePairing(ratio_limit=1, seed=0).transform(X, y)

calibrated_scorer = CalibratedScorer(paired_manhattan_distances, KDECalibrator())
calibrated_scorer = CalibratedScorer(paired_manhattan_distances, KDECalibrator(bandwidth="silverman"))
calibrated_scorer.fit(X, y)
self.assertAlmostEqual(0, metrics.cllr(calibrated_scorer.predict_lr(X), y), places=3)

calibrated_scorer = CalibratedScorerCV(paired_manhattan_distances, KDECalibrator(), n_splits=5)
calibrated_scorer = CalibratedScorerCV(paired_manhattan_distances, KDECalibrator(bandwidth="silverman"), n_splits=5)
calibrated_scorer.fit(X, y)
self.assertAlmostEqual(0, metrics.cllr(calibrated_scorer.predict_lr(X), y), places=3)

Expand Down

0 comments on commit 1f59d24

Please sign in to comment.