From aad7edf7e59b3edd2f184f20cf5f8b39ae5f0a0c Mon Sep 17 00:00:00 2001
From: Michael Van de Steene <124588413+michael-nml@users.noreply.github.com>
Date: Wed, 17 Jan 2024 01:46:23 +0100
Subject: [PATCH] Remove chi2 thresholds for analysis & reference (#349)

Chi2 thresholding is based on p-values, which means the calculated
thresholds are irrelevant. Previously thresholds were removed as part of
alerting. As this is only applied on analysis data the threshold value
was still available on reference data, which led to confusion.

This commit changes that so the chi2 threshold values are removed across
the entire dataset.
---
 nannyml/drift/univariate/methods.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py
index 156ce0cf..b5710318 100644
--- a/nannyml/drift/univariate/methods.py
+++ b/nannyml/drift/univariate/methods.py
@@ -443,6 +443,14 @@ def __init__(self, **kwargs) -> None:
         self._p_value: float
         self._fitted = False
 
+    def fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
+        super().fit(reference_data, timestamps)
+
+        # Thresholding is based on p-values. Ignoring all custom thresholding and disable plotting a threshold
+        self.lower_threshold_value = None
+        self.upper_threshold_value = None
+        return self
+
     def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
         reference_data = _remove_nans(reference_data)
         self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0]
@@ -462,9 +470,6 @@ def _calculate(self, data: pd.Series):
         return stat
 
     def alert(self, value: float):
-        self.lower_threshold_value = None  # ignoring all custom thresholding, disable plotting a threshold
-        self.upper_threshold_value = None  # ignoring all custom thresholding, disable plotting a threshold
-
         return self._p_value < 0.05
 
     def _calc_chi2(self, data: pd.Series):