From 20687e110ca9e38233f8382f2dc0985da6bc9997 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Mon, 20 Nov 2023 15:52:09 +0100 Subject: [PATCH 1/3] Use pandas `value_counts` for L-Infinity method --- nannyml/drift/univariate/methods.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 84a7d3c1..0c7bd962 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -507,8 +507,7 @@ def __init__(self, **kwargs) -> None: def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: reference_data = _remove_nans(reference_data) - ref_labels = reference_data.unique() - self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels} + self._reference_proba = reference_data.value_counts(normalize=True) return self @@ -520,16 +519,14 @@ def _calculate(self, data: pd.Series): data = _remove_nans(data) if data.empty: return np.nan - data_labels = data.unique() - data_ratios = {label: (data == label).sum() / len(data) for label in data_labels} + analysis_data_ratio = data.value_counts(normalize=True) - union_labels = set(self._reference_proba.keys()) | set(data_labels) + # Unify indices so reference and analysis have an entry for all labels + unified_index = self._reference_proba.index.union(analysis_data_ratio.index) + reference_data_ratio = self._reference_proba.reindex(unified_index, fill_value=0) + analysis_data_ratio = analysis_data_ratio.reindex(unified_index, fill_value=0) - differences = {} - for label in union_labels: - differences[label] = np.abs(self._reference_proba.get(label, 0) - data_ratios.get(label, 0)) - - return max(differences.values()) + return (reference_data_ratio - analysis_data_ratio).abs().max() @MethodFactory.register(key='wasserstein', feature_type=FeatureType.CONTINUOUS) From cc43d14c1175e53d37323294d07e506bea6ec144 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Mon, 20 Nov 2023 16:03:37 +0100 Subject: [PATCH 2/3] Optimize further using `sub` function --- nannyml/drift/univariate/methods.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 0c7bd962..5e35771f 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -519,14 +519,9 @@ def _calculate(self, data: pd.Series): data = _remove_nans(data) if data.empty: return np.nan - analysis_data_ratio = data.value_counts(normalize=True) - - # Unify indices so reference and analysis have an entry for all labels - unified_index = self._reference_proba.index.union(analysis_data_ratio.index) - reference_data_ratio = self._reference_proba.reindex(unified_index, fill_value=0) - analysis_data_ratio = analysis_data_ratio.reindex(unified_index, fill_value=0) - return (reference_data_ratio - analysis_data_ratio).abs().max() + analysis_data_ratio = data.value_counts(normalize=True) + return self._reference_proba.sub(analysis_data_ratio, fill_value=0).abs().max() @MethodFactory.register(key='wasserstein', feature_type=FeatureType.CONTINUOUS) From fb270dfcf8d0ef0b9528bfc83b510917e688bfac Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Mon, 20 Nov 2023 16:11:05 +0100 Subject: [PATCH 3/3] Fix mypy error --- nannyml/drift/univariate/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 5e35771f..5354cdf5 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -503,7 +503,7 @@ def __init__(self, **kwargs) -> None: An optional lower threshold for the performance metric. """ - self._reference_proba: Optional[dict] = None + self._reference_proba: Optional[pd.Series] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: reference_data = _remove_nans(reference_data)