Skip to content

Commit

Permalink
Adding PSI for continuous data
Browse files Browse the repository at this point in the history
  • Loading branch information
hakimelakhrass committed Oct 14, 2023
1 parent dd20ef7 commit c58c7f5
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 8 deletions.
5 changes: 5 additions & 0 deletions nannyml/drift/univariate/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- L-infinity distance (categorical)
- Jensen-Shannon distance
- Hellinger distance
- Population Stability Index (continuous)
For more information, check out the `tutorial`_ or the `deep dive`_.
Expand Down Expand Up @@ -50,6 +51,7 @@
'wasserstein': StandardDeviationThreshold(std_lower_multiplier=None),
'hellinger': ConstantThreshold(lower=None, upper=0.1),
'l_infinity': ConstantThreshold(lower=None, upper=0.1),
'psi': ConstantThreshold(lower=None, upper=0.25),
}


Expand Down Expand Up @@ -97,6 +99,7 @@ def __init__(
- `kolmogorov_smirnov`
- `hellinger`
- `wasserstein`
- `psi`
chunk_size: int
Splits the data into chunks containing `chunk_size` observations.
Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given.
Expand All @@ -118,6 +121,7 @@ def __init__(
'wasserstein': StandardDeviationThreshold(std_lower_multiplier=None),
'hellinger': ConstantThreshold(upper=0.1),
'l_infinity': ConstantThreshold(upper=0.1),
'psi': ConstantThreshold(upper=0.25)
}
A dictionary allowing users to set a custom threshold for each method. It links a `Threshold` subclass
Expand All @@ -130,6 +134,7 @@ def __init__(
- `wasserstein`: `StandardDeviationThreshold(std_lower_multiplier=None)`
- `hellinger`: `ConstantThreshold(upper=0.1)`
- `l_infinity`: `ConstantThreshold(upper=0.1)`
- `psi`: `ConstantThreshold(upper=0.25)`
The `chi2` method does not support custom thresholds for now. Additional research is required to determine
how to transition from its current p-value based implementation.
Expand Down
83 changes: 75 additions & 8 deletions nannyml/drift/univariate/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='jensen-shannon'
The name used to indicate the metric in columns of a DataFrame.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""
self._treat_as_type: str
self._bins: np.ndarray
Expand Down Expand Up @@ -353,9 +353,9 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='kolmogorov-smirnov'
The name used to indicate the metric in columns of a DataFrame.
upper_threshold_limit : float, default=1.0
An optional upper threshold for the performance metric.
An optional upper threshold for the drift metric.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""
self._reference_data: Optional[pd.Series] = None
self._reference_size: float
Expand Down Expand Up @@ -434,9 +434,9 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='chi2'
The name used to indicate the metric in columns of a DataFrame.
upper_threshold_limit : float, default=1.0
An optional upper threshold for the performance metric.
An optional upper threshold for the drift metric.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""
self._reference_data_vcs: pd.Series
self._p_value: float
Expand Down Expand Up @@ -499,7 +499,7 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='l_infinity'
The name used to indicate the metric in columns of a DataFrame.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""

self._reference_proba: Optional[dict] = None
Expand Down Expand Up @@ -553,7 +553,7 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='wasserstein'
The name used to indicate the metric in columns of a DataFrame.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""

self._reference_data: Optional[pd.Series] = None
Expand Down Expand Up @@ -660,7 +660,7 @@ def __init__(self, **kwargs) -> None:
column_name: str, default='hellinger'
The name used to indicate the metric in columns of a DataFrame.
lower_threshold_limit : float, default=0
An optional lower threshold for the performance metric.
An optional lower threshold for the drift metric.
"""

self._treat_as_type: str
Expand Down Expand Up @@ -719,3 +719,70 @@ def _calculate(self, data: pd.Series):
del reference_proba_in_bins

return distance

@MethodFactory.register(key='psi', feature_type=FeatureType.CONTINUOUS)
class PSI(Method):
    """Calculates the Population Stability Index (PSI) between two distributions.

    The reference data is split into equal-width bins; the number of bins follows the
    Freedman-Diaconis rule. For every chunk the PSI is the sum over those bins of
    ``(p_chunk - p_ref) * ln(p_chunk / p_ref)``, where ``p`` is the fraction of
    observations that falls into the bin.
    """

    # Bins whose fraction is not above this value in the reference or in the chunk are
    # left out of the sum, because the logarithm is undefined for empty bins.
    _MIN_PROBABILITY = 1e-10

    def __init__(self, **kwargs) -> None:
        """Create a new, unfitted PSI method.

        The base ``Method`` is initialised with
        ``display_name='Population Stability Index'`` (the name shown in plots),
        ``column_name='psi'`` (the name used in DataFrame columns) and
        ``lower_threshold_limit=0``.

        Parameters
        ----------
        **kwargs
            Additional keyword arguments, forwarded unchanged to ``Method.__init__``.
        """
        super().__init__(
            display_name='Population Stability Index',
            column_name='psi',
            lower_threshold_limit=0,
            **kwargs,
        )

        # All three are populated by `_fit`; `_reference_bins is None` marks the
        # method as not fitted yet.
        self._reference_bins: Optional[np.ndarray] = None
        self._reference_count: Optional[np.ndarray] = None
        self._reference_proba: Optional[np.ndarray] = None

    def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self:
        """Derive the bin edges and per-bin reference fractions from the reference data.

        Parameters
        ----------
        reference_data : pd.Series
            Values of a single continuous column in the reference period.
        timestamps : Optional[pd.Series], default=None
            Not used by this method.

        Returns
        -------
        Self
            The fitted method.

        Raises
        ------
        ValueError
            If no observations are left after removing missing data.
        """
        # Missing values have to go before any statistic is computed: a single NaN would
        # otherwise turn the percentiles, and with them the bin count, into NaN.
        # NOTE(review): `_remove_missing_data` is defined elsewhere in this module;
        # presumably it drops NaN/None entries - confirm.
        reference_data = _remove_missing_data(reference_data)
        values = np.asarray(reference_data)
        n = len(values)
        if n == 0:
            raise ValueError(
                f"cannot fit {self.display_name}: reference data contains no non-missing values"
            )

        # Freedman-Diaconis rule: bin width = 2 * IQR * n^(-1/3).
        q25, q75 = np.percentile(values, [25, 75])
        bin_width = 2 * (q75 - q25) * n ** (-1 / 3)
        if bin_width > 0:
            bin_num = int(np.ceil((values.max() - values.min()) / bin_width))
        else:
            # The IQR is zero (constant or heavily tied data), so the rule above would
            # divide by zero. Fall back to Sturges' rule, which only depends on n.
            bin_num = int(np.ceil(np.log2(n))) + 1

        # A single histogram call yields both the edges and the reference counts.
        self._reference_count, self._reference_bins = np.histogram(values, bins=bin_num)
        self._reference_proba = self._reference_count / n

        return self

    def _calculate(self, data: pd.Series):
        """Compute the PSI of a chunk with respect to the fitted reference distribution.

        Parameters
        ----------
        data : pd.Series
            Values of the same column for the chunk under analysis.

        Returns
        -------
        float
            The PSI value, or ``np.nan`` when the chunk holds no non-missing values.

        Raises
        ------
        NotFittedException
            If the method has not been fitted yet.
        """
        if self._reference_bins is None:
            raise NotFittedException(
                "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first"
            )

        data = _remove_missing_data(data)
        if data.empty:
            return np.nan

        edges = self._reference_bins
        # `np.histogram` silently drops values outside the outer edges. Clip them into the
        # outermost bins instead, so that the chunk fractions sum to one and a chunk that
        # moved beyond the reference range shows up as drift rather than as a PSI of 0.
        values = np.clip(np.asarray(data), edges[0], edges[-1])
        data_counts, _ = np.histogram(values, bins=edges)
        data_probs = data_counts / len(values)
        ref_probs = np.asarray(self._reference_proba)

        # NOTE(review): bins that are empty on either side are skipped, which
        # underestimates the PSI when mass moves into or out of such bins. Smoothing the
        # fractions with a small constant is a common alternative - decide whether to adopt it.
        usable = (ref_probs > self._MIN_PROBABILITY) & (data_probs > self._MIN_PROBABILITY)
        chunk_part = data_probs[usable]
        ref_part = ref_probs[usable]

        return float(np.sum((chunk_part - ref_part) * np.log(chunk_part / ref_part)))

Check warning on line 788 in nannyml/drift/univariate/methods.py

View check run for this annotation

Codecov / codecov/patch

nannyml/drift/univariate/methods.py#L788

Added line #L788 was not covered by tests
1 change: 1 addition & 0 deletions tests/drift/test_drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ def test_univariate_drift_calculator_without_custom_thresholds():
'hellinger': ConstantThreshold(lower=1, upper=2),
'jensen_shannon': ConstantThreshold(lower=1, upper=2),
'l_infinity': ConstantThreshold(lower=1, upper=2),
'psi': ConstantThreshold(lower=1, upper=2),
},
],
)
Expand Down

0 comments on commit c58c7f5

Please sign in to comment.